Last commit for bin/media_updater.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]
First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2015
 * @filesource
 */
/* This script is a command-line daemon; refuse to run under a web SAPI */
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
/**
 * Calculate base directory of script: the directory containing this
 * script with its trailing "/bin" component stripped off
 * @ignore
 */
define("BASE_DIR", substr(
    dirname(realpath($_SERVER['PHP_SELF'])), 0,
    -strlen("/bin")));
ini_set("memory_limit", "1300M");
/** Load in global configuration settings */
require_once BASE_DIR.'/configs/config.php';
/* Without a configured profile there is nothing to update, so bail out */
if (!PROFILE) {
    // the trailing space keeps "visiting" and "its" from running together
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
/**
 * Defined as true so that code loaded below does not try to use memcache
 * @ignore
 */
define("NO_CACHE", true);
/**
 * We do want logging, but crawl model and other will try to turn off
 * if we don't set this
 */
define("NO_LOGGING", false);
/**
 * Shortest time, in seconds, through one iteration of the media updater's
 * main loop; the loop sleeps for whatever part of this time is left over
 */
define("MINIMUM_UPDATE_LOOP_TIME", 10);
/** for crawlDaemon function */
require_once BASE_DIR."/lib/crawl_daemon.php";
/** To guess language based on page encoding */
require_once BASE_DIR."/lib/locale_functions.php";
/** Loads common constants for media convert files */
require_once BASE_DIR."/lib/media_constants.php";
/** Used to manage the process of sending emails to users */
require_once BASE_DIR."/lib/mail_server.php";
/** Loads common constants for web crawling */
require_once BASE_DIR."/lib/crawl_constants.php";
/** Used to reparse wiki pages after a media conversion */
require_once BASE_DIR."/lib/wiki_parser.php";
/** Load base model class used by source model */
require_once BASE_DIR."/models/model.php";
/** UrlParser is used to get file extensions */
require_once BASE_DIR."/lib/url_parser.php";
/**
 * Source model is used to manage news feed sites. A copy under APP_DIR,
 * if one exists, takes precedence over the one shipped under BASE_DIR
 */
if (file_exists(APP_DIR."/models/source_model.php")) {
    require_once APP_DIR."/models/source_model.php";
}  else {
    require_once BASE_DIR."/models/source_model.php";
}
/**
 * Group model is used to manage media conversion. A copy under APP_DIR,
 * if one exists, takes precedence over the one shipped under BASE_DIR
 */
if (file_exists(APP_DIR."/models/group_model.php")) {
    require_once APP_DIR."/models/group_model.php";
}  else {
    require_once BASE_DIR."/models/group_model.php";
}
/*
 * We'll set up multi-byte string handling (both the mb_* string functions
 * and the mb_ereg* regex functions) to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
if (!function_exists('lcfirst')) {
    /**
     * Returns a copy of a string whose first character has been converted
     * to lower case; the rest of the string is left untouched.
     *
     * Fallback for PHP versions before 5.3, which lack a built-in lcfirst.
     * It is only declared when the built-in is missing.
     *
     * @param string $str string whose first letter should be lower cased
     * @return string the string with its first letter lower cased
     */
    function lcfirst($str)
    {
        $head = strtolower(substr($str, 0, 1));
        $tail = substr($str, 1);
        return (string)($head . $tail);
    }
}
/**
 * Separate process/command-line script which can be used to update
 * news sources for Yioop and also handle other kinds of activities such as
 * video conversion. This is as an alternative to using the web app
 * for updating. Makes use of the web-apps code.
 *
 * @author Chris Pollett
 * @package seek_quarry\bin
 */
class MediaUpdater implements CrawlConstants,MediaConstants
{
    /**
     * The last time feeds were checked for updates
     * @var int
     */
    var $update_time;
    /**
     * Initialised to 0 by the constructor; not otherwise read in this class
     * @var int
     */
    var $delete_time;
    /**
     * Initialised to 0 by the constructor; not otherwise read in this class
     * @var int
     */
    var $retry_time;
    /**
     * Controls whether media updating should be viewed as only occurring
     * on the name server or should it be viewed as a distributed process
     * amongst all machines in this Yioop instance
     * @var string
     */
    var $media_mode;
    /**
     * Supported file types of videos that we can convert to mp4.
     * @var array
     */
    var $video_convert_types = array("mov", "avi");
    /**
     * Mail Server object used to send mails from media updater
     * @var object
     */
    var $mail_server;
    /**
     * Model used for feed updates, talking to the name server and (through
     * its db field) recursive directory deletion. Created in start()
     * @var object
     */
    var $sourceModel;
    /**
     * Group model instance created in start()
     * @var object
     */
    var $groupModel;
    /**
     * Sets up the field variables so that media updating can begin
     */
    function __construct()
    {
        $this->delete_time = 0;
        $this->retry_time = 0;
        $this->update_time = 0;
        $this->media_mode = "name_server";
        $this->mail_server = new MailServer(MAIL_SENDER, MAIL_SERVER,
            MAIL_SERVERPORT, MAIL_USERNAME, MAIL_PASSWORD,
            MAIL_SECURITY);
    }
    /**
     * This is the function that should be called to get the media_updater
     * to start updating. Calls init to handle the command-line
     * arguments, creates the models and then enters the main loop
     */
    function start()
    {
        global $argv;
        CrawlDaemon::init($argv, "media_updater");
        crawlLog("\n\nInitialize logger..", "media_updater", true);
        $this->sourceModel = new SourceModel();
        $this->groupModel = new GroupModel();
        $this->loop();
    }
    /**
     * Main loop for the media updater. Each pass refreshes the media mode,
     * updates news feeds, processes videos and sends pending mail, then
     * sleeps so that a pass takes at least MINIMUM_UPDATE_LOOP_TIME seconds
     */
    function loop()
    {
        crawlLog("In Media Update Loop");
        while (CrawlDaemon::processHandler()) {
            $start_time = microtime();
            $this->getUpdateProperties();
            $this->newsUpdate();
            $this->videoUpdate();
            $this->sendGroupNotificationEmailsInBatches();
            $sleep_time = max(0, ceil(
                MINIMUM_UPDATE_LOOP_TIME - changeInMicrotime($start_time)));
            if ($sleep_time > 0) {
                crawlLog("Ensure minimum loop time by sleeping...".$sleep_time);
                sleep($sleep_time);
            }
        } //end while
        crawlLog("Media Updater shutting down!!");
    }
    /**
     * Makes a request to the name server to find out if we are running
     * as a media updater just on the name server or on both the name server
     * as well as all other machines in the Yioop instance. If the response
     * contains a MEDIA_MODE entry, it is stored in $this->media_mode
     */
    function getUpdateProperties()
    {
        crawlLog("Checking Name Server for Media Updater properties...");
        $current_machine = $this->sourceModel->getCurrentMachine();
        $pre_properties = $this->sourceModel->execMachines(
            "getUpdateProperties", array(NAME_SERVER), $current_machine);
        $properties = array();
        if (isset($pre_properties[0][self::PAGE])) {
            // NOTE(review): unserialize() on a network response; presumably
            // the name server is trusted -- confirm
            $properties =
                unserialize(webdecode($pre_properties[0][self::PAGE]));
            if(isset($properties['MEDIA_MODE'])) {
                $this->media_mode = $properties['MEDIA_MODE'];
                crawlLog("...Setting media mode to: " .
                    $properties['MEDIA_MODE']);
            }
        }
        crawlLog("Done checking Name Server for Media Updater properties");
    }
    /**
     * If more than ONE_HOUR has passed since the last update, then updates
     * the news feeds associated with this Yioop instance and rebuilds the
     * feed shard
     */
    function newsUpdate()
    {
        crawlLog("Checking for News Updates...");
        $time = time();
        $something_updated = false;
        $delta = $time - $this->update_time;
        // every hour get items from feeds
        if ($delta > ONE_HOUR) {
            $this->update_time = $time;
            crawlLog("Performing news feeds update");
            $this->sourceModel->updateFeedItems(ONE_WEEK, $this->media_mode);
            $something_updated = true;
        }
        /*
            if anything changed rebuild shard
         */
        if ($something_updated) {
            crawlLog("Deleting feed items and rebuild shard...");
            $this->sourceModel->rebuildFeedShard(ONE_WEEK);
            crawlLog("... delete complete, shard rebuilt");
        } else {
            crawlLog("No updates needed.");
        }
    }
    /**
     * This function calls the video updater functionality. On the name
     * server it splits uploaded videos, collects converted segments and
     * concatenates them; on any other machine it fetches one segment from
     * the name server, converts it and uploads the result.
     */
    function videoUpdate()
    {
        crawlLog("Checking for video files to process...");
        $current_machine = $this->sourceModel->getCurrentMachine();
        if ($current_machine == crawlHash(NAME_SERVER)) {
            $this->splitVideos();
            $this->moveVideoFoldersToConvertedDirectory();
            $this->generateAssembleVideoFile();
            $this->concatenateVideos();
        } else {
            $this->checkGetConvertVideos();
        }
        crawlLog("Video updates done!...");
    }
    /**
     * Checks if it has been more than two minutes since the last time
     * a file was modified
     *
     * @param string $file_name file to check
     * @return bool true if it has been more than two minutes, or if the
     *      file does not exist
     */
    function isNoLongerModified($file_name)
    {
        if (file_exists($file_name)) {
            // the file name is the SECOND argument of clearstatcache
            clearstatcache(true, $file_name);
            if (time() < filemtime($file_name) + 2 * ONE_MINUTE) {
                return false;
            }
        }
        return true;
    }
    /**
     * Generates a thumbnail from a video file assuming FFMPEG
     *
     * @param string $video_name full name and path of video file to make
     *      thumbnail from
     * @param string $thumb_name full name and path for thumbnail file
     */
    function thumbFileFromVideo($video_name, $thumb_name)
    {
        // NOTE(review): file names are only wrapped in double quotes, so a
        // name containing ", ` or $ is interpreted by the shell
        $make_thumb_string =
            FFMPEG." -i \"$video_name\" -vframes 1 -map 0:v:0".
            " -vf \"scale=".THUMB_DIM.":".THUMB_DIM."\" ".
            "\"$thumb_name\" 2>&1";
        crawlLog("Making thumb with $make_thumb_string");
        exec($make_thumb_string);
        // the file name is the SECOND argument of clearstatcache
        clearstatcache(true, $thumb_name);
    }
    /**
     * Splits a video into small chunks of 5 minutes. Segment number k is
     * written to the destination directory as k followed by the original
     * file name
     *
     * @param string $file_path full path of folder containing the video
     * @param string $file_name name of video file along with extension
     * @param string $destination_directory destination directory name
     *      where split files would be produced
     */
    function splitVideo($file_path, $file_name, $destination_directory)
    {
        crawlLog("Splitting $file_path/$file_name...");
        $extension = "." . UrlParser::getDocumentType($file_name, "");
        $new_name = substr($file_name, 0, -strlen($extension));
        // NOTE(review): names are only double-quoted for the shell
        $ffmpeg = FFMPEG." -i \"$file_path/$file_name\" ".
            " -acodec copy -f segment -segment_time 300 ".
            "-vcodec copy -reset_timestamps 1 -map 0 ".
            "\"$destination_directory/%d$new_name$extension\"";
        crawlLog($ffmpeg);
        exec($ffmpeg);
    }
    /**
     * Wrapper around glob() that always returns an array, as glob() can
     * return false rather than an empty array
     *
     * @param string $pattern glob pattern to match
     * @param int $flags flags to pass on to glob()
     * @return array paths matching the pattern (possibly empty)
     */
    function listPaths($pattern, $flags = 0)
    {
        $paths = glob($pattern, $flags);
        return is_array($paths) ? $paths : array();
    }
    /**
     * Reads the file info file of a video folder. Line index 1 holds the
     * folder of the original video, index 2 the thumbnail folder and
     * index 3 the video's file name
     *
     * @param string $video_path folder containing the file info file
     * @return mixed array of (untrimmed) lines, or false if the file is
     *      missing or has fewer than four lines
     */
    function readFileInfoLines($video_path)
    {
        $info_file = $video_path . self::FILE_INFO;
        if (!file_exists($info_file)) {
            return false;
        }
        $lines = file($info_file);
        if (!is_array($lines) || !isset($lines[3])) {
            return false;
        }
        return $lines;
    }
    /**
     * Looks through all the video directories present in the media convert
     * folder and splits those videos that are marked with a split file.
     * Afterwards the split marker is removed and the number of segments
     * produced is recorded in the folder's count file.
     */
    function splitVideos()
    {
        $convert_folder = WORK_DIRECTORY.self::CONVERT_FOLDER;
        if(!defined('FFMPEG') || !file_exists($convert_folder)) { return; }
        crawlLog("   Looking for video files to split...");
        $type_string = "{" . implode(",", $this->video_convert_types) . "}";
        foreach ($this->listPaths($convert_folder."/*") as $video_path) {
            /* skip (rather than stop at) anything that is not a folder
               waiting to be split, so later folders still get processed
             */
            if (!is_dir($video_path) ||
                !file_exists($video_path . self::SPLIT_FILE)) {
                continue;
            }
            crawlLog("Splitting the video $video_path");
            $lines = $this->readFileInfoLines($video_path);
            if ($lines === false) {
                crawlLog("  Missing or incomplete file info, skipping");
                continue;
            }
            $folder_name = rtrim($lines[1]);
            $file_name = rtrim($lines[3]);
            crawlLog("$folder_name : $file_name");
            if ($folder_name && $file_name){
                $this->splitVideo($folder_name, $file_name, $video_path);
                unlink($video_path . self::SPLIT_FILE);
                file_put_contents($video_path . self::COUNT_FILE,
                    count($this->listPaths($video_path . "/*.$type_string",
                        GLOB_BRACE)));
            }
        }
    }
    /**
     * Looks through all the video directories present in the media convert
     * folder. Once the number of mp4 files in the matching converted folder
     * equals the recorded segment count, the count and file info files are
     * moved to the converted folder and the convert folder is deleted.
     */
    function moveVideoFoldersToConvertedDirectory()
    {
        crawlLog("Moving video folders from media_convert to converted...");
        $convert_folder = WORK_DIRECTORY.self::CONVERT_FOLDER;
        $converted_folder = WORK_DIRECTORY.self::CONVERTED_FOLDER;
        if(!file_exists($converted_folder)) {
            mkdir($converted_folder);
        }
        foreach ($this->listPaths($convert_folder."/*") as $video_path) {
            crawlLog("  Video Path : $video_path");
            $count_file = $video_path . self::COUNT_FILE;
            /* A folder without a count file has not been split yet. It must
               not be compared against the converted count: a failed read
               would loosely equal 0 and the folder would be deleted.
             */
            if (!is_dir($video_path) || !file_exists($count_file)) {
                crawlLog("  No segment count available, skipping");
                continue;
            }
            $actual_count = (int)trim(file_get_contents($count_file));
            crawlLog("  Actual_count : $actual_count");
            $timestamp_files = $this->listPaths($video_path."/*.time.txt");
            $checked_out = count($timestamp_files);
            crawlLog("  Checked out count : $checked_out");
            $video_folder = str_replace($convert_folder."/", "", $video_path);
            $converted_video_path = $converted_folder . "/" . $video_folder;
            $converted_count = count($this->listPaths(
                $converted_video_path . "/*.{mp4}", GLOB_BRACE));
            crawlLog("  Converted count : $converted_count");
            if ($converted_count == $actual_count) {
                crawlLog("  Conversion of segments complete!");
                rename($video_path . self::COUNT_FILE,
                    $converted_video_path . self::COUNT_FILE);
                rename($video_path . self::FILE_INFO,
                    $converted_video_path . self::FILE_INFO);
                $this->sourceModel->db->unlinkRecursive($video_path);
            }
        }
    }
    /**
     * Looks through all the converted video directories and, for those
     * whose converted segments are all present, generates the assemble file
     * (a list of "file 'name'" lines) used to concatenate the segments.
     */
    function generateAssembleVideoFile()
    {
        crawlLog("Inside generateAssembleVideoFile function...");
        $converted_folder = WORK_DIRECTORY.self::CONVERTED_FOLDER;
        if(!file_exists($converted_folder)) {
            mkdir($converted_folder);
        }
        foreach ($this->listPaths($converted_folder."/*") as $video_path) {
            if (file_exists($video_path . self::CONCATENATED_FILE)){ continue; }
            if (file_exists($video_path . self::ASSEMBLE_FILE)) {
                continue;
            }
            if (!file_exists($video_path.self::COUNT_FILE)) { continue; }
            $actual_count = (int)trim(
                file_get_contents($video_path.self::COUNT_FILE));
            $video_segments = $this->listPaths($video_path . "/*.mp4");
            if ($actual_count != count($video_segments) ||
                count($video_segments) == 0) {
                continue;
            }
            /* Segment names begin with their number. glob() sorts them as
               strings (10 before 2), so sort naturally to keep the segments
               in playing order.
             */
            natsort($video_segments);
            $assemble_lines = "";
            foreach($video_segments as $video_segment){
                $segment_name = substr($video_segment,
                    strlen($video_path) + 1);
                // a ' inside a quoted concat entry has to be written '\''
                $segment_name = str_replace("'", "'\\''", $segment_name);
                $assemble_lines .= "file '" . $segment_name . "'" . PHP_EOL;
            }
            file_put_contents($video_path . self::ASSEMBLE_FILE,
                $assemble_lines, FILE_APPEND);
        }
    }
    /**
     * Concatenates split video files to generate one video file
     *
     * @param string $text_file_name file path containing the relative file
     *      paths of the files to be concatenated
     * @param string $file_name name of the original video; the output file
     *      gets this name with its extension replaced by mp4
     * @param string $destination_directory destination directory name
     *      where concatenated file would be produced
     * @return bool true if the output file exists afterwards
     */
    function mergeVideo($text_file_name , $file_name, $destination_directory)
    {
        $extension = "." . UrlParser::getDocumentType($file_name, "");
        $new_name = substr($file_name, 0, -strlen($extension));
        if (!file_exists($text_file_name)) {return false; }
        $generate_output = $destination_directory."/$new_name.mp4";
        // NOTE(review): names are only double-quoted for the shell
        $ffmpeg = FFMPEG." -f concat -i \"$text_file_name\" -c copy ".
            "\"$generate_output\"";
        crawlLog($ffmpeg);
        exec($ffmpeg);
        return file_exists($generate_output);
    }
    /**
     * Looks through each converted video directory that has an assemble
     * file and concatenates its segments. On success the directory is
     * removed and a thumbnail is generated for the video.
     */
    function concatenateVideos()
    {
        crawlLog("Concatenating videos...");
        $converted_folder = WORK_DIRECTORY.self::CONVERTED_FOLDER;
        if(!file_exists($converted_folder)) {
            mkdir($converted_folder);
        }
        foreach ($this->listPaths($converted_folder."/*") as $video_path) {
            crawlLog("  Video Path " . $video_path);
            if (!is_dir($video_path) ||
                !file_exists($video_path . self::ASSEMBLE_FILE)) {
                continue;
            }
            $assemble_file = $video_path . self::ASSEMBLE_FILE;
            $lines = $this->readFileInfoLines($video_path);
            if ($lines === false) {
                crawlLog("  Missing or incomplete file info, skipping");
                continue;
            }
            $folder = trim($lines[1]);
            $thumb_folder = trim($lines[2]);
            $file_name = trim($lines[3]);
            if($this->mergeVideo($assemble_file, $file_name, $folder)){
                $this->sourceModel->db->unlinkRecursive($video_path);
                $video_name = $folder. "/" . $file_name;
                $extension_len = strlen(
                    UrlParser::getDocumentType($video_name));
                $file_prefix = substr($file_name, 0, -$extension_len - 1);
                $thumb_file_name = $file_prefix . ".mp4.jpg";
                $thumb_name = $thumb_folder . "/" . $thumb_file_name;
                $this->thumbFileFromVideo($video_name, $thumb_name);
            }
        }
    }
    /**
     * Function to convert avi or mov file to mp4 format. The output is
     * written next to the input with the extension replaced by mp4. Files
     * of any other type are logged and left alone.
     *
     * @param string $file_name full path of the file.
     */
    function convertVideo($file_name)
    {
        $extension = "." . UrlParser::getDocumentType($file_name, "");
        $new_name = substr($file_name, 0, -strlen($extension));
        // NOTE(review): names are only double-quoted for the shell
        switch($extension)
        {
            case '.mov':
                $ffmpeg = FFMPEG." -i \"$file_name\" ".
                    " -vcodec h264 -acodec aac -preset veryfast -crf 28 ".
                    "-strict -2 \"$new_name.mp4\"";
            break;
            case '.avi':
                $ffmpeg = FFMPEG." -i \"$file_name\" ".
                    " -vcodec libx264  -preset slow -acodec aac -crf 28 ".
                    "-strict experimental -b:a 192k -ac 2 \"$new_name.mp4\"";
            break;
            default:
                // no command was built, so there is nothing to execute
                crawlLog("Do not know how to convert $file_name, skipping");
                return;
        }
        crawlLog($ffmpeg);
        exec($ffmpeg);
    }
    /**
     * Checks name server for a video segment to convert. If there is one,
     * downloads it, converts the mov or avi segment file to an mp4 file and
     * uploads the result back to the name server.
     * This function would only be called by slave media updaters.
     */
    function checkGetConvertVideos()
    {
        crawlLog("Checking Name Server for video segments to convert..");
        $convert_folder = WORK_DIRECTORY . self::CONVERT_FOLDER;
        if (!file_exists($convert_folder)) {
            @mkdir($convert_folder);
            if (!file_exists($convert_folder)) {
                crawlLog("Unable to create $convert_folder. Bailing!");
                return;
            }
        }
        $db = $this->sourceModel->db;
        // start from a clean slate: remove left-overs of earlier passes
        $folders = $this->listPaths($convert_folder."/*", GLOB_ONLYDIR);
        foreach($folders as $folder){
            $db->unlinkRecursive($folder);
        }
        $time = time();
        $session = md5($time . AUTH_KEY);
        $server = NAME_SERVER;
        $folder_file = $this->sourceModel->requestFileForConversion();
        if (!$folder_file) {
            crawlLog("No files on server to convert!");
            return;
        }
        // the response is expected to look like folder_name:file_name
        $info = explode(":", $folder_file);
        if (count($info) < 2) {
            crawlLog("Could not parse name server response: $folder_file");
            return;
        }
        $folder_name = trim($info[0]);
        $file_name = trim($info[1]);
        /* Download the file from the server */
        $request = "$server?c=resource&a=get&time=$time&session=$session".
            "&f=schedules&n=". urlencode($file_name)."&sf=$folder_name";
        $data = FetchUrl::getPage($request, NULL, true);
        if (!$data) {
            crawlLog("Could not download $file_name, " .
                "will try to convert the file again later");
            return;
        }
        $convert_path = $convert_folder."/".$folder_name;
        if(file_exists($convert_path)) {
            $db->unlinkRecursive($convert_path);
        }
        mkdir($convert_path);
        $downloaded_file =  $convert_path . "/" . $file_name;
        file_put_contents($downloaded_file, $data);
        $this->convertVideo($downloaded_file);
        $files = $this->listPaths($convert_path . "/*.{mp4}", GLOB_BRACE);
        if (empty($files[0])) {
            crawlLog("Will try to convert the file again later");
            return;
        }
        $converted_file_name = substr($files[0], strlen($convert_path) + 1);
        /* Upload the file to the server */
        $file_data = file_get_contents($files[0]);
        $post_data = array();
        $post_data['c'] = "resource";
        $post_data['a'] = "uploadConvertedVideoFile";
        $post_data['time'] = $time;
        $post_data['session'] = $session;
        $post_data['data'] =  webencode($file_data);
        $post_data['file_name'] = webencode($converted_file_name);
        $post_data['folder_name'] = webencode($folder_name);
        crawlLog("Attempting to upload converted video file...");
        $response = FetchUrl::getPage($server, $post_data , true);
        crawlLog("...server response:\n $response");
    }
    /**
     * Sends each of the emails contained in a mailer batch. The batch is a
     * sequence of serialized four element arrays joined by
     * MESSAGE_SEPARATOR; entries that do not unserialize to such an array
     * are skipped.
     *
     * @param string $emails_string contents of a mailer batch file
     */
    function sendSerializedEmails($emails_string)
    {
        $emails = explode(self::MESSAGE_SEPARATOR, $emails_string);
        foreach ($emails as $serialized_email) {
            if ($serialized_email === "") {
                continue;
            }
            // NOTE(review): unserialize() is applied to data that may have
            // been fetched over the network; confirm the source is trusted
            $email = unserialize($serialized_email);
            // unserialize gives false on bad input, which is not countable
            if(is_array($email) && count($email) == 4) {
                crawlLog("Sending email to {$email[2]} about {$email[0]}");
                $this->mail_server->sendImmediate(
                    $email[0], $email[1], $email[2], $email[3]);
            }
        }
    }
    /**
     * Function to send emails to mailer batches created by
     * mail_server. This function would periodically be invoked and
     * send emails reading data from the text files. On the name server in
     * non-distributed mode one sufficiently old batch file is read from
     * the local mail folder; otherwise a batch is requested from the name
     * server, sent, and then removed from the name server.
     */
    function sendGroupNotificationEmailsInBatches()
    {
        crawlLog("Checking for mailer files to be sent out...");
        $current_machine = $this->sourceModel->getCurrentMachine();
        if ($current_machine == crawlHash(NAME_SERVER) &&
            MEDIA_MODE != 'distributed') {
            $mail_directory = WORK_DIRECTORY . self::MAIL_FOLDER;
            if(!file_exists($mail_directory)) { return; }
            $files = $this->listPaths($mail_directory."/*.txt");
            // pick the first batch older than MAX_MAIL_TIMESTAMP_LIMIT
            $sendable_file = false;
            foreach($files as $email_file) {
                if(time() - filemtime($email_file) > MAX_MAIL_TIMESTAMP_LIMIT) {
                    $sendable_file = $email_file;
                    break;
                }
            }
            if(!$sendable_file) {
                return;
            }
            $emails_string = file_get_contents($sendable_file);
            unlink($sendable_file);
            $this->sendSerializedEmails($emails_string);
        } else {
            $file_name = $this->sourceModel->requestFileForMailingList();
            if(empty($file_name)){
                crawlLog("...Could not get any response from name server!");
                return;
            }
            $time = time();
            $session = md5($time . AUTH_KEY);
            $server = NAME_SERVER;
            /* Download the file from the server */
            $request = "$server?c=resource&a=get&time=$time&session=$session".
                "&f=schedules&n=" . urlencode($file_name) . "&sf=mail";
            $emails_string = FetchUrl::getPage($request, NULL, true);
            if(!$emails_string) {
                crawlLog("  No mail data returning");
                return;
            }
            $this->sendSerializedEmails($emails_string);
            /* Tell the name server that this batch has been handled */
            $request = "$server?c=resource&a=removeMailingListFile".
                "&time=$time&session=$session&n=".urlencode($file_name);
            FetchUrl::getPage($request, NULL, true);
        }
    }
}
/*
 * Instantiate and run the MediaUpdater program. start() handles the
 * command-line arguments and then enters the updater's main loop.
 */
$media_updater =  new MediaUpdater();
$media_updater->start();
ViewGit