Last commit for bin/media_updater.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]
First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage bin
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2015
 * @filesource
 */
// This script is a command-line daemon; refuse to run under any other SAPI.
if(php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
/**
 * Calculate base directory of script: the directory containing this script
 * with its trailing "/bin" removed
 * @ignore
 */
define("BASE_DIR", substr(
    dirname(realpath($_SERVER['PHP_SELF'])), 0,
    -strlen("/bin")));
ini_set("memory_limit", "1300M");
/** Load in global configuration settings */
require_once BASE_DIR.'/configs/config.php';
// Without a configured profile there is nothing to update, so bail out early.
if(!PROFILE) {
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
/** CRAWLING means don't try to use memcache
 * @ignore
 */
define("NO_CACHE", true);
/** We do want logging, but the crawl model and others will try to turn it
 * off if we don't set this
 */
define("NO_LOGGING", false);
/**
 * Shortest time (in seconds) through one iteration of the news updater's
 * loop; the loop sleeps for whatever remains of this time
 */
define("MINIMUM_UPDATE_LOOP_TIME", 10);
/** for crawlDaemon function */
require_once BASE_DIR."/lib/crawl_daemon.php";
/** To guess language based on page encoding */
require_once BASE_DIR."/lib/locale_functions.php";
/** Loads common constants for web crawling */
require_once BASE_DIR."/lib/crawl_constants.php";
/** Used to reparse wiki pages after a media conversion */
require_once BASE_DIR."/lib/wiki_parser.php";
/** Load base model class used by source model */
require_once BASE_DIR."/models/model.php";
/**
 * Source model is used to manage news feed sites. A copy under APP_DIR,
 * if present, takes precedence over the one shipped in BASE_DIR.
 */
if(file_exists(APP_DIR."/models/source_model.php")) {
    require_once APP_DIR."/models/source_model.php";
}  else {
    require_once BASE_DIR."/models/source_model.php";
}
/**
 * Group model is used to manage media conversion. A copy under APP_DIR,
 * if present, takes precedence over the one shipped in BASE_DIR.
 */
if(file_exists(APP_DIR."/models/group_model.php")) {
    require_once APP_DIR."/models/group_model.php";
}  else {
    require_once BASE_DIR."/models/group_model.php";
}
/*
 * We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
if (!function_exists('lcfirst')) {
    /**
     * Returns a copy of a string whose first character has been lower cased
     *
     * Fallback definition: it is only declared when the running PHP does not
     * already provide lcfirst (i.e., PHP versions before 5.3)
     * @param string $str  string whose first letter should be lower cased
     * @return string $str with its first character in lower case
     */
    function lcfirst($str)
    {
        $head = strtolower(substr($str, 0, 1));
        $tail = substr($str, 1);
        return (string)($head . $tail);
    }
}
/**
 * Separate process/command-line script which can be used to update
 * news sources for Yioop and also handle other kinds of activities such as
 * video conversion. This is an alternative to using the web app
 * for updating. Makes use of the web app's code.
 *
 * @author Chris Pollett
 * @package seek_quarry
 */
class MediaUpdater implements CrawlConstants
{
    /**
     * The last time feeds were checked for updates
     * @var int
     */
    var $update_time;
    /**
     * Set to 0 by the constructor; not read anywhere else in this class
     * @var int
     */
    var $delete_time;
    /**
     * Set to 0 by the constructor; not read anywhere else in this class
     * @var int
     */
    var $retry_time;
    /**
     * Model used to fetch feed items and rebuild the feed shard.
     * Created in start()
     * @var object
     */
    var $sourceModel;
    /**
     * Model used to look up and re-save the wiki page a converted media
     * file belongs to. Created in start()
     * @var object
     */
    var $groupModel;
    /**
     * Sets up the field variables so that media updating can begin
     */
    function __construct()
    {
        $this->delete_time = 0;
        $this->retry_time = 0;
        $this->update_time = 0;
    }
    /**
     * This is the function that should be called to get the media_updater to
     * start updating. Calls init to handle the command-line
     * arguments, creates the models used by the updater, and then enters
     * the updater's main loop
     */
    function start()
    {
        global $argv;
        CrawlDaemon::init($argv, "media_updater");
        crawlLog("\n\nInitialize logger..", "media_updater", true);
        $this->sourceModel = new SourceModel();
        $this->groupModel = new GroupModel();
        $this->loop();
    }

    /**
     * Main loop for the news updater. Each pass does a news update check
     * followed by a media conversion check, then sleeps for whatever is left
     * of MINIMUM_UPDATE_LOOP_TIME. The loop ends when
     * CrawlDaemon::processHandler() returns a false value.
     */
    function loop()
    {
        crawlLog("In News Update Loop");
        while (CrawlDaemon::processHandler()) {
            /* string form of microtime() is deliberately kept here, it is
               what is handed on to changeInMicrotime() below
             */
            $start_time = microtime();
            crawlLog("Checking if news feeds should be updated...");
            $this->newsUpdate();
            $this->mediaConversion();
            $sleep_time = max(0, ceil(
                MINIMUM_UPDATE_LOOP_TIME - changeInMicrotime($start_time)));
            if($sleep_time > 0) {
                crawlLog("Ensure minimum loop time by sleeping...".$sleep_time);
                sleep($sleep_time);
            }
        } //end while
        crawlLog("News Updater shutting down!!");
    }
    /**
     * If more than ONE_HOUR has passed since the last update, then updates
     * the news feeds associated with this Yioop instance and rebuilds the
     * feed shard. Otherwise, just logs that no update was needed.
     */
    function newsUpdate()
    {
        $time = time();
        if($time - $this->update_time <= ONE_HOUR) {
            crawlLog("No updates needed.");
            return;
        }
        $this->update_time = $time;
        crawlLog("Performing news feeds update");
        $this->sourceModel->updateFeedItems(ONE_WEEK);
        // feed items changed, so the shard has to be rebuilt
        crawlLog("Deleting feed items and rebuild shard...");
        $this->sourceModel->rebuildFeedShard(ONE_WEEK);
        crawlLog("... delete complete, shard rebuilt");
    }
    /**
     *  Checks for schedules of media to convert in the media_convert
     *  schedules folder. If it finds a schedule, it reads it and tries to
     *  do the conversions. For now the conversions are limited to creating
     *  webm and mp4 files corresponding to a video file listed in a schedule.
     *  For this method to run, ffmpeg must be installed and the constant
     *  FFMPEG must be set to the path to ffmpeg.
     *
     *  A schedule is a .txt file whose first four lines are, in order: the
     *  id of the wiki page, the folder of the media file, the folder for
     *  thumbnails, and the name of the media file. At most one schedule is
     *  handled per call, and it is deleted as soon as it has been read.
     */
    function mediaConversion()
    {
        $convert_folder = WORK_DIRECTORY. "/schedules/media_convert";
        if(!defined('FFMPEG') || !file_exists($convert_folder)) { return; }
        crawlLog("Checking for media files to convert...");
        $to_convert = glob($convert_folder. "/*.txt");
        if(!isset($to_convert[0])) {
            crawlLog("...No media files found.");
            return;
        }
        $convert_file_struct = file($to_convert[0]);
        /* schedule is removed before validation so that a bad schedule is
           not retried on every pass through the loop
         */
        unlink($to_convert[0]);
        // all four entries destructured below must be present
        if(!isset($convert_file_struct[3])) {
            crawlLog("...Media conversion file mis-formatted... skipping");
            return;
        }
        $group_model = $this->groupModel;
        list($page_id, $media_path, $thumb_path, $media_file_name) =
            $convert_file_struct;
        $page_id = trim($page_id);
        $info = $group_model->getPageInfoByPageId($page_id);
        if(!$info) {
            crawlLog("...Media page lookup failed ... skipping");
            return;
        }
        $group_id = $info['GROUP_ID'];
        $locale_tag = $info['LOCALE_TAG'];
        $page_name = $info['PAGE_NAME'];
        $full_info = $group_model->getPageInfoByName($group_id, $page_name,
            $locale_tag, 'edit');
        if(!$full_info) {
            crawlLog("...Full Media page lookup failed ... skipping");
            return;
        }
        $media_path = trim($media_path);
        $thumb_path = trim($thumb_path);
        /* file() keeps line terminators; strip only those so a file name
           containing other leading/trailing whitespace is left intact
         */
        $media_file_name = rtrim($media_file_name, "\r\n");
        $media_base_name = $media_file_name;
        if(($pos = strrpos($media_file_name, ".")) !== false) {
            $media_base_name = substr($media_file_name, 0, $pos);
        }
        $input_media_file = "$media_path/$media_file_name";
        $mp4_file = "$media_path/$media_base_name.mp4";
        $webm_file = "$media_path/$media_base_name.webm";
        // skip a conversion when the input already is the target file
        if($input_media_file != $mp4_file) {
            crawlLog("Converting " . $media_file_name . " to mp4...");
            $this->convertAndMakeThumb($input_media_file, $mp4_file,
                "-vcodec h264 -acodec aac -preset veryfast -crf 28 -strict -2",
                "$thumb_path/$media_base_name.mp4.jpg");
        }
        if($input_media_file != $webm_file) {
            crawlLog("Converting " . $media_file_name . " to webm...");
            $this->convertAndMakeThumb($input_media_file, $webm_file,
                "-vcodec libvpx", "$thumb_path/$media_base_name.webm.jpg");
        }
        crawlLog("...media file conversion done...");
        $parser = new WikiParser();
        $parsed_page = $parser->parse($full_info["PAGE"]);
        if($parsed_page) {
            crawlLog("... re-parsing media page...");
            $parsed_page = $group_model->insertResourcesParsePage($group_id,
                $page_id, $locale_tag, $parsed_page);
            $sql = "UPDATE GROUP_PAGE SET PAGE=? WHERE ID = ?";
            $group_model->db->execute($sql, array($parsed_page, $page_id));
        }
    }
    /**
     * Runs ffmpeg to convert a video into a target format and then runs it
     * again to grab one frame of the converted video as a jpg thumbnail.
     *
     * All file paths are passed through escapeshellarg() as they come from
     * a schedule file and may contain shell metacharacters.
     * NOTE(review): escapeshellarg() depends on the current locale; confirm
     * LC_CTYPE is a UTF-8 locale if media names can contain non-ASCII
     * characters.
     *
     * @param string $input_file path of the video to convert
     * @param string $output_file path the converted video is written to
     * @param string $codec_options trusted ffmpeg options selecting the
     *     codecs of the output; inserted into the command line as is
     * @param string $thumb_file path the thumbnail image is written to
     */
    private function convertAndMakeThumb($input_file, $output_file,
        $codec_options, $thumb_file)
    {
        $ffmpeg = FFMPEG . " -i " . escapeshellarg($input_file) . " -y " .
            $codec_options . " " . escapeshellarg($output_file);
        $output = system($ffmpeg);
        CrawlDaemon::processHandler(true); //prevent timeout killing process
        $make_thumb_string =
            FFMPEG . " -i " . escapeshellarg($output_file) .
            " -vframes 1 -map 0:v:0".
            " -vf \"scale=".THUMB_DIM.":".THUMB_DIM."\" ".
            escapeshellarg($thumb_file) . " 2>&1";
        crawlLog($output);
        crawlLog("Making thumb with $make_thumb_string");
        exec($make_thumb_string);
        // the file name is the second argument, the first is a boolean flag
        clearstatcache(true, $thumb_file);
    }
}
/*
 * Instantiates and runs the MediaUpdater program. start() processes the
 * command-line arguments and then enters the updater's main loop.
 */
$media_updater =  new MediaUpdater();
$media_updater->start();

?>
ViewGit