<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2015
 * @filesource
 */
if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
/**
 * Calculate base directory of script
 * @ignore
 */
define("BASE_DIR", substr(
    dirname(realpath($_SERVER['PHP_SELF'])), 0,
    -strlen("/bin")));
ini_set("memory_limit", "1300M");
/** Load in global configuration settings */
require_once BASE_DIR.'/configs/config.php';
if (!PROFILE) {
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}
/** CRAWLING means don't try to use memcache
 * @ignore
 */
define("NO_CACHE", true);
/** We do want logging, but crawl model and other will try to turn off
 * if we don't set this
 */
define("NO_LOGGING", false);
/**
 * Shortest time through one iteration of news updater's loop
 */
define("MINIMUM_UPDATE_LOOP_TIME", 10);
/** for crawlDaemon function */
require_once BASE_DIR."/lib/crawl_daemon.php";
/** To guess language based on page encoding */
require_once BASE_DIR."/lib/locale_functions.php";
/** Loads common constants for media convert files*/
require_once BASE_DIR."/lib/media_constants.php";
/** Used to manage the process of sending emails to users */
require_once BASE_DIR."/lib/mail_server.php";
/** Loads common constants for web crawling*/
require_once BASE_DIR."/lib/crawl_constants.php";
/** Used to reparse wiki pages after a media conversion */
require_once BASE_DIR."/lib/wiki_parser.php";
/**Load base model class used by source model */
require_once BASE_DIR."/models/model.php";
/** UrlParser is used to get file extensions*/
require_once BASE_DIR."/lib/url_parser.php";
/** Source model is used to manage news feed sites*/
if (file_exists(APP_DIR."/models/source_model.php")) {
    require_once APP_DIR."/models/source_model.php";
} else {
    require_once BASE_DIR."/models/source_model.php";
}
/** Group model is used to manage media conversion*/
if (file_exists(APP_DIR."/models/group_model.php")) {
    require_once APP_DIR."/models/group_model.php";
} else {
    require_once BASE_DIR."/models/group_model.php";
}
/*
 * We'll set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
if (function_exists('lcfirst') === false) {
    /**
     * Lower cases the first letter in a string
     *
     * This function is only defined if the PHP version is before 5.3
     * @param string $str string to be lower cased
     * @return string the lower cased string
     */
    function lcfirst( $str )
    {
        return (string)(strtolower(substr($str, 0, 1)).substr($str, 1));
    }
}
/**
 * Separate process/command-line script which can be used to update
 * news sources for Yioop and also handle other kinds of activities such as
 * video conversion and sending batched notification emails. This is an
 * alternative to using the web app for updating. Makes use of the
 * web-app's code.
 *
 * @author Chris Pollett
 * @package seek_quarry\bin
 */
class MediaUpdater implements CrawlConstants, MediaConstants
{
    /**
     * The last time feeds were checked for updates
     * @var int
     */
    var $update_time;
    /**
     * Initialized to 0 by the constructor; not otherwise used in this class
     * @var int
     */
    var $delete_time;
    /**
     * Initialized to 0 by the constructor; not otherwise used in this class
     * @var int
     */
    var $retry_time;
    /**
     * Controls whether media updating should be viewed as only occurring
     * on the name server or should it be viewed as a distributed process
     * amongst all machines in this Yioop instance
     * @var string
     */
    var $media_mode;
    /**
     * Supported file types of videos that we can convert to mp4.
     * @var array
     */
    var $video_convert_types = array("mov", "avi");
    /**
     * Mail Server object used to send mails from media updater
     * @var object
     */
    var $mail_server;
    /**
     * SourceModel instance created in start(); used for feed updates,
     * name server requests and (through its db member) recursive deletes
     * @var object
     */
    var $sourceModel;
    /**
     * GroupModel instance created in start()
     * @var object
     */
    var $groupModel;
    /**
     * Sets up the field variables so that media updating can begin
     */
    function __construct()
    {
        $this->delete_time = 0;
        $this->retry_time = 0;
        $this->update_time = 0;
        $this->media_mode = "name_server";
        $this->mail_server = new MailServer(MAIL_SENDER, MAIL_SERVER,
            MAIL_SERVERPORT, MAIL_USERNAME, MAIL_PASSWORD, MAIL_SECURITY);
    }
    /**
     * This is the function that should be called to get the media_updater
     * to start updating. Calls CrawlDaemon::init to handle the command-line
     * arguments, creates the models and then enters the main loop
     */
    function start()
    {
        global $argv;
        CrawlDaemon::init($argv, "media_updater");
        crawlLog("\n\nInitialize logger..", "media_updater", true);
        $this->sourceModel = new SourceModel();
        $this->groupModel = new GroupModel();
        $this->loop();
    }
    /**
     * Main loop for the media updater. Each pass refreshes the updater
     * properties, updates news feeds, processes videos and sends batched
     * emails; a pass is padded with sleep so that it lasts at least
     * MINIMUM_UPDATE_LOOP_TIME seconds. Runs for as long as
     * CrawlDaemon::processHandler() returns true.
     */
    function loop()
    {
        crawlLog("In Media Update Loop");
        while (CrawlDaemon::processHandler()) {
            $start_time = microtime();
            $this->getUpdateProperties();
            $this->newsUpdate();
            $this->videoUpdate();
            $this->sendGroupNotificationEmailsInBatches();
            // sleep() wants an int; ceil() hands back a float
            $sleep_time = (int)max(0, ceil(
                MINIMUM_UPDATE_LOOP_TIME - changeInMicrotime($start_time)));
            if ($sleep_time > 0) {
                crawlLog("Ensure minimum loop time by sleeping...".
                    $sleep_time);
                sleep($sleep_time);
            }
        } //end while
        crawlLog("Media Updater shutting down!!");
    }
    /**
     * Makes a request to the name server to find out if we are running
     * as a media updater just on the name server or on both the name server
     * as well as all other machines in the Yioop instance. If the response
     * carries a MEDIA_MODE entry it is stored in $this->media_mode.
     */
    function getUpdateProperties()
    {
        crawlLog("Checking Name Server for Media Updater properties...");
        $current_machine = $this->sourceModel->getCurrentMachine();
        $pre_properties = $this->sourceModel->execMachines(
            "getUpdateProperties", array(NAME_SERVER), $current_machine);
        if (isset($pre_properties[0][self::PAGE])) {
            $properties =
                unserialize(webdecode($pre_properties[0][self::PAGE]));
            // unserialize gives false on bad data; only trust an array
            if (is_array($properties) && isset($properties['MEDIA_MODE'])) {
                $this->media_mode = $properties['MEDIA_MODE'];
                crawlLog("...Setting media mode to: " .
                    $properties['MEDIA_MODE']);
            }
        }
        crawlLog("Done checking Name Server for Media Updater properties");
    }
    /**
     * If more than ONE_HOUR has passed since the last update, then updates
     * the news feeds associated with this Yioop instance and rebuilds the
     * feed shard; otherwise just logs that nothing was needed.
     */
    function newsUpdate()
    {
        crawlLog("Checking for News Updates...");
        $time = time();
        // every hour get items from feeds
        if ($time - $this->update_time <= ONE_HOUR) {
            crawlLog("No updates needed.");
            return;
        }
        $this->update_time = $time;
        crawlLog("Performing news feeds update");
        $this->sourceModel->updateFeedItems(ONE_WEEK, $this->media_mode);
        /* feeds changed so rebuild shard */
        crawlLog("Deleting feed items and rebuild shard...");
        $this->sourceModel->rebuildFeedShard(ONE_WEEK);
        crawlLog("... delete complete, shard rebuilt");
    }
    /**
     * This function calls the video updater functionality.
     * When the current machine is the name server it splits videos, moves
     * fully converted folders, generates assemble files and concatenates
     * segments. On any other machine it asks the name server for a segment
     * to convert.
     */
    function videoUpdate()
    {
        crawlLog("Checking for video files to process...");
        $current_machine = $this->sourceModel->getCurrentMachine();
        if ($current_machine == crawlHash(NAME_SERVER)) {
            $this->splitVideos();
            $this->moveVideoFoldersToConvertedDirectory();
            $this->generateAssembleVideoFile();
            $this->concatenateVideos();
        } else {
            $this->checkGetConvertVideos();
        }
        crawlLog("Video updates done!...");
    }
    /**
     * Checks if it has been more than two minutes since the last time
     * a file was modified
     *
     * @param string $file_name file to check
     * @return bool true if the file does not exist or was last modified
     *     at least two minutes ago; false otherwise
     */
    function isNoLongerModified($file_name)
    {
        if (file_exists($file_name)) {
            // first argument is the realpath-cache flag, second the file
            clearstatcache(true, $file_name);
            if (time() < filemtime($file_name) + 2 * ONE_MINUTE) {
                return false;
            }
        }
        return true;
    }
    /**
     * Generates a thumbnail from a video file assuming FFMPEG
     *
     * NOTE(review): both paths are placed in the shell command inside
     * double quotes without further escaping.
     *
     * @param string $video_name full name and path of video file to make
     *     thumbnail from
     * @param string $thumb_name full name and path for thumbnail file
     */
    function thumbFileFromVideo($video_name, $thumb_name)
    {
        $make_thumb_string =
            FFMPEG." -i \"$video_name\" -vframes 1 -map 0:v:0".
            " -vf \"scale=".THUMB_DIM.":".THUMB_DIM."\" ".
            "\"$thumb_name\" 2>&1";
        crawlLog("Making thumb with $make_thumb_string");
        exec($make_thumb_string);
        clearstatcache(true, $thumb_name);
    }
    /**
     * Splits a video into segments of 300 seconds (5 minutes). Segment
     * files are named with a numeric prefix followed by the original name.
     *
     * NOTE(review): the paths are placed in the shell command inside
     * double quotes without further escaping.
     *
     * @param string $file_path full path of the folder holding the video
     *     file to be split
     * @param string $file_name name of video file along with extension
     * @param string $destination_directory name of the directory where
     *     the split files will be produced
     */
    function splitVideo($file_path, $file_name, $destination_directory)
    {
        crawlLog("Splitting $file_path/$file_name...");
        $extension = "." . UrlParser::getDocumentType($file_name, "");
        $new_name = substr($file_name, 0, -strlen($extension));
        $ffmpeg = FFMPEG." -i \"$file_path/$file_name\" ".
            " -acodec copy -f segment -segment_time 300 ".
            "-vcodec copy -reset_timestamps 1 -map 0 ".
            "\"$destination_directory/%d$new_name$extension\"";
        crawlLog($ffmpeg);
        exec($ffmpeg);
    }
    /**
     * Looks through all the video directories present in the media convert
     * folder and splits every one that carries a split marker file. After
     * splitting, the marker is removed and the number of segments produced
     * is written to the directory's count file.
     */
    function splitVideos()
    {
        $convert_folder = WORK_DIRECTORY.self::CONVERT_FOLDER;
        if (!defined('FFMPEG') || !file_exists($convert_folder)) {
            return;
        }
        crawlLog(" Looking for video files to split...");
        $type_string = "{" . implode(",", $this->video_convert_types) . "}";
        foreach ($this->listPaths($convert_folder."/*") as $video_path) {
            /* a directory without a split marker must only be skipped;
               returning here would starve every directory after it
             */
            if (!is_dir($video_path) ||
                !file_exists($video_path.self::SPLIT_FILE)) {
                continue;
            }
            crawlLog("Splitting the video $video_path");
            $lines = $this->readFileInfo($video_path);
            if ($lines === false) {
                crawlLog("...missing or malformed info file, skipping");
                continue;
            }
            $folder_name = rtrim($lines[1]);
            $file_name = rtrim($lines[3]);
            crawlLog("$folder_name : $file_name");
            if ($folder_name && $file_name) {
                $this->splitVideo($folder_name, $file_name, $video_path);
                unlink($video_path . self::SPLIT_FILE);
                $segments = $this->listPaths(
                    $video_path . "/*.$type_string", GLOB_BRACE);
                file_put_contents($video_path . self::COUNT_FILE,
                    count($segments));
            }
        }
    }
    /**
     * Looks through all the video directories present in the media convert
     * folder and, when the number of mp4 files in the matching directory
     * under the converted folder equals the recorded segment count, moves
     * the count and info files there and deletes the convert directory.
     */
    function moveVideoFoldersToConvertedDirectory()
    {
        crawlLog("Moving video folders from media_convert to converted...");
        $convert_folder = WORK_DIRECTORY.self::CONVERT_FOLDER;
        $converted_folder = WORK_DIRECTORY.self::CONVERTED_FOLDER;
        if (!file_exists($converted_folder)) {
            mkdir($converted_folder);
        }
        foreach ($this->listPaths($convert_folder."/*") as $video_path) {
            crawlLog("  Video Path : $video_path");
            $count_file = $video_path . self::COUNT_FILE;
            /* no count file means the video has not been split yet; a
               failed read would compare equal to a converted count of 0
               and the unsplit folder would be deleted
             */
            if (!is_dir($video_path) || !file_exists($count_file)) {
                continue;
            }
            $actual_count = (int)trim(file_get_contents($count_file));
            crawlLog("  Actual_count : $actual_count");
            $timestamp_files = $this->listPaths($video_path."/*.time.txt");
            $checked_out = count($timestamp_files);
            crawlLog("  Checked out count : $checked_out");
            $video_folder = str_replace($convert_folder."/", "",
                $video_path);
            $converted_video_path = $converted_folder . "/" . $video_folder;
            $converted_count = count($this->listPaths(
                $converted_video_path . "/*.{mp4}", GLOB_BRACE));
            crawlLog("  Converted count : $converted_count");
            if ($converted_count == $actual_count) {
                crawlLog("  Conversion of segments complete!");
                // the target only exists once a segment was uploaded
                if (is_dir($converted_video_path)) {
                    rename($count_file,
                        $converted_video_path . self::COUNT_FILE);
                    rename($video_path . self::FILE_INFO,
                        $converted_video_path . self::FILE_INFO);
                }
                $this->sourceModel->db->unlinkRecursive($video_path);
            }
        }
    }
    /**
     * Looks through all the converted video directories and, for each one
     * whose mp4 count equals its recorded segment count, generates the
     * assemble file listing the segments to be concatenated. Directories
     * that already have a concatenated marker or an assemble file, or that
     * have no count file, are skipped.
     */
    function generateAssembleVideoFile()
    {
        crawlLog("Inside generateAssembleVideoFile function...");
        $converted_folder = WORK_DIRECTORY.self::CONVERTED_FOLDER;
        if (!file_exists($converted_folder)) {
            mkdir($converted_folder);
        }
        foreach ($this->listPaths($converted_folder."/*") as $video_path) {
            if (file_exists($video_path . self::CONCATENATED_FILE) ||
                file_exists($video_path . self::ASSEMBLE_FILE) ||
                !file_exists($video_path . self::COUNT_FILE)) {
                continue;
            }
            $actual_count = file_get_contents($video_path.self::COUNT_FILE);
            $video_segments = $this->listPaths($video_path . "/*.mp4");
            if ($actual_count != count($video_segments)) {
                continue;
            }
            /* segments carry a numeric prefix (0name, 1name, ..., 10name);
               glob's alphabetical order would put 10name before 2name, so
               sort naturally to keep the playback order
             */
            natsort($video_segments);
            $assemble_lines = "";
            foreach ($video_segments as $video_segment) {
                $assemble_lines .= "file '" . str_replace($video_path . "/",
                    "", $video_segment) . "'" . PHP_EOL;
            }
            // with no segments nothing is written, so no merge is tried
            if ($assemble_lines != "") {
                file_put_contents($video_path . self::ASSEMBLE_FILE,
                    $assemble_lines);
            }
        }
    }
    /**
     * Concatenates split video files to generate one mp4 video file
     *
     * NOTE(review): the paths are placed in the shell command inside
     * double quotes without further escaping.
     *
     * @param string $text_file_name path of the file containing the
     *     relative paths of the files to be concatenated
     * @param string $file_name name of the original video file; its
     *     extension is replaced by .mp4 to name the output file
     * @param string $destination_directory name of the directory where
     *     the concatenated file will be produced
     * @return bool true if the output file exists after running FFMPEG,
     *     false otherwise (including when the list file is missing)
     */
    function mergeVideo($text_file_name , $file_name,
        $destination_directory)
    {
        if (!file_exists($text_file_name)) {
            return false;
        }
        $extension = "." . UrlParser::getDocumentType($file_name, "");
        $new_name = substr($file_name, 0, -strlen($extension));
        $generate_output = $destination_directory."/$new_name.mp4";
        $ffmpeg = FFMPEG." -f concat -i \"$text_file_name\" -c copy ".
            "\"$generate_output\"";
        crawlLog($ffmpeg);
        exec($ffmpeg);
        return file_exists($generate_output);
    }
    /**
     * Looks through each converted video directory that has an assemble
     * file and merges its segments into one video. On success the
     * directory is deleted and a thumbnail is generated.
     */
    function concatenateVideos()
    {
        crawlLog("Concatenating videos...");
        $converted_folder = WORK_DIRECTORY.self::CONVERTED_FOLDER;
        if (!file_exists($converted_folder)) {
            mkdir($converted_folder);
        }
        foreach ($this->listPaths($converted_folder."/*") as $video_path) {
            crawlLog("  Video Path " . $video_path);
            $assemble_file = $video_path . self::ASSEMBLE_FILE;
            if (!is_dir($video_path) || !file_exists($assemble_file)) {
                continue;
            }
            $lines = $this->readFileInfo($video_path);
            if ($lines === false) {
                crawlLog("  ...missing or malformed info file, skipping");
                continue;
            }
            $folder = trim($lines[1]);
            $thumb_folder = trim($lines[2]);
            $file_name = trim($lines[3]);
            if (!$this->mergeVideo($assemble_file, $file_name, $folder)) {
                continue;
            }
            $this->sourceModel->db->unlinkRecursive($video_path);
            $video_name = $folder. "/" . $file_name;
            $extension_len = strlen(
                UrlParser::getDocumentType($video_name));
            $file_prefix = substr($file_name, 0, -$extension_len - 1);
            $thumb_file_name = $file_prefix . ".mp4.jpg";
            $thumb_name = $thumb_folder . "/" . $thumb_file_name;
            $this->thumbFileFromVideo($video_name, $thumb_name);
        }
    }
    /**
     * Converts an avi or mov file to mp4 format. The output is written
     * next to the input with the extension replaced by .mp4. Files with
     * any other extension are logged and left untouched.
     *
     * NOTE(review): the paths are placed in the shell command inside
     * double quotes without further escaping.
     *
     * @param string $file_name full path of the file.
     */
    function convertVideo($file_name)
    {
        $extension = "." . UrlParser::getDocumentType($file_name, "");
        $new_name = substr($file_name, 0, -strlen($extension));
        switch ($extension) {
            case '.mov':
                $ffmpeg = FFMPEG." -i \"$file_name\" ".
                    " -vcodec h264 -acodec aac -preset veryfast -crf 28 ".
                    "-strict -2 \"$new_name.mp4\"";
                break;
            case '.avi':
                $ffmpeg = FFMPEG." -i \"$file_name\" ".
                    " -vcodec libx264 -preset slow -acodec aac -crf 28 ".
                    "-strict experimental -b:a 192k -ac 2 ".
                    "\"$new_name.mp4\"";
                break;
            default:
                // no command to build, and exec() rejects an empty one
                crawlLog("Cannot convert $file_name: unsupported type");
                return;
        }
        crawlLog($ffmpeg);
        exec($ffmpeg);
    }
    /**
     * Checks the name server for a video segment to convert. If there is
     * one, downloads it, converts the mov or avi segment to an mp4 file
     * and uploads the result back to the name server. videoUpdate() only
     * calls this on machines that are not the name server.
     */
    function checkGetConvertVideos()
    {
        crawlLog("Checking Name Server for video segments to convert..");
        $convert_folder = WORK_DIRECTORY . self::CONVERT_FOLDER;
        if (!file_exists($convert_folder)) {
            @mkdir($convert_folder);
            if (!file_exists($convert_folder)) {
                crawlLog("Unable to create $convert_folder. Bailing!");
                return;
            }
        }
        $db = $this->sourceModel->db;
        // clear out whatever an earlier pass left behind
        $folders = $this->listPaths($convert_folder."/*", GLOB_ONLYDIR);
        foreach ($folders as $folder) {
            $db->unlinkRecursive($folder);
        }
        if (!defined('FFMPEG')) {
            // do not take a segment we have no means of converting
            crawlLog("FFMPEG is not configured, cannot convert videos.");
            return;
        }
        $time = time();
        $session = md5($time . AUTH_KEY);
        $server = NAME_SERVER;
        $folder_file = $this->sourceModel->requestFileForConversion();
        if (!$folder_file) {
            crawlLog("No files on server to convert!");
            return;
        }
        // response is read as folder:file
        $info = explode(":", $folder_file, 2);
        if (count($info) < 2) {
            crawlLog("Unexpected response from server: $folder_file");
            return;
        }
        $folder_name = trim($info[0]);
        $file_name = trim($info[1]);
        /* Download the file from the server */
        $request = "$server?c=resource&a=get&time=$time&session=$session".
            "&f=schedules&n=". urlencode($file_name)."&sf=$folder_name";
        $data = FetchUrl::getPage($request, NULL, true);
        if (!$data) {
            crawlLog("Unable to download segment. Will try again later");
            return;
        }
        $convert_path = $convert_folder."/".$folder_name;
        if (file_exists($convert_path)) {
            $db->unlinkRecursive($convert_path);
        }
        mkdir($convert_path);
        $downloaded_file = $convert_path . "/" . $file_name;
        file_put_contents($downloaded_file, $data);
        $this->convertVideo($downloaded_file);
        $files = $this->listPaths($convert_path . "/*.{mp4}", GLOB_BRACE);
        if (empty($files)) {
            crawlLog("Will try to convert the file again later");
            return;
        }
        $converted_file_name = substr($files[0],
            strlen($convert_path) + 1);
        /* Upload the file to the server */
        $file_data = file_get_contents($files[0]);
        $post_data = array();
        $post_data['c'] = "resource";
        $post_data['a'] = "uploadConvertedVideoFile";
        $post_data['time'] = $time;
        $post_data['session'] = $session;
        $post_data['data'] = webencode($file_data);
        $post_data['file_name'] = webencode($converted_file_name);
        $post_data['folder_name'] = webencode($folder_name);
        crawlLog("Attempting to upload converted video file...");
        $response = FetchUrl::getPage($server, $post_data , true);
        crawlLog("...server response:\n $response");
    }
    /**
     * Sends the emails stored in mailer batch files. On the name server,
     * when MEDIA_MODE is not 'distributed', one batch file older than
     * MAX_MAIL_TIMESTAMP_LIMIT seconds is read from the local mail folder,
     * deleted and sent. Otherwise a batch file name is requested from the
     * name server, its contents are downloaded and sent, and the name
     * server is then asked to remove that file.
     */
    function sendGroupNotificationEmailsInBatches()
    {
        crawlLog("Checking for mailer files to be sent out...");
        $current_machine = $this->sourceModel->getCurrentMachine();
        if ($current_machine == crawlHash(NAME_SERVER) &&
            MEDIA_MODE != 'distributed') {
            $mail_directory = WORK_DIRECTORY . self::MAIL_FOLDER;
            if (!file_exists($mail_directory)) {
                return;
            }
            $sendable_file = false;
            foreach ($this->listPaths($mail_directory."/*.txt") as
                $email_file) {
                if (time() - filemtime($email_file) >
                    MAX_MAIL_TIMESTAMP_LIMIT) {
                    $sendable_file = $email_file;
                    break;
                }
            }
            if (!$sendable_file) {
                return;
            }
            $emails_string = file_get_contents($sendable_file);
            // remove the file that was read, not the loop variable
            unlink($sendable_file);
            $this->sendSerializedEmails($emails_string);
            return;
        }
        $file_name = $this->sourceModel->requestFileForMailingList();
        if (empty($file_name)) {
            crawlLog("...Could not get any response from name server!");
            return;
        }
        $time = time();
        $session = md5($time . AUTH_KEY);
        $server = NAME_SERVER;
        /* Download the file from the server */
        $request = "$server?c=resource&a=get&time=$time&session=$session".
            "&f=schedules&n=" . urlencode($file_name) . "&sf=mail";
        $emails_string = FetchUrl::getPage($request, NULL, true);
        if (!$emails_string) {
            crawlLog("  No mail data returning");
            return;
        }
        $this->sendSerializedEmails($emails_string);
        $request = "$server?c=resource&a=removeMailingListFile".
            "&time=$time&session=$session&n=".urlencode($file_name);
        FetchUrl::getPage($request, NULL, true);
    }
    /**
     * Splits a batch of serialized emails on self::MESSAGE_SEPARATOR and
     * sends each one that unserializes to a four element array via
     * $this->mail_server->sendImmediate(); other chunks are ignored.
     *
     * NOTE(review): the batch is passed to unserialize(); this assumes the
     * local mail folder and the name server are trusted sources.
     *
     * @param string $emails_string contents of a mailer batch file
     */
    private function sendSerializedEmails($emails_string)
    {
        $emails = explode(self::MESSAGE_SEPARATOR, (string)$emails_string);
        foreach ($emails as $serialized_email) {
            if ($serialized_email === "") {
                continue;
            }
            $email = unserialize($serialized_email);
            // unserialize gives false on bad data, which is not countable
            if (is_array($email) && count($email) == 4) {
                crawlLog("Sending email to {$email[2]} about {$email[0]}");
                $this->mail_server->sendImmediate(
                    $email[0], $email[1], $email[2], $email[3]);
            }
        }
    }
    /**
     * Wrapper around glob() that always returns an array, since glob()
     * returns false on error and false can be neither counted nor iterated
     *
     * @param string $pattern glob pattern to match
     * @param int $flags flags to pass on to glob()
     * @return array matching paths, empty if there are none or on error
     */
    private function listPaths($pattern, $flags = 0)
    {
        $paths = glob($pattern, $flags);
        return is_array($paths) ? $paths : array();
    }
    /**
     * Reads the info file of a video directory
     *
     * @param string $video_path directory containing the info file
     * @return mixed array of the file's lines (callers use indices 1, 2
     *     and 3), or false if the file is missing, unreadable or has fewer
     *     than four lines
     */
    private function readFileInfo($video_path)
    {
        $info_file = $video_path . self::FILE_INFO;
        if (!file_exists($info_file)) {
            return false;
        }
        $lines = file($info_file);
        if (!is_array($lines) || count($lines) < 4) {
            return false;
        }
        return $lines;
    }
}
/*
 * Instantiate and run the MediaUpdater program
 */
$media_updater = new MediaUpdater();
$media_updater->start();