Last commit for models/parallel_model.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage model
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2015
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
 * Loads common constants for web crawling, used for index_data_base_name and
 * schedule_data_base_name
 */
require_once BASE_DIR."/lib/crawl_constants.php";
/**
 * Crawl data is stored in an IndexArchiveBundle, which are managed by the
 * IndexManager so load the definition of this class
 */
require_once BASE_DIR."/lib/index_manager.php";
/** lookupSummaryOffsets uses word iterator*/
require_once BASE_DIR.'/lib/index_bundle_iterators/word_iterator.php';
/** For crawlHash function */
require_once BASE_DIR."/lib/utility.php";
/**
 * Used for keeping track of timing statistics
 */
require_once BASE_DIR.'/lib/analytics_manager.php';
/**
 * Needed for getHost
 */
require_once BASE_DIR.'/lib/url_parser.php';
/**
 * Needed to be able to send data via http to remote queue_servers
 */
require_once BASE_DIR.'/lib/fetch_url.php';
/**
 * Base class of models that need access to data from multiple queue servers
 * Subclasses include @see CrawlModel and @see PhraseModel.
 *
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage model
 */
class ParallelModel extends Model implements CrawlConstants
{
    /**
     * Stores the name of the current index archive to use to get search
     * results from
     * @var string
     */
    var $index_name;
    /**
     * If known the id of the queue_server this belongs to
     * @var int
     */
    var $current_machine;
    /**
     * the minimum length of a description before we stop appending
     * additional link doc summaries
     */
    const MIN_DESCRIPTION_LENGTH = 100;
    /**
     * {@inheritDoc}
     *
     * @param string $db_name the name of the database for the search engine
     * @param bool $connect whether to connect to the database by default
     *     after making the datasource class
     */
    function __construct($db_name = DB_NAME, $connect = true)
    {
        parent::__construct($db_name, $connect);
        $this->current_machine = 0;//if known, controller will set later
    }
    /**
     * Get a summary of a document by the generation it is in
     * and its offset into the corresponding WebArchive.
     *
     * @param string $url of summary we are trying to look-up
     * @param array $machine_urls an array of urls of yioop queue servers
     * @param string $index_name timestamp of the index to do the lookup in
     * @return array summary data of the matching document
     */
    function getCrawlItem($url, $machine_urls = NULL, $index_name = "")
    {
        $hash_url = crawlHash($url, true);
        if($index_name == "") {
            $index_name = $this->index_name;
        }
        $results = $this->getCrawlItems(
            array($hash_url =>array($url, $index_name)), $machine_urls);
        if(isset($results[$hash_url])) {
            return $results[$hash_url];
        }
        return $results;
    }
    /**
     * Gets summaries for a set of document by their url, or by group of
     * 5-tuples of the form (machine, key, index, generation, offset).
     *
     * @param string $lookups things whose summaries we are trying to look up
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return array of summary data for the matching documents
     */
    function getCrawlItems($lookups, $machine_urls = NULL)
    {
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $summaries = $this->networkGetCrawlItems($lookups, $machine_urls);
        } else {
            $summaries = $this->nonNetworkGetCrawlItems($lookups);
        }
        return $summaries;
    }
    /**
     * In a multiple queue server setting, gets summaries for a set of document
     * by their url, or by group of 5-tuples of the form
     * (machine, key, index, generation, offset). This makes an execMachines
     * call to make a network request to the CrawlController's on each machine
     * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
     * on each machine. The results are then sent back to networkGetCrawlItems
     * and aggregated.
     *
     * @param string $lookups things whose summaries we are trying to look up
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return array of summary data for the matching documents
     */
    function networkGetCrawlItems($lookups, $machine_urls)
    {
        //Set-up network request
        $machines = array();
        $indexes = array();
        $num_machines = count($machine_urls);
        foreach($lookups as $lookup => $lookup_info) {
            if(count($lookup_info) == 2 && ($lookup_info[0][0] === 'h'
                || $lookup_info[0][0] === 'r'
                || $lookup_info[0][0] === 'g')) {
                $machines = $machine_urls;
                break;
            } else {
                foreach($lookup_info as $lookup_item) {
                    $out_lookup_info = array();
                    if(count($lookup_item) == 5) {
                        list($index, , , , ) = $lookup_item;
                        $machines[$index] = $machine_urls[$index];
                    } else {
                        $machines = $machine_urls;
                        break;
                    }
                }
            }
        }
        //Make request
        $page_set = $this->execMachines("getCrawlItems",
            $machines, serialize($lookups), $num_machines);
        //Aggregate results
        $summaries = array();
        $elapsed_times = array();
        if(is_array($page_set)) {
            foreach($page_set as $elt) {
                $description_hash = array();
                $result = @unserialize(webdecode($elt[self::PAGE]));
                if(!is_array($result)) {
                    $elapsed_times[] = 0;
                    continue;
                }
                $elapsed_times[] = $result["ELAPSED_TIME"];
                unset($result["ELAPSED_TIME"]);
                $ellipsis = "";
                foreach($result as $lookup => $summary) {
                    if(isset($summaries[$lookup])) {
                        if(isset($summary[self::DESCRIPTION])) {
                            $description = trim($summary[self::DESCRIPTION]);
                            if(!isset($summaries[$lookup][self::DESCRIPTION])){
                                $summaries[$lookup][self::DESCRIPTION] = "";
                            }
                            if(!isset($description_hash[$description])){
                                $summaries[$lookup][self::DESCRIPTION] =
                                    $ellipsis . $description;
                                $ellipsis = " .. ";
                                $description_hash[$description] = true;
                            }
                        }
                        foreach($summary as $attr => $value){
                            if($attr !=self::DESCRIPTION &&
                                !isset($summaries[$lookup][$attr])) {
                                $summaries[$lookup][$attr] = $value;
                            }
                        }
                    } else {
                        $summaries[$lookup] =  $summary;
                    }
                }
            }
            $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
            if($summary_times_string) {
                $all_elapsed_times = unserialize($summary_times_string);
            } else {
                $all_elapsed_times = array();
            }
            $all_elapsed_times[] = $elapsed_times;
            AnalyticsManager::set("SUMMARY_TIMES", serialize(
                $all_elapsed_times));
        }
        return $summaries;
    }
    /**
     * Gets summaries on a particular machine for a set of document by
     * their url, or by group of 5-tuples of the form
     * (machine, key, index, generation, offset)
     * This may be used in either the single queue_server setting or
     * it may be called indirectly by a particular machine's
     * CrawlController as part of fufilling a network-based getCrawlItems
     * request. $lookups contains items which are to be grouped (as came
     * from same url or site with the same cache). So this function aggregates
     * their descriptions.
     *
     * @param string $lookups things whose summaries we are trying to look up
     * @return array of summary data for the matching documents
     */
    function nonNetworkGetCrawlItems($lookups)
    {
        $summary_offset = NULL;
        $generation = NULL;
        $summaries = array();
        $db = $this->db;
        foreach($lookups as $lookup => $lookup_info) {
            $scheme = (isset($lookup_info[0]) && is_string($lookup_info[0])) ?
                substr($lookup_info[0], 0, 3) : "";
            if(count($lookup_info) == 2 && ($scheme == 'htt' ||$scheme == 'gop'
                || $scheme == 'rec')) {
                list($url, $index_name) = $lookup_info;
                $index_archive = IndexManager::getIndex($index_name);
                $offset_gen_arr =
                    $this->lookupSummaryOffsetGeneration($url, $index_name);
                if($offset_gen_arr !== false){
                    list($summary_offset, $generation) = $offset_gen_arr;
                } else {
                    return false;
                }
                $summary =
                    $index_archive->getPage($summary_offset, $generation);
            } else {
                $summary = array();
                $ellipsis = "";
                $description_hash = array();
                $sql = "SELECT * FROM FEED_ITEM WHERE GUID=?";
                foreach($lookup_info as $lookup_item) {
                    if(count($lookup_item) == 2) {
                        list($word_key, $index_name) = $lookup_item;
                        $offset_info =
                            $this->lookupSummaryOffsetGeneration(
                                $word_key, $index_name, true);
                        if(is_array($offset_info)) {
                            list($summary_offset, $generation) = $offset_info;
                        } else {
                            continue;
                        }
                    } else {
                        list($machine, $key, $index_name, $generation,
                            $summary_offset) = $lookup_item;
                    }
                    if(strcmp($index_name, "feed") != 0) {
                        $index = IndexManager::getIndex($index_name);
                        $index->setCurrentShard($generation, true);
                        if(is_integer($summary_offset) &&
                            is_integer($generation)) {
                            $page = @$index->getPage($summary_offset);
                        } else {
                            $page = NULL;
                        }
                    } else {
                        $guid = base64Hash(substr($key,
                            IndexShard::DOC_KEY_LEN,
                            IndexShard::DOC_KEY_LEN));
                        $result = $db->execute($sql, array($guid));
                        $page = false;
                        if($result) {
                            $row = $db->fetchArray($result);
                            if($row) {
                                $page[self::TITLE] = $row["TITLE"];
                                $page[self::DESCRIPTION] = $row["DESCRIPTION"];
                                $page[self::URL] = $row["LINK"];
                                $page[self::SOURCE_NAME] = $row["SOURCE_NAME"];
                                $page[self::IMAGE_LINK] = $row["IMAGE_LINK"];
                            }
                        }
                    }
                    if(!$page || $page == array()) {continue;}
                    $copy = false;
                    if($summary == array()) {
                        if(isset($page[self::DESCRIPTION])) {
                            $description = trim($page[self::DESCRIPTION]);
                            $page[self::DESCRIPTION] = $description;
                            $description_hash[$description] = true;
                        }
                        $ellipsis = " .. ";
                        $summary = $page;
                    } else if (isset($page[self::DESCRIPTION])) {
                        $description = trim($page[self::DESCRIPTION]);
                        if(!isset($summary[self::DESCRIPTION])) {
                            $summary[
                                self::DESCRIPTION] = "";
                        }
                        if(!isset($description_hash[$description])){
                            $summary[self::DESCRIPTION] .=
                                $ellipsis . $description;
                            $ellipsis = " .. ";
                            $description_hash[$description] = true;
                        }
                        $copy = true;
                    } else {
                        $copy = true;
                    }
                    if(strlen($summary[self::DESCRIPTION]) >
                        self::MIN_DESCRIPTION_LENGTH) {
                        break;
                    }
                    if($copy) {
                        foreach($page as $attr => $value){
                            if($attr !=self::DESCRIPTION &&
                                !isset($summary[$attr])) {
                                $summary[$attr] = $value;
                            }
                        }
                    }
                }
            }
            if($summary != array()) {
                $summaries[$lookup] = $summary;
            }
        }
        return $summaries;
    }
    /**
     * Determines the offset into the summaries WebArchiveBundle and generation
     * of the provided url (or hash_url) so that the info:url
     * (info:base64_hash_url) summary can be retrieved. This assumes of course
     * that the info:url  meta word has been stored.
     *
     * @param string $url_or_key either info:base64_hash_url or just a url to
     *     lookup
     * @param string $index_name index into which to do the lookup
     * @param bool $is_key whether the string is info:base64_hash_url or just a
     *     url
     * @return array (offset, generation) into the web archive bundle
     */
    function lookupSummaryOffsetGeneration($url_or_key, $index_name = "",
        $is_key = false)
    {
        if($index_name == "") {
            $index_name = $this->index_name;
        }
        $index_archive = IndexManager::getIndex($index_name);
        if(!$index_archive) {
            return false;
        }
        $num_retrieved = 0;
        $pages = array();
        $summary_offset = NULL;
        if(!isset($index_archive->generation_info['ACTIVE'])) {
            return false;
        }
        $mask = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
        $num_generations = $index_archive->generation_info['ACTIVE'];
        $hash_key = ($is_key) ? crawlHashWord($url_or_key, true, $mask) :
            crawlHashWord("info:$url_or_key", true, $mask);
        $info = IndexManager::getWordInfo($index_name, $hash_key, 0, $mask, 1);
        if(!isset($info[0][4])) {
            return false;
        }
        $word_iterator = new WordIterator($info[0][4], $index_name, true);
        if(is_array($next_docs = $word_iterator->nextDocsWithWord())) {
             foreach($next_docs as $doc_key => $doc_info) {
                 $summary_offset =
                    $doc_info[CrawlConstants::SUMMARY_OFFSET];
                 $generation = $doc_info[CrawlConstants::GENERATION];
                 $index_archive->setCurrentShard($generation, true);
                 $page = @$index_archive->getPage($summary_offset);
                 $num_retrieved++;
                 if($num_retrieved >=  1) {
                     break;
                 }
             }
             if($num_retrieved == 0) {
                return false;
             }
        } else {
            return false;
        }
        return array($summary_offset, $generation);
    }
    /**
     * A save point is used to store to disk a sequence generation-doc-offset
     * pairs of a particular mix query when doing an archive crawl of a crawl
     * mix. This is used so that the mix can remember where it was the next
     * time it is invoked by the web app on the machine in question.
     * This function deletes such a save point associated with a timestamp
     *
     * @param int $save_timestamp timestamp of save point to delete
     * @param array $machine_urls  machines on which to try to delete savepoint
     */
    function clearQuerySavePoint($save_timestamp, $machine_urls = NULL)
    {
        /*
           It's important to quit early in the case that the timestamp is
           empty, as this could result in deleting all SavePoint* files below.
        */
        if (!$save_timestamp) return;

        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $this->execMachines("clearQuerySavePoint", $machine_urls,
                $save_timestamp);
            return;
        }

        /*
           SavePoint files have a $qpart tagged on to the timestamp to
           distinguish between parts of a query, so we want to delete anything
           that starts with the appropriate timestamp.
        */
        $save_stub = CRAWL_DIR.'/schedules/'.self::save_point.$save_timestamp;
        foreach (glob($save_stub.'*.txt') as $save_file) {
            @unlink($save_file);
        }
    }
    /**
     * This method is invoked by other ParallelModel (@see CrawlModel
     * for examples) methods when they want to have their method performed
     * on an array of other  Yioop instances. The results returned can then
     * be aggregated.  The invocation sequence is
     * crawlModelMethodA invokes execMachine with a list of
     * urls of other Yioop instances. execMachine makes REST requests of
     * those instances of the given command and optional arguments
     * This request would be handled by a CrawlController which in turn
     * calls crawlModelMethodA on the given Yioop instance, serializes the
     * result and gives it back to execMachine and then back to the originally
     * calling function.
     *
     * @param string $command the ParallelModel method to invoke on the remote
     *     Yioop instances
     * @param array $machine_urls machines to invoke this command on
     * @param string $arg additional arguments to be passed to the remote
     *      machine
     * @param int $num_machines the integer to be used in calculating partition
     * @return array a list of outputs from each machine that was called.
     */
    function execMachines($command, $machine_urls, $arg = NULL,
        $num_machines = 0)
    {
        if($num_machines == 0) {
            $num_machines = count($machine_urls);
        }
        $time = time();
        $session = md5($time . AUTH_KEY);
        $query = "c=crawl&a=$command&time=$time&session=$session" .
            "&num=$num_machines";
        if($arg != NULL) {
            $arg = webencode($arg);
            $query .= "&arg=$arg";
        }
        $sites = array();
        $post_data = array();
        $i = 0;
        foreach($machine_urls as $index => $machine_url) {
            $sites[$i][CrawlConstants::URL] = $machine_url;
            $post_data[$i] = $query."&i=$index";
            $i++;
        }
        $outputs = array();
        if(count($sites) > 0) {
            $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
                self::PAGE, true, $post_data);
        }
        return $outputs;
    }
}
?>

ViewGit