Last commit for lib/index_bundle_iterators/network_iterator.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01]
First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2014  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage iterator
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2014
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
 * Loads the base class for iterating
 */
require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php';
/**
 * Needed to be able to get pages from remote queue_servers
 */
require_once BASE_DIR.'/lib/fetch_url.php';
/**
 * To record network timing statistics
 */
require_once BASE_DIR.'/lib/analytics_manager.php';
/**
 * This iterator is used to handle querying a network of queue_servers
 * with regard to a query
 *
 * @author Chris Pollett
 * @package seek_quarry
 * @subpackage iterator
 */
class NetworkIterator extends IndexBundleIterator
{
    /**
     * Part of query without limit and num to be processed by all queue_server
     * machines
     *
     * @var string
     */
    var $base_query;
    /**
     * Current limit number to be added to base query
     *
     * @var int
     */
    var $limit;
    /**
     * An array of urls of queue_servers to ask a query to
     *
     * @var array
     */
    var $queue_servers;
    /**
     * Flags for each queue_server saying if there are more results for that
     * server or not. Indexed in parallel with $queue_servers.
     * (This was previously declared as $more_results, but all code in this
     * class reads and writes $more_flags, so the declaration now matches.)
     *
     * @var array
     */
    var $more_flags;
    /**
     * An array of hashes of domains to filter out of search results,
     * or NULL if no filtering should be done
     *
     * @var mixed
     */
    var $filter;
    /**
     * used to adaptively change the number of pages requested from each
     * machine based on the number of machines that still have results
     * @var int
     */
    var $next_results_per_block;
    /**
     * Whether a queue_server response reported this as a "hard" (expensive)
     * query. Set in findDocsWithWord() from the HARD_QUERY field of a
     * server response.
     * @var bool
     */
    var $hard_query;
    /**
     * the minimum number of pages to group from a block;
     */
    const MIN_FIND_RESULTS_PER_BLOCK = 200;
    /** Host Key position + 1 (first char says doc, inlink or eternal link)*/
    const HOST_KEY_POS = 17;
    /** Length of a doc key*/
    const KEY_LEN = 8;
    /**
     * Creates a network iterator with the given parameters.
     *
     * @param string $query the query that was supplied by the end user
     *      that we are trying to get search results for
     * @param array $queue_servers urls of yioop instances on which documents
     *      indexes live
     * @param string $timestamp the timestamp of the particular current index
     *      archive bundles that we look in for results
     * @param array $filter an array of hashes of domains to filter from
     *      results
     * @param string $save_timestamp_name if this timestamp is nonzero, then
     *      when making queries to separate machines the save_timestamp is sent
     *      so the queries on those machine can make savepoints. Note the
     *      format of save_timestamp is timestamp-query_part where query_part
     *      is the number of the item in a query presentation (usually 0).
     * @param bool $limit_news if true the number of media:news items to
     *      allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
     */
    function __construct($query, $queue_servers, $timestamp, &$filter = NULL,
        $save_timestamp_name = "", $limit_news = true)
    {
        // MIN_FIND_RESULTS_PER_BLOCK is already an integer; ceil() kept no-op
        $this->results_per_block = ceil(self::MIN_FIND_RESULTS_PER_BLOCK);
        $this->next_results_per_block = $this->results_per_block;
        $this->hard_query = false;
        // network=false tells the remote server to answer itself rather than
        // fan the query out again; raw=1 asks for unformatted results
        $this->base_query = "q=".urlencode($query).
            "&f=serial&network=false&raw=1&its=$timestamp&guess=false";
        if(!$limit_news) {
            $this->base_query .= "&s=news";
        }
        if($save_timestamp_name!="") { // used for archive crawls of crawl mixes
            $this->base_query .= "&save_timestamp=$save_timestamp_name";
        }
        $this->queue_servers = $queue_servers;
        $this->limit = 0;
        $count = count($this->queue_servers);
        for($i = 0; $i < $count; $i++) {
            $this->more_flags[$i] = true;
        }
        if($filter != NULL) {
            $this->filter = & $filter;
        } else {
            $this->filter = NULL;
        }
    }
    /**
     * Computes a relevancy score for a posting offset with respect to this
     * iterator and generation As this is not easily determined
     * for a network iterator, this method always returns 1.0 for this
     * iterator
     *
     * @param int $generation the generation the posting offset is for
     * @param int $posting_offset an offset into word_docs to compute the
     *      relevance of
     * @return float a relevancy score based on BM25F -- always 1.0.
     */
    function computeRelevance($generation, $posting_offset)
    {
        return 1.0;
    }
    /**
     * Returns the iterators to the first document block that it could iterate
     * over: resets limit, block sizing, hard query flag, and marks every
     * queue_server as potentially having more results.
     */
    function reset()
     {
        $this->limit = 0;
        $this->next_results_per_block = $this->results_per_block;
        $count = count($this->queue_servers);
        $this->hard_query = false;
        for($i = 0; $i < $count; $i++) {
            $this->more_flags[$i] = true;
        }
     }
    /**
     * Forwards the iterator one group of docs. For a network iterator this
     * just bumps the limit; the actual fetch happens lazily in
     * findDocsWithWord().
     *
     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
     *      the must be of greater than or equal generation, and if equal the
     *      next block must all have $doc_offsets larger than or equal to
     *      this value. (Ignored by this iterator.)
     */
    function advance($gen_doc_offset = NULL)
     {
        $this->current_block_fresh = false;
        $this->limit += $this->results_per_block;
        // apply the adaptively-computed block size for the next request
        $this->results_per_block = $this->next_results_per_block;
     }
    /**
     * Gets the doc_offset and generation for the next document that
     * would be return by this iterator. As this is not easily determined
     * for a network iterator, this method always returns -1 for this
     * iterator
     *
     * @return mixed an array with the desired document offset
     *  and generation; -1 on fail
     */
    function currentGenDocOffsetWithWord()
    {
        return -1;
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached. Sends the base query (with current num and
     * limit) to each queue_server that still has results, merges the
     * unserialized pages keyed by doc key, records network/machine timing
     * statistics via AnalyticsManager, and applies the domain filter.
     *
     * @return mixed doc ids and score if there are docs left, -1 otherwise
     */
     function findDocsWithWord()
     {
        $query = $this->base_query .
            "&num={$this->results_per_block}&limit={$this->limit}";
        $sites = array();
        $lookup = array();
        $i = 0;
        $j = 0;
        // only contact servers whose previous response still had a full block
        foreach($this->queue_servers as $server) {
            if($this->more_flags[$i]) {
                $sites[$j][CrawlConstants::URL] = $server ."?". $query.
                    "&machine=$i";
                $lookup[$j] = $i;
                $j++;
            }
            $i++;
        }
        $net_times = AnalyticsManager::get("NET_TIMES");
        $net_times = ($net_times) ? $net_times : 0;
        $download_time = microtime();
        $downloads = array();
        if(count($sites) > 0) {
            $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
                self::PAGE, true);
        }
        $net_times += changeInMicrotime($download_time);
        AnalyticsManager::set("NET_TIMES", $net_times);
        $results = array();
        $count = count($downloads);
        $this->num_docs = 0;
        $in4 = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
        $machine_times = AnalyticsManager::get("MACHINE_TIMES");
        $indent = ($machine_times) ? "<br />$in4" : $in4;
        $machine_times = ($machine_times) ? $machine_times: "";
        $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
        $max_machine_times = ($max_machine_times) ? $max_machine_times : 0;
        $max_time = 0;
        $num_with_results = $count;
        for($j = 0; $j < $count; $j++) {
            $download = & $downloads[$j];
            if(isset($download[self::PAGE])) {
                // responses come from trusted queue_servers; still, the data
                // crossed the network, hence the @ to suppress parse warnings
                $pre_result = @unserialize($download[self::PAGE]);
                // a short block means this server is out of results
                if(!isset($pre_result["TOTAL_ROWS"]) ||
                    $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
                    $this->more_flags[$lookup[$j]] = false;
                    $num_with_results--;
                }
                if(isset($pre_result["TOTAL_ROWS"])) {
                    $this->num_docs += $pre_result["TOTAL_ROWS"];
                }
                if(isset($pre_result["PAGES"])) {
                    foreach($pre_result["PAGES"] as $page_data) {
                        if(isset($page_data[self::KEY])) {
                            $results[$page_data[self::KEY]] =
                                $page_data;
                            /* NOTE(review): this records the download index
                               $j, not the machine index $lookup[$j]; the two
                               differ once any server runs out of results --
                               confirm downstream users of MACHINE_ID */
                            $results[$page_data[self::KEY]][self::MACHINE_ID] =
                                $j;
                        }
                    }
                }
                $max_time = max($max_time, $pre_result['ELAPSED_TIME']);
                $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
                $machine_times .= $indent . $lookup_link .
                    $pre_result['ELAPSED_TIME']."&nbsp;&nbsp;";
                $indent = "";
            }
        }
        /* NOTE(review): $pre_result here is whatever the last iteration left,
           so only the final responding server's HARD_QUERY flag is honored */
        if(isset($pre_result["HARD_QUERY"])) {
            $this->hard_query  = $pre_result["HARD_QUERY"];
        }
        // grow per-machine request size so total results stay roughly constant
        // as servers drop out
        if($num_with_results > 0) {
            $this->next_results_per_block = ceil(
                floatval($count * $this->results_per_block)/
                floatval($num_with_results));
        }
        $max_machine_times += $max_time;
        AnalyticsManager::set("MACHINE_TIMES", $machine_times);
        AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
        if($results == array()) {
            $results = -1;
        }
        if($results != -1) {
            // drop results whose host key hash appears in the filter list
            if($this->filter != NULL) {
                foreach($results as $keys => $data) {
                    $host_key =
                        substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                    if(in_array($host_key, $this->filter) ) {
                        unset($results[$keys]);
                    }
                }
            }
        }
        $this->count_block = count($results);
        $this->pages = $results;
        return $results;
     }
    /**
     * Called to make a link for AnalyticsManager about a network query
     * performed by this iterator.
     *
     * @param array $sites used by this network iterator
     * @param int $index which site in array to make link for
     * @return string html of link
     */
    function makeLookupLink($sites, $index)
    {
        if(isset($sites[$index][self::URL])) {
            $url = $sites[$index][self::URL];
            $title = $url;
        } else {
            // no URL recorded for this site; fall back to a javascript alert
            // that dumps whatever info we do have
            if(!isset($sites[$index])) {
                $sites[$index] = array();
            }
            $tmp = urlencode(print_r($sites[$index],
                true));
            $title = 'URL not set';
            if(trim($tmp) == "") {
                $tmp = 'Site null';
            }
            $url = 'javascript:alert("'.$tmp.'")';
        }
        $link = "<a target='_blank' class='gray-link' href='$url'".
             " title='$title' >ID_$index</a>:";
        return $link;
    }
    /**
     * Gets the summaries associated with the keys provided the keys
     * can be found in the current block of docs returned by this iterator.
     * For a network iterator the current block is always returned whole;
     * $keys is not used to subset it.
     *
     * @param array $keys keys to try to find in the current block of returned
     *      results
     * @return array doc summaries that match provided keys (or -1 if the
     *      underlying fetch found nothing)
     */
    function getCurrentDocsForKeys($keys = NULL)
    {
        if($this->current_block_fresh == false) {
            $pages = $this->currentDocsWithWord();
            if(!is_array($pages)) {
                return $pages;
            }
        } else {
            $pages = & $this->pages;
        }
        return $pages;
    }
}
 ?>
ViewGit