Last commit for lib/index_bundle_iterators/word_iterator.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]
First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2014  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage iterator
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2014
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
 *Loads base class for iterating
 */
require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php';
/**
 * Used to iterate through the documents associated with a word in
 * an IndexArchiveBundle. It also makes it easy to get the summaries
 * of these documents.
 *
 * A description of how words and the documents containing them are stored
 * is given in the documentation of IndexArchiveBundle.
 *
 * @author Chris Pollett
 * @package seek_quarry
 * @subpackage iterator
 * @see IndexArchiveBundle
 */
class WordIterator extends IndexBundleIterator
{
    /**
     * hash of word that the iterator iterates over
     * @var string
     */
    var $word_key;
    /**
     * The timestamp of the index associated with this iterator
     * @var string
     */
    var $index_name;
    /**
     * The next byte offset in the IndexShard
     * @var int
     */
    var $next_offset;
    /**
     * An array of shard generation and posting list offsets, lengths, and
     * numbers of documents
     * @var array
     */
    var $dictionary_info;
    /**
     * File name (including path) of the feed shard for news items
     * @var string
     */
    var $feed_shard_name;
    /**
     * Structure used to hold posting list start and stops for the query
     * in the feed shard
     * @var array
     */
    var $feed_info;
    /**
     * The total number of shards that have data for this word
     * @var int
     */
    var $num_generations;
    /**
     * Index into dictionary_info corresponding to the current shard
     * @var int
     */
    var $generation_pointer;
    /**
     * Numeric number of current shard
     * @var int
     */
    var $current_generation;
    /**
     * The current byte offset in the IndexShard
     * @var int
     */
    var $current_offset;
    /**
     * Starting offset of word occurrence in the IndexShard
     * @var int
     */
    var $start_offset;
    /**
     * Last offset of word occurrence in the IndexShard
     * @var int
     */
    var $last_offset;
    /**
     * Keeps track of whether the word_iterator list is empty because the
     * word does not appear in the index shard
     * @var bool
     */
    var $empty;
    /**
     * An array of hashes of domains to filter from results, or NULL if
     * no filtering is to be done
     * @var array
     */
    var $filter;
    /**
     * The current value of the doc_offset of current posting if known
     * @var int
     */
    var $current_doc_offset;
    /** Host Key position + 1 (first char says doc, inlink or eternal link)*/
    const HOST_KEY_POS = 17;
    /** Length of a doc key*/
    const KEY_LEN = 8;
    /** If the $limit_news constructor input is true then limit the number
     *  of items coming from the feed shard to this count.
     */
    const LIMIT_NEWS_COUNT = 25;
    /**
     * Creates a word iterator with the given parameters.
     *
     * @param string $word_key hash of word or phrase to iterate docs of
     * @param string $index_name time_stamp of the index to use
     * @param bool $raw if false, $word_key is taken to be in our modified
     *      base64 encoding and is decoded with unbase64Hash() before use;
     *      if true, $word_key is used as is
     * @param array &$filter an array of hashes of domains to filter from
     *      results
     * @param int $results_per_block the number of results requested from
     *      a shard each time a block of postings is read
     * @param bool $limit_news if true, at most LIMIT_NEWS_COUNT items are
     *      taken from the feed shard
     * @param string $mask byte mask to apply against word id, default is for
     *      exact match
     */
    function __construct($word_key, $index_name, $raw = false, &$filter = NULL,
        $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK,
        $limit_news = false,
        $mask = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF")
    {
        if($raw == false) {
            //get rid of our modified base64 encoding
            $word_key = unbase64Hash($word_key);
        }
        if($filter != NULL) {
            $this->filter = & $filter;
        } else {
            $this->filter = NULL;
        }
        $this->word_key = $word_key;
        $this->index_name =  $index_name;
        $dictionary_info = IndexManager::getWordInfo($index_name, $word_key,
            0, $mask);
        if(!is_array($dictionary_info)) {
            /* A failed look up must be normalised here, before ksort() and
               array_values() below are applied to it; those functions do
               not accept a non-array argument. An empty array makes the
               iterator report itself as empty.
             */
            $dictionary_info = array();
        }
        $this->dictionary_info = $dictionary_info;
        $this->feed_shard_name = WORK_DIRECTORY."/feeds/index";
        //NO_FEEDS defined true in statistic_controller.php
        $this->use_feeds = (!defined('NO_FEEDS') || !NO_FEEDS)
            && file_exists($this->feed_shard_name);
        // the entry with key -1, if present, describes the feed shard
        if($this->use_feeds && isset($this->dictionary_info[-1])) {
            $this->feed_info = $this->dictionary_info[-1];
            unset($this->dictionary_info[-1]);
            $this->feed_empty = false;
        } else {
            $this->feed_info = false;
            $this->feed_empty = true;
        }
        if(is_array($this->feed_info)) {
            list(,$this->feed_start, $this->feed_end, $this->feed_count,) =
                $this->feed_info;
            $this->feed_info = array($this->feed_start, $this->feed_end,
                $this->feed_count);
        } else {
            $this->feed_start = 0;
            $this->feed_end = 0;
            $this->feed_count = 0;
        }
        $this->using_feeds = ($this->feed_count > 0);
        if($limit_news && $this->feed_count > self::LIMIT_NEWS_COUNT) {
            $this->feed_count = self::LIMIT_NEWS_COUNT;
            $this->feed_end = $this->feed_start +
                IndexShard::POSTING_LEN * (self::LIMIT_NEWS_COUNT - 1);
        }
        $this->num_docs = $this->feed_count;
        // order the remaining entries by generation and re-index from 0
        ksort($this->dictionary_info);
        $this->dictionary_info = array_values($this->dictionary_info);
        $this->num_generations = count($this->dictionary_info);
        $this->empty = ($this->num_generations == 0);
        foreach ($this->dictionary_info as $info) {
            if(isset($info[3])) {
                $this->num_docs += $info[3];
            }
        }
        $this->current_doc_offset = NULL;
        $this->results_per_block = $results_per_block;
        $this->current_block_fresh = false;
        /* reset() copes with an empty iterator, so it is always called to
           leave the offset and generation fields in a defined state
         */
        $this->reset();
    }
    /**
     * Computes a relevancy score for a posting offset with respect to this
     * iterator and generation
     *
     * While the iterator is reading from the feed shard, the posting is
     * looked up in the feed index and its relevance is multiplied by 10;
     * otherwise the posting is looked up in the shard of the given
     * generation of this iterator's index.
     *
     * @param int $generation the generation the posting offset is for
     * @param int $posting_offset an offset into word_docs to compute the
     *      relevance of
     * @return float a relevancy score based on BM25F; 0 if the feed index
     *      is needed but could not be obtained
     */
    function computeRelevance($generation, $posting_offset)
    {
        $is_feed = $this->using_feeds && $this->use_feeds;
        if($is_feed) {
            $shard = IndexManager::getIndex("feed");
            if(!$shard) {
                // same guard as findDocsWithWord(): no feed index, no score
                return 0;
            }
            /* Elsewhere in this class the object returned for "feed" is
               used directly as a shard. Only unwrap it if it really is a
               bundle offering getCurrentShard().
             */
            if(method_exists($shard, "getCurrentShard")) {
                $shard = $shard->getCurrentShard();
            }
            $first_offset = $this->feed_start;
            $final_offset = $this->feed_end;
        } else {
            $index = IndexManager::getIndex($this->index_name);
            $index->setCurrentShard($generation, true);
            $shard = $index->getCurrentShard();
            $first_offset = $this->start_offset;
            $final_offset = $this->last_offset;
        }
        $num_docs_or_links = IndexShard::numDocsOrLinks($first_offset,
            $final_offset);
        // the shard is queried with the offset shifted right by two bits
        $current = $posting_offset >> 2;
        $posting_start = NULL;
        $posting_end = NULL;
        $posting = $shard->getPostingAtOffset($current, $posting_start,
            $posting_end);
        list( , $item) = $shard->makeItem($posting, $num_docs_or_links, 1);
        if($is_feed) {
            $item[self::RELEVANCE] *= 10;
        }
        return $item[self::RELEVANCE];
    }
    /**
     * Puts the iterator back at the start of what it can iterate over:
     * the beginning of the feed postings when feeds are available,
     * otherwise the beginning of the first entry of dictionary_info
     */
    function reset()
    {
        $this->using_feeds = ($this->feed_count > 0);
        if($this->empty) {
            // we shouldn't be called when empty - but to be safe
            $this->start_offset = 0;
            $this->last_offset = -1;
            $this->num_generations = -1;
        } else {
            list($first_generation, $first_start, $first_last, )
                = $this->dictionary_info[0];
            $this->current_generation = $first_generation;
            $this->start_offset = $first_start;
            $this->last_offset = $first_last;
        }
        $feeds_available = !$this->feed_empty && $this->use_feeds;
        if($feeds_available) {
            // generation -1 is used to denote the feed shard
            $this->current_offset = $this->feed_start;
            $this->current_generation = -1;
        } else {
            $this->current_offset = $this->start_offset;
        }
        $this->current_doc_offset = NULL;
        $this->seen_docs = 0;
        $this->count_block = 0;
        $this->generation_pointer = 0;
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached
     *
     * Postings are read either from the feed shard (while it is in use) or
     * from the shard of the current generation. Postings whose host key is
     * in the filter, or whose key has fewer than three parts, are dropped.
     * The surviving block is also stored in $this->pages and its size in
     * $this->count_block.
     *
     * @return mixed array of doc key => posting data if there are docs
     *      left; -1 if the iterator is exhausted; NULL if on the last
     *      generation and the block came back empty
     */
    function findDocsWithWord()
    {
        $no_feeds = $this->feed_empty || !$this->use_feeds;
        $feed_in_use = $this->using_feeds && !$no_feeds;
        if($this->empty && $no_feeds) {
            return -1;
        }
        if(!$feed_in_use &&(($this->generation_pointer>=$this->num_generations)
            || ($this->generation_pointer == $this->num_generations - 1 &&
            $this->current_offset > $this->last_offset))) {
            return -1;
        }
        $pre_results = array();
        if($feed_in_use) {
            $this->next_offset = $this->current_offset;
            $feed_shard = IndexManager::getIndex("feed");
            if($feed_shard) {
                //the next call also updates next offset
                $pre_results = $feed_shard->getPostingsSlice(
                    $this->feed_start,
                    $this->next_offset, $this->feed_end,
                    $this->results_per_block);
                $time = time();
                foreach($pre_results as $keys => $pre_result) {
                    $pre_results[$keys][self::IS_FEED] = true;
                    /* DOC_RANK of a feed item falls off with the
                       difference between now and its SUMMARY_OFFSET value
                     */
                    $delta = $time - $pre_result[self::SUMMARY_OFFSET];
                    $pre_results[$keys][self::DOC_RANK] = 720000 /
                        max($delta, 1);
                }
            }
        } else if(!$this->empty) {
            $this->next_offset = $this->current_offset;
            $index = IndexManager::getIndex($this->index_name);
            $index->setCurrentShard($this->current_generation, true);
            //the next call also updates next offset
            $shard = $index->getCurrentShard();
            $pre_results = $shard->getPostingsSlice(
                $this->start_offset,
                $this->next_offset, $this->last_offset,
                $this->results_per_block);
        }
        $results = array();
        $doc_key_len = IndexShard::DOC_KEY_LEN;
        $filter = ($this->filter == NULL) ? array() : $this->filter;
        foreach($pre_results as $keys => $data) {
            $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
            /* strict comparison: host keys are raw byte strings and must
               match exactly, not by PHP's loose type juggling rules
             */
            if(in_array($host_key, $filter, true)) {
                continue;
            }
            $data[self::KEY] = $keys;
            // inlinks is the domain of the inlink
            $key_parts = str_split($keys, $doc_key_len);
            if(isset($key_parts[2])) {
                list($hash_url, $data[self::HASH], $data[self::INLINKS]) =
                    $key_parts;
            } else {
                continue;
            }
            if(isset($data[self::IS_FEED]) && $data[self::IS_FEED]) {
                $data[self::CRAWL_TIME] = "feed";
            } else {
                $data[self::CRAWL_TIME] = $this->index_name;
            }
            $results[$keys] = $data;
        }
        $this->count_block = count($results);
        if($this->generation_pointer == $this->num_generations - 1 &&
            $results == array()) {
            $results = NULL;
        }
        $this->pages = $results;
        return $results;
    }
    /**
     * Updates the seen_docs count during an advance() call
     *
     * If the current block is fresh, its size (count_block) is added to
     * seen_docs. Otherwise the size of the block is estimated from
     * next_offset and the end of the list being read, next_offset is moved
     * that many postings past current_offset, and, provided the estimate
     * is not negative, the estimate is added to seen_docs.
     */
    function advanceSeenDocs()
    {
        if($this->current_block_fresh == true) {
            $num_docs = $this->count_block;
        } else {
            $reading_feeds = $this->using_feeds && $this->use_feeds;
            $end_offset = ($reading_feeds) ? $this->feed_end :
                $this->last_offset;
            $num_docs = min($this->results_per_block,
                IndexShard::numDocsOrLinks($this->next_offset, $end_offset));
            // next_offset is moved even when the estimate is negative
            $this->next_offset = $this->current_offset +
                IndexShard::POSTING_LEN * $num_docs;
            if($num_docs < 0) {
                return;
            }
        }
        $this->current_block_fresh = false;
        $this->seen_docs += $num_docs;
    }
    /**
     * Forwards the iterator one group of docs
     *
     * Without an argument the iterator moves past the current block,
     * switching to the next generation when the current posting list has
     * been used up. With an argument, the iterator is only moved if its
     * current position compares as less than $gen_doc_offset, and it is
     * then moved on to a posting satisfying the constraint described below.
     *
     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
     *      the next block must be of greater than or equal generation, and
     *      if equal the next block must all have $doc_offsets larger than
     *      or equal to this value
     */
    function advance($gen_doc_offset = NULL)
    {
        if($gen_doc_offset != NULL) { //only advance if $gen_doc_offset bigger
            $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord();
            // -1 means there is nothing left to iterate over
            // NOTE(review): genDocOffsetCmp is not defined in this file;
            // presumably >= 0 means we are already at or past the target
            if($cur_gen_doc_offset == -1 ||
                $this->genDocOffsetCmp($cur_gen_doc_offset,
                $gen_doc_offset) >= 0) {
                return;
            }
        }
        // updates seen_docs and, for a non-fresh block, next_offset
        $this->advanceSeenDocs();
        $this->current_doc_offset = NULL;
        if($this->current_offset < $this->next_offset) {
            $this->current_offset = $this->next_offset;
        } else {
            // no progress possible within this posting list
            $this->advanceGeneration();
            $this->next_offset = $this->current_offset;
        }
        $using_feeds = $this->using_feeds && $this->use_feeds;
        // ran past the end of the list being read: go to next generation
        if(($using_feeds &&
            $this->current_offset > $this->feed_end) || (!$using_feeds &&
            $this->current_offset > $this->last_offset)) {
            $this->advanceGeneration();
            $this->next_offset = $this->current_offset;
        }
        if($gen_doc_offset !== NULL) {
            if($this->current_generation < $gen_doc_offset[0]) {
                $this->advanceGeneration($gen_doc_offset[0]);
                $this->next_offset = $this->current_offset;
            }
            // recomputed as advanceGeneration() may have left the feed shard
            $using_feeds = $this->using_feeds && $this->use_feeds;
            if($using_feeds) {
                $shard = IndexManager::getIndex("feed");
                $last = $this->feed_end;
            } else {
                $index = IndexManager::getIndex($this->index_name);
                $index->setCurrentShard($this->current_generation, true);
                $shard = $index->getCurrentShard();
                $last = $this->last_offset;
            }

            if($this->current_generation == $gen_doc_offset[0]) {
                // look within this generation for the requested doc offset
                $offset_pair =
                    $shard->nextPostingOffsetDocOffset($this->next_offset,
                            $last, $gen_doc_offset[1]);
                if($offset_pair === false) {
                    // nothing suitable in this generation
                    $this->advanceGeneration();
                    $this->next_offset = $this->current_offset;
                } else {
                   list($this->current_offset,
                        $this->current_doc_offset) = $offset_pair;
                }
            }
            // recompute seen_docs from how far into the current list we are
            if($this->current_generation == -1) {
                $this->seen_docs =
                    ($this->current_offset - $this->feed_start)/
                        IndexShard::POSTING_LEN;
            } else {
                // $using_feeds here is the value computed before the
                // nextPostingOffsetDocOffset() look up above
                $this->seen_docs = ($using_feeds) ? $this->feed_count : 0;
                $this->seen_docs +=
                    ($this->current_offset - $this->start_offset)/
                        IndexShard::POSTING_LEN;
            }
        }
    }
    /**
     * Switches which index shard is being used to return occurrences of
     * the word to the next shard containing the word
     *
     * If the feed shard was being read, it is abandoned and the search for
     * the next shard starts from the beginning of dictionary_info.
     *
     * @param int $generation generation to advance to: stepping continues
     *      until a shard whose generation is not less than this value is
     *      reached or dictionary_info is used up. If NULL, the generation
     *      current at the time of the call is used.
     */
    function advanceGeneration($generation = NULL)
    {
        if($this->using_feeds && $this->use_feeds) {
            $this->using_feeds = false;
            // so that the increment below can land on entry 0
            $this->generation_pointer = -1;
        }
        $target = ($generation === NULL) ? $this->current_generation :
            $generation;
        while(true) {
            if($this->generation_pointer < $this->num_generations) {
                $this->generation_pointer++;
            }
            if(!($this->generation_pointer < $this->num_generations)) {
                // ran out of shards; offsets are left as they were
                break;
            }
            list($shard_generation, $shard_start, $shard_last, )
                = $this->dictionary_info[$this->generation_pointer];
            $this->current_generation = $shard_generation;
            $this->start_offset = $shard_start;
            $this->last_offset = $shard_last;
            $this->current_offset = $this->start_offset;
            if(!($this->current_generation < $target)) {
                break;
            }
        }
    }
    /**
     * Gets the doc_offset and generation for the next document that
     * would be returned by this iterator. The doc_offset found is
     * remembered in current_doc_offset so later calls can reuse it.
     *
     * @return mixed an array (generation, doc_offset), where generation
     *      is -1 when reading from the feed shard; -1 on fail
     */
    function currentGenDocOffsetWithWord()
    {
        if($this->current_doc_offset !== NULL) {
            return array($this->current_generation, $this->current_doc_offset);
        }
        $reading_feeds = $this->using_feeds && $this->use_feeds &&
            !$this->feed_empty;
        if($reading_feeds) {
            if($this->current_offset > $this->feed_end) {
                return -1;
            }
            $feed_index = IndexManager::getIndex("feed");
            $this->current_doc_offset =
                $feed_index->docOffsetFromPostingOffset($this->current_offset);
            return array(-1, $this->current_doc_offset);
        }
        if($this->current_offset > $this->last_offset ||
            $this->generation_pointer >= $this->num_generations) {
            return -1;
        }
        $bundle = IndexManager::getIndex($this->index_name);
        $bundle->setCurrentShard($this->current_generation, true);
        $shard = $bundle->getCurrentShard();
        $this->current_doc_offset =
            $shard->docOffsetFromPostingOffset($this->current_offset);
        return array($this->current_generation, $this->current_doc_offset);
    }
}
?>
ViewGit