Last commit for src/library/index_bundle_iterators/WordIterator.php: 88ba842636f692ac9bde972fed5a3cf6959d841b

Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

Chris Pollett [2024-02-04 02:Feb:th]

Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\library\index_bundle_iterators;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\IndexShard;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\PartitionDocumentBundle;
use seekquarry\yioop\models\ParallelModel;

/**
 * Used to iterate through the documents associated with a word in
 * an IndexArchiveBundle. It also makes it easy to get the summaries
 * of these documents.
 *
 * A description of how words and the documents containing them are stored
 * is given in the documentation of IndexArchiveBundle.
 *
 * @author Chris Pollett
 * @see IndexArchiveBundle
 */
class WordIterator extends IndexBundleIterator
{
    /**
     * Host Key position + 1 (first char says doc, inlink or external link)
     */
    const HOST_KEY_POS = 17;
    /**
     *  Length of a doc key part
     */
    const KEY_LEN = 8;
    /**
     * Word key above in our modified base 64 encoding
     * @var string
     */
    public $base64_word_key;
    /**
     * The current value of the doc_offset of current posting if known
     * @var int
     */
    public $current_doc_offset;
    /**
     * Numeric number of current shard
     * @var int
     */
    public $current_generation;
    /**
     * The current byte offset in the IndexShard (if older index)
     * @var int
     */
    public $current_offset;
    /**
     * An array of shard generation and posting list offsets, lengths, and
     * numbers of documents
     * @var array
     */
    public $dictionary_info;
    /**
     * Keeps track of whether the word_iterator list is empty because the
     * word does not appear in the index shard
     * @var int
     */
    public $empty;
    /**
     * Model responsible for keeping track of edited and deleted search results
     * @var SearchfiltersModel
     */
    public $filter;
    /**
     * Index into dictionary_info corresponding to the current shard
     * @var int
     */
    public $generation_pointer;
    /**
     * The timestamp of the index associated with this iterator
     * @var string
     */
    public $index_name;
    /**
     * Whether word key corresponds to a meta word
     * @var string
     */
    public $is_meta;
    /**
     * Last Offset of word occurrence in the IndexShard
     * @var int
     */
    public $last_offset;
    /**
     * The next byte offset in the IndexShard
     * @var int
     */
    public $next_offset;
    /**
     * The total number of shards that have data for this word
     * @var int
     */
    public $num_generations;
    /**
     * MAX_ITEMS_PER_PARTITION value returned by IndexManager::getWordInfo()
     * for this iterator's term and index; when that key is absent
     * PartitionDocumentBundle::MAX_ITEMS_PER_FILE is used instead. Passed
     * on to computeDocRank() when scoring postings.
     * @var int
     */
    public $max_items_per_partition;
    /**
     * AVG_ITEMS_PER_PARTITION value returned by IndexManager::getWordInfo()
     * for this iterator's term and index; when that key is absent
     * PartitionDocumentBundle::MAX_ITEMS_PER_FILE is used instead. Passed
     * on to computeDocRank() when scoring postings.
     * @var int
     */
    public $avg_items_per_partition;
    /**
     * TOTAL_NUMBER_OF_PARTITIONS value returned by
     * IndexManager::getWordInfo() (0 if absent). Used when scoring postings
     * to work out how many partitions have been seen in the current
     * direction of iteration.
     * @var int
     */
    public $total_number_of_partitions;
    /**
     * TOTAL_OCCURRENCES value returned by IndexManager::getWordInfo()
     * (0 if absent). Divided by the total number of docs to estimate the
     * number of occurrences of the term per document when computing
     * relevance.
     * @var int
     */
    public $num_occurrences;
    /**
     * THESHOLD_EXCEEDED entry returned by IndexManager::getWordInfo()
     * (false if absent).
     * NOTE(review): presumably a boolean flag -- confirm against
     * IndexManager::getWordInfo()
     * @var bool
     */
    public $threshold_exceeded;
    /**
     * ARCHIVE_FILE entry returned by IndexManager::getWordInfo()
     * (empty string if absent).
     * NOTE(review): presumably a file name/path -- confirm against
     * IndexManager::getWordInfo()
     * @var string
     */
    public $archive_file;
    /**
     * Set to true once termInfoIteratorFields() has filled in the term
     * statistics fields of this iterator, so that later calls to it return
     * immediately rather than looking the term up again
     * @var bool
     */
    public $term_info_computed;
    /**
     * TOTAL_NUM_LINKS_AND_DOCS value returned by
     * IndexManager::getWordInfo() (0 if absent)
     * @var int
     */
    public $total_num_docs_and_links;
    /**
     * How url, keywords, and title words should influence relevance
     * and doc rank calculations
     * @var array
     */
    public $ranking_factors;
    /**
     * First shard generation that word info was obtained for
     * @var int
     */
    public $start_generation;
    /**
     * Starting Offset of word occurrence in the IndexShard
     * @var int
     */
    public $start_offset;
    /**
     * Whether the iterator iterates forward or backward through documents in
     * bundle
     * @var int
     */
    public $direction;
    /**
     * hash of word or phrase that the iterator iterates over
     * @var string
     */
    public $word_key;
    /**
     * Whether the latest version of each document should be searched for
     * @var boolean
     */
    public $retrieve_latest;
    /**
     * Creates a word iterator with the given parameters.
     *
     * @param string $word_key hash of word or phrase to iterate docs of
     * @param string $index_name timestamp of the index to use
     * @param bool $raw whether the $word_key is our variant of base64 encoded
     * @param SearchfiltersModel $filter Model responsible for keeping track
     *      of edited and deleted search results
     * @param int $results_per_block the maximum number of results that can
     *      be returned by a findDocsWithWord call
     * @param int $direction when results are access from $index_name in
     *      which order they should be presented. self::ASCENDING is from first
     *      added to last added, self::DESCENDING is from last added to first
     *      added. Note: this value is not saved permanently. So you
     *      could in theory open two read only versions of the same bundle but
     *      reading the results in different directions
     * @param array $ranking_factors field say how url, keywords, and
     *      title words should influence relevance and doc rank calculations
     * @param boolean $retrieve_latest whether the latest indexed instance of a
     *      document should be returned or not (might have multiple instances
     *      if crawl indexes document more than once)
     *      (@see PhraseModel::lookupSummaryOffsetGeneration())
     */
    public function __construct($word_key, $index_name, $raw = false,
        $filter = null, $results_per_block =
        IndexBundleIterator::RESULTS_PER_BLOCK, $direction = self::ASCENDING,
        $ranking_factors = [], $retrieve_latest = true)
    {
        /*
           Unless the caller says the key is already raw, strip off our
           modified base64 encoding before storing it
         */
        $word_key = ($raw == false) ? L\unbase64Hash($word_key) : $word_key;
        $this->index_name = $index_name;
        $this->word_key = $word_key;
        $this->base64_word_key = L\base64Hash($word_key);
        $this->is_meta = L\PhraseParser::checkMetaTerm($word_key);
        $this->filter = $filter;
        $this->direction = $direction;
        // look up dictionary statistics for the term; this also sets $empty
        $this->termInfoIteratorFields($index_name, $word_key);
        $this->retrieve_latest = $retrieve_latest;
        $this->results_per_block = $results_per_block;
        $this->current_block_fresh = false;
        $this->current_doc_offset = null;
        if ($direction == self::ASCENDING) {
            $this->start_generation = 0;
        } else {
            $this->start_generation = "ACTIVE";
        }
        /*
           Any ranking factor the caller did not supply falls back to the
           value of the corresponding configuration constant
         */
        $factor_defaults = [
            "CLD_URL_BONUS" => C\CLD_URL_BONUS,
            "HOST_URL_BONUS" => C\HOST_URL_BONUS,
            "HOST_KEYWORD_BONUS" => C\HOST_KEYWORD_BONUS,
            "PATH_KEYWORD_BONUS" => C\PATH_KEYWORD_BONUS,
            "TITLE_BONUS" => C\TITLE_BONUS,
            "WIKI_BONUS" => C\WIKI_BONUS,
            "NUM_SLASHES_BONUS" => C\NUM_SLASHES_BONUS,
        ];
        foreach ($factor_defaults as $name => $fallback) {
            $this->ranking_factors[$name] = $ranking_factors[$name] ??
                $fallback;
        }
        // only position the iterator if the term has postings
        if (!$this->empty) {
            $this->reset();
        }
    }
    /**
     * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING
     * depending on the direction in which this iterator traverses the
     * underlying index archive bundle.
     *
     * @return int direction traversing underlying archive bundle
     */
    public function getDirection()
    {
        // this is the value of the $direction argument given to __construct
        $traversal_direction = $this->direction;
        return $traversal_direction;
    }
    /**
     * Resets the iterator to the first document block that it could iterate
     * over
     */
    public function reset()
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        if ($this->empty) {
            // we shouldn't be called when empty - but to be safe
            $this->last_offset = -1;
            $this->num_generations = -1;
        } else {
            $this->termInfoIteratorFields($this->index_name,
                $this->word_key);
            /* the first dictionary row to visit is the first row when
               ascending and the last row when descending
             */
            $row = ($is_ascending) ? 0 : $this->num_generations - 1;
            $info = $this->dictionary_info[$row];
            $this->current_generation = $info['PARTITION'];
            $this->last_offset = $info['NUM_DOCS'] - 1;
        }
        $this->start_offset = 0;
        if ($is_ascending) {
            $this->generation_pointer = 0;
            $this->current_offset = $this->start_offset;
        } else {
            /*  when iterating in reverse, the last generation is the first
                one we want, and we begin from its last posting
             */
            $this->generation_pointer = $this->num_generations - 1;
            $this->current_offset = $this->last_offset;
        }
        $this->current_doc_offset = null;
        $this->seen_docs = 0;
        $this->count_block = 0;
    }
    /**
     * Used to compute fields such as $this->total_num_docs for this iterator on
     * term $word_key for index $index_name
     *
     * @param string $index_name name of index to compute statistics with
     *      respect to
     * @param string $word_key term to compute statistics with respect to
     */
    protected function termInfoIteratorFields($index_name, $word_key)
    {
        // the lookup is only ever done once per iterator
        if (!empty($this->term_info_computed)) {
            return;
        }
        $word_info = IndexManager::getWordInfo($index_name, $word_key, -1, -1,
            C\NUM_DISTINCT_GENERATIONS, true);
        /*
           iterator field => [key in $word_info, value to use if key absent]
           NOTE(review): 'THESHOLD_EXCEEDED' looks misspelled; it is kept
           as is because it has to match whatever key
           IndexManager::getWordInfo() actually produces -- confirm there.
         */
        $field_sources = [
            'total_num_docs' => ['TOTAL_NUM_DOCS', 0],
            'total_num_docs_and_links' => ['TOTAL_NUM_LINKS_AND_DOCS', 0],
            'total_number_of_partitions' => ['TOTAL_NUMBER_OF_PARTITIONS', 0],
            'num_docs' => ['TOTAL_COUNT', 0],
            'num_occurrences' => ['TOTAL_OCCURRENCES', 0],
            'dictionary_info' => ['ROWS', []],
            'threshold_exceeded' => ['THESHOLD_EXCEEDED', false],
            'archive_file' => ['ARCHIVE_FILE', ""],
        ];
        foreach ($field_sources as $field => list($info_key, $fallback)) {
            $this->$field = $word_info[$info_key] ?? $fallback;
        }
        // partition size statistics default to the max items a file can hold
        $this->max_items_per_partition =
            $word_info['MAX_ITEMS_PER_PARTITION'] ??
            PartitionDocumentBundle::MAX_ITEMS_PER_FILE;
        $this->avg_items_per_partition =
            $word_info['AVG_ITEMS_PER_PARTITION'] ??
            PartitionDocumentBundle::MAX_ITEMS_PER_FILE;
        // one dictionary row per generation; no rows means an empty iterator
        $has_rows = !empty($this->dictionary_info);
        $this->num_generations = ($has_rows) ?
            count($this->dictionary_info) : 0;
        $this->empty = ($this->num_generations == 0);
        $this->term_info_computed = true;
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached
     *
     * @return mixed doc ids and score if there are docs left, -1 otherwise
     */
    public function findDocsWithWord()
    {
        if ($this->empty) {
            return -1;
        }
        // work out if we have already gone past the last posting to visit
        if ($this->direction == self::ASCENDING) {
            $exhausted =
                ($this->generation_pointer >= $this->num_generations) ||
                ($this->generation_pointer == $this->num_generations - 1 &&
                $this->current_offset > $this->last_offset);
        } else {
            $exhausted = ($this->generation_pointer < 0) ||
                ($this->generation_pointer == 0 &&
                $this->current_offset < $this->start_offset);
        }
        if ($exhausted) {
            return -1;
        }
        $slice = $this->getPostingsSliceResults();
        $results = [];
        foreach ($slice as $doc_keys => $item) {
            $host_key = substr($doc_keys, self::HOST_KEY_POS, self::KEY_LEN);
            if (!empty($this->filter) &&
                $this->filter->isFiltered($host_key)) {
                continue;
            }
            /* a usable key has at least three KEY_LEN sized parts, the
               second is stored as the HASH and the third as INLINKS
               (for an inlink this is the domain of the inlink)
             */
            $parts = str_split($doc_keys, self::KEY_LEN);
            if (!isset($parts[2])) {
                continue;
            }
            $item[self::KEY] = $doc_keys;
            $item[self::HASH] = $parts[1];
            $item[self::INLINKS] = $parts[2];
            $item[self::CRAWL_TIME] = $this->index_name;
            $results[$doc_keys] = $item;
        }
        $this->count_block = count($results);
        /*
           NOTE(review): this end-of-iteration test compares against the
           last generation regardless of direction, whereas when descending
           the final generation visited is generation_pointer == 0 --
           confirm this is intended.
         */
        if ($this->generation_pointer == $this->num_generations - 1 &&
            empty($slice)) {
            $results = -1;
        }
        $this->pages = $results;
        return $results;
    }
    /**
     * Given the current_offset, result_per_block, and index used get the
     * result_per_block postings starting from current_offset in the current
     * direction (ascending or descending) for the term word iterator
     * iterates over from the index.
     */
    public function getPostingsSliceResults()
    {
        /*
           Returns the array produced by getDocKeyPositionsScoringInfo() for
           at most results_per_block postings of the current generation,
           beginning at current_offset and moving in the iterator's
           direction. As a side effect next_offset is set to the offset at
           which the following slice would begin. An empty array is returned
           if there are no postings left in the generation.
         */
        $this->next_offset = $this->current_offset;
        if ($this->direction == self::ASCENDING) {
            // clamp offsets that are before the first posting
            if ($this->current_offset < $this->start_offset) {
                $this->current_offset = $this->start_offset;
                $this->next_offset = $this->current_offset;
            }
            if ($this->next_offset > $this->last_offset) {
                return [];
            }
            $start_slice = $this->next_offset;
            $num_slice = min($this->results_per_block,
                $this->last_offset - $this->next_offset + 1);
            $this->next_offset += $num_slice;
        } else {
            // clamp offsets that are after the last posting
            if ($this->current_offset > $this->last_offset) {
                $this->current_offset = $this->last_offset;
                $this->next_offset = $this->current_offset;
            }
            if ($this->next_offset < $this->start_offset) {
                return [];
            }
            /*
               Take no more postings than remain between start_offset and
               the current position. Without this cap $start_slice below
               becomes negative when fewer than results_per_block postings
               remain, and array_slice() treats a negative offset as
               counting from the end of the array, so the earliest postings
               of the generation would be skipped.
             */
            $num_slice = min($this->results_per_block,
                $this->next_offset - $this->start_offset + 1);
            $this->next_offset -= $num_slice;
            $start_slice = $this->next_offset + 1;
        }
        $postings = $this->getGenerationPostings($this->generation_pointer);
        $postings = array_slice($postings, $start_slice, $num_slice);
        $key_postings = $this->getDocKeyPositionsScoringInfo($postings,
            $this->current_generation);
        return $key_postings;
    }
    /**
     * Get the positions file specs for the given base index folder.
     *
     * @param string $base_folder of index folder
     * @return array [file handle, file size]
     */
    public function getPositionsFile($base_folder)
    {
        // values returned when the folder has no positions file
        $positions_fh = false;
        $positions_size = 0;
        $positions_path = $base_folder . "/" .
            IndexDocumentBundle::POSITIONS_FILENAME;
        if (file_exists($positions_path)) {
            /* opened for reading only; the handle is not closed here, it is
               handed back to the caller
             */
            $positions_fh = fopen($positions_path, "r");
            $positions_size = filesize($positions_path);
        }
        return [$positions_fh, $positions_size];
    }
    /**
     * Get the positions entry associated with a particular posting.
     *
     * @param array $posting of positions entry
     * @param int $positions_file_size of positions file
     * @param resource $positions_fh of positions file
     * @return array of positions
     */
    public function getPositionsList($posting,
        $positions_file_size, $positions_fh)
    {
        $positions_len = $posting['POSITIONS_LEN'];
        /* nothing can be read if the posting has no position data, if its
           length is not smaller than the positions file, or if there is
           no open positions file
         */
        if ($positions_len <= 0 || $positions_len >= $positions_file_size ||
            empty($positions_fh)) {
            return [];
        }
        // fseek gives back -1 if it could not move to the requested offset
        if (fseek($positions_fh, $posting['POSITIONS_OFFSET']) < 0) {
            return [];
        }
        $encoded_positions = fread($positions_fh, $posting['POSITIONS_LEN']);
        return L\decodePositionList($encoded_positions,
            $posting['FREQUENCY']);
    }
    /**
     * Add to a set of postings from a partition scoring information, position
     * list information and info about the relative weights of given position
     * based on the position list file and doc_map file.
     *
     *  @param array $postings posting data to add scoring information to
     *  @param int $partition which partition from the PartitionDocumentBundle
     *    postings are related to
     */
    public function getDocKeyPositionsScoringInfo($postings, $partition)
    {
        /*
           Returns an array doc_key => posting in which each posting of
           $postings has been augmented with its position list, document
           length, DOC_RANK, RELEVANCE, SCORE, description scores and user
           ranks. Postings whose doc map entry is too short, or cannot be
           unpacked, are left out. When retrieve_latest is set, a text
           posting for which a more recent version of the page exists is
           either replaced by the data of that version (if it still contains
           the term) or dropped (if it doesn't).
         */
        $key_postings = [];
        $index = IndexManager::getIndex($this->index_name);
        $base_folder = $index->getPartitionBaseFolder($partition);
        $doc_map_filename = $base_folder . "/" .
            IndexDocumentBundle::DOC_MAP_FILENAME;
        $doc_map_tools = $index->doc_map_tools;
        list($fh, $file_size) = $this->getPositionsFile($base_folder);
        $number_of_partitions = $this->total_number_of_partitions;
        $num_doc_keys = $doc_map_tools->countTableEntries($doc_map_filename);
        $is_ascending = ($this->direction == self::ASCENDING);
        $num_seen_partitions = ($is_ascending) ?
            $partition + 1 : $number_of_partitions - $partition;
        $occurrences_per_doc = $this->num_occurrences /
            max($this->total_num_docs, 1);
        $docid_len = IndexDocumentBundle::DOCID_LEN;
        $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN;
        foreach ($postings as $posting) {
            $posting[self::GENERATION] = $partition;
            $posting[self::POSITION_LIST] = $this->getPositionsList($posting,
                $file_size, $fh);
            $doc_map_index = $posting['DOC_MAP_INDEX'];
            $entry = $doc_map_tools->findEntryAtIndexTableName(
                $doc_map_filename, $doc_map_index);
            if (strlen($entry) < $docid_len) {
                continue;
            }
            $doc_key = substr($entry, 0, $docid_len);
            $is_text = IndexDocumentBundle::isType($doc_key, "text");
            /**
             * For backward compatibility: only check for the latest
             * crawled version of a page if $entry[24] == 't'
             * (the beginning character of the term bloom
             * filter string attached to doc_map entries).
             * An entry that consists of just the doc id has no such
             * character, so the lookup is guarded to avoid reading
             * an uninitialized string offset.
             */
            $has_filter_marker = (($entry[$docid_len] ?? '') == 't');
            $values = (strlen($entry) >= ($docid_len + $termsfilter_len + 1) &&
                $has_filter_marker) ?
                substr($entry, $docid_len + $termsfilter_len + 1) :
                substr($entry, $docid_len);
            if ($this->retrieve_latest && $has_filter_marker && $is_text) {
                $url_hash = substr($doc_key, 0, 8);
                $latest_version_info =
                    IndexManager::lookupLatestVersionPage($url_hash,
                    $this->index_name);
                if ($latest_version_info != null) {
                    list($latest_partition, $latest_posting) =
                        $latest_version_info;
                    /**
                     * Ensure that the discovered latest version
                     * isn't the same as the current posting.
                     */
                    if ($partition != $latest_partition ||
                        $latest_posting['DOC_MAP_INDEX'] !=
                        $doc_map_index) {
                        $latest_base_folder = $index->
                            getPartitionBaseFolder($latest_partition);
                        $latest_doc_map_filename = $latest_base_folder .
                            "/" . IndexDocumentBundle::DOC_MAP_FILENAME;
                        $latest_doc_map_index =
                            $latest_posting['DOC_MAP_INDEX'];
                        $latest_doc_map_entry =
                            $doc_map_tools->findEntryAtIndexTableName(
                                $latest_doc_map_filename,
                                $latest_doc_map_index);
                        if (strlen($latest_doc_map_entry) < $docid_len) {
                            continue;
                        }
                        $latest_doc_key = substr($latest_doc_map_entry, 0,
                            $docid_len);
                        $terms_filter = substr($latest_doc_map_entry,
                            $docid_len + 1, $termsfilter_len);
                        if (!IndexDocumentBundle::checkTermExists(
                                $this->word_key, $terms_filter)) {
                            continue;
                        } else {
                            /**
                             * The current term id exists in the most recent
                             * version of the document; replace the current
                             * posting entries with the latest entry.
                             */
                            $posting[self::GENERATION] = $latest_partition;
                            $posting['DOC_MAP_INDEX'] =
                                $latest_doc_map_index;
                            $doc_key = $latest_doc_key;
                            $values = substr($latest_doc_map_entry,
                                $docid_len + $termsfilter_len + 1);
                            $latest_term_postings = $this->
                                getGenerationPostings($latest_partition);
                            /*
                               array_filter() keeps the keys of the postings
                               it lets through, so re-index the result to
                               have the matching posting (if any) at 0
                             */
                            $target_posting = array_values(
                                array_filter($latest_term_postings,
                                    function ($p) use ($latest_doc_map_index)
                                    {
                                        return $p['DOC_MAP_INDEX'] ==
                                            $latest_doc_map_index;
                                    }));
                            if (count($target_posting) > 0) {
                                $posting['POSITIONS_LEN'] =
                                    $target_posting[0]['POSITIONS_LEN'];
                                $posting['POSITIONS_OFFSET'] =
                                    $target_posting[0]['POSITIONS_OFFSET'];
                                $posting['FREQUENCY'] =
                                    $target_posting[0]['FREQUENCY'];
                                /*
                                   The offset and length just copied refer
                                   to the positions file of the latest
                                   partition, so that is the file which has
                                   to be read ($latest_base_folder), not the
                                   one of the partition being iterated over
                                 */
                                list($latest_positions_fh,
                                    $latest_positions_file_size) = $this->
                                    getPositionsFile($latest_base_folder);
                                $posting[self::POSITION_LIST] =
                                    $this->getPositionsList($posting,
                                        $latest_positions_file_size,
                                        $latest_positions_fh);
                                // don't leak a file handle per posting
                                if (!empty($latest_positions_fh)) {
                                    fclose($latest_positions_fh);
                                }
                            }
                        }
                    }
                }
            }
            if (IndexDocumentBundle::isType($doc_key, "doc")) {
                $posting[self::IS_DOC] = true;
            }
            $doc_info = $doc_map_tools->unpack($values);
            if (empty($doc_info)) {
                continue;
            }
            $time = time();
            $posting[self::KEY] = $doc_key;
            list($posting[self::DOC_LEN], $original_score) =
                array_values(array_shift($doc_info));
            /* a stored score that lies between half the current time and
               the current time is treated as a timestamp
             */
            $is_timestamp_score = ($original_score <= $time &&
                $original_score > ($time >> 1));
            /*
               DOC_RANK is a document quality measure either based on the
               time the item was added (freshness) or on a sum of signals
               (how early or late it was added to index), whether the url
               was a CLD or HOST, whether page was a wiki page, and the
               number of slashes in the url path
             */
            if ($is_timestamp_score) {
                $posting[self::DOC_RANK] = $time /
                    (max(1, $time - $original_score)) *
                    $this->getMaxDocQualityScore();
            } else {
                $posting[self::DOC_RANK] = $this->computeDocRank($doc_key,
                    $doc_map_index, $num_seen_partitions, $number_of_partitions,
                    $num_doc_keys, $this->avg_items_per_partition,
                    $this->max_items_per_partition,
                    $this->ranking_factors, $is_ascending);
            }
            list($preface_positions, $num_description_scores) =
                array_values(array_shift($doc_info));
            $num_description_scores = intval($num_description_scores);
            /* $preface_positions packs three one byte end positions, the
               lowest byte being the path keywords end, then title end,
               then host keywords end
             */
            $posting["PATH_KEYWORDS_END_POS"] = ($preface_positions & 255);
            $preface_positions = $preface_positions >> 8;
            $posting["TITLE_END_POS"] = ($preface_positions & 255);
            $preface_positions = $preface_positions >> 8;
            $posting["HOST_KEYWORDS_END_POS"] = ($preface_positions & 255);
            $posting[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0,
                $num_description_scores);
            if ($posting['FREQUENCY'] > 0) {
                list($bonuses, $frequency) =
                    $this->frequencyScoring(
                    $occurrences_per_doc,
                    $posting[self::POSITION_LIST],
                    $posting[self::DOC_LEN],
                    $posting["HOST_KEYWORDS_END_POS"],
                    $posting["TITLE_END_POS"],
                    $posting["PATH_KEYWORDS_END_POS"],
                    $posting[self::DESCRIPTION_SCORES]);
                // Divergence-from-randomness + preface score
                $nonzero_occurrences_per_doc = ($occurrences_per_doc > 0) ?
                    $occurrences_per_doc : 1;
                $posting[self::RELEVANCE] = $bonuses *
                    log(1 + 1/$nonzero_occurrences_per_doc, 2) /
                    ($bonuses + 1) +
                    ((log(1 + $occurrences_per_doc, 2) + $frequency *
                    log(1 + 1/$nonzero_occurrences_per_doc, 2)) /
                    ($frequency + 1));
            } else {
                 /*
                   this will typically be the relevance score for a meta
                   word. As these will always be frequency 1 and have no
                   position info, set close to 0. (Not zero to avoid div
                   by 0's)
                  */
                 $posting[self::RELEVANCE] = 0.01;
            }
            $posting[self::SCORE] = $posting[self::DOC_RANK] +
                $posting[self::RELEVANCE];
            $posting[self::USER_RANKS] = array_slice($doc_info,
                $num_description_scores);
            $key_postings[$doc_key] = $posting;
        }
        if (!empty($fh)) {
            fclose($fh);
        }
        return $key_postings;
    }
    /**
     * Computes weighted frequencies of a term within a document with respect to
     * the length of the document, the positions of the term with the document
     * and the overall importance score for a given position within the document
     * Also computes the score of the posting for the host keywords,
     * title keywords, and path keywords bonuses.
     *
     * @param float $occurrences_per_doc expected number of occurrence of term
     *  per/doc.
     * @param array $positions positions of this iterators term in the document
     * @param int $num_words number of terms in the document
     * @param int $host_keywords_end_pos term offset into the document summary
     *  that demarks the end of the host keywords portion of the summary
     * @param int $title_end_pos absolute term offset into the document summary
     *  that demarks the end of the title portion of the summary
     * @param int $path_keywords_end_pos absolute term offset into the document
     *  summary that demarks the end of the path keywords portion of the
     *  summary
     * @param array $descriptions_scores boundaries and scores of different
     *  regions with document
     * @return array [score for host title path keywords bonuses, frequency]
     */
    public function frequencyScoring(
        $occurrences_per_doc, $positions, $num_words, $host_keywords_end_pos,
        $title_end_pos, $path_keywords_end_pos, $descriptions_scores)
    {
        $num_words = max($num_words, 1);
        /*
         * Amati and van Rijsbergen suggest a normalization of
         * log_2(1 + l_avg/l_d) for divergence-from-randomness
         * Here l_avg = average num words in a document, l_d = num words
         * current document. C\MAX_DESCRIPTION_LEN is the max number
         * of characters in a document. Assuming the average word is
         * around 5 chars + whitespace char + punctuation, and most documents
         * are summarized to close to the max character length, we
         * approximate l_avg as C\MAX_DESCRIPTION_LEN/7 in the below.
         */
        $pseudo_doc_length = 7 * $num_words;
        $length_normalization = log(1 + C\MAX_DESCRIPTION_LEN /
            $pseudo_doc_length, 2);
        if (empty($descriptions_scores)) {
            /*
               Without region information no bonus can be worked out and
               each occurrence counts the same. The result still has to be a
               [bonuses, frequency] pair as callers unpack it with list();
               a bare number would silently unpack to two nulls.
             */
            return [0, count($positions) * $length_normalization];
        }
        $host_bonus = $this->ranking_factors["HOST_KEYWORD_BONUS"];
        $path_bonus = $this->ranking_factors["PATH_KEYWORD_BONUS"];
        $title_bonus = $this->ranking_factors["TITLE_BONUS"];
        $len_term = strlen($this->word_key);
        /* $first_index deliberately carries over from one position to the
           next: each binary search starts from where the last one ended
           NOTE(review): this assumes $positions is in ascending order --
           confirm against L\decodePositionList
         */
        $first_index = 0;
        /*
           Sum of description scores without bonus scores we add below
           is 1. So with the scores we add below is $max_doc_norm_score.
           The foreach loop that follows measures what fraction of this
           comes from $this->word_key occurrences, so will be a number
           less than $max_doc_norm_score;
         */
        $descriptions_scores = array_merge(
            [['POS' => - $path_keywords_end_pos - 1,
             'SCORE' => $host_bonus],
             ['POS' => $host_keywords_end_pos - $path_keywords_end_pos - 1,
             'SCORE' => $title_bonus],
             ['POS' => $title_end_pos - $path_keywords_end_pos - 1,
              'SCORE' => $path_bonus],
           ], $descriptions_scores);
        $num_scores = count($descriptions_scores);
        $weighted_frequency = 0;
        $bonuses = 0;
        foreach ($positions as $position) {
            $last_index = $num_scores - 1;
            /* description score offsets are with respect to the description
               only so we subtract from the term position the offset of the
               non-description
             */
            $position -= ($path_keywords_end_pos + 1);
            /* binary search for the last region whose start is not after
               $position; the cast keeps the array indices integers
             */
            while ($first_index < $last_index) {
                $mid_index = (int) ceil(($first_index + $last_index) / 2.0);
                if ($descriptions_scores[$mid_index]['POS'] > $position) {
                    $last_index = $mid_index - 1;
                } else {
                    $first_index = $mid_index;
                }
            }
            $weight = $descriptions_scores[$first_index]['SCORE'];
            $start_description_pos = $descriptions_scores[$first_index]['POS'];
            /* length of the region found: up to the start of the next
               region, or to the pseudo document length for the last one;
               never taken to be less than the term length or 1
             */
            $len_description = max(abs(($first_index == $num_scores - 1) ?
                $pseudo_doc_length - $start_description_pos :
                $descriptions_scores[$first_index + 1]['POS'] -
                $start_description_pos), $len_term, 1);
            $frequency_term = $weight * $len_term / $len_description;
            if ($position <= 0) {
                // occurrence is in the host/title/path preface
                $bonuses += $weight; //$frequency_term;
            } else {
                $weighted_frequency += $frequency_term;
            }
        }
        $frequency = $weighted_frequency * $length_normalization;
        return [$bonuses, $frequency];
    }
    /**
     * Adds the size of the block being stepped over to seen_docs as part
     * of an advance() call.
     *
     * If the current block is marked fresh, the stored count_block is used.
     * Otherwise the block size is recomputed from the offsets (capped at
     * results_per_block) and next_offset is placed that many postings away
     * from current_offset in the direction of iteration. When the recomputed
     * size is not positive, the method returns without touching seen_docs or
     * the fresh flag.
     */
    public function advanceSeenDocs()
    {
        if ($this->current_block_fresh == true) {
            $block_size = $this->count_block;
        } else {
            /* NOTE(review): the descending count adds 1 while the ascending
               one does not -- confirm this asymmetry is intended.
             */
            if ($this->direction == self::ASCENDING) {
                $postings_left = $this->last_offset - $this->next_offset;
                $step = 1;
            } else {
                $postings_left = $this->next_offset - $this->start_offset + 1;
                $step = -1;
            }
            $block_size = min($this->results_per_block, $postings_left);
            $this->next_offset = $this->current_offset + $step * $block_size;
            if ($block_size <= 0) {
                return;
            }
        }
        $this->current_block_fresh = false;
        $this->seen_docs += $block_size;
    }
    /**
     * Forwards the iterator. With no argument (or a value loosely equal to
     * null) this moves on by one group of docs via plainAdvance(). With a
     * [generation, doc_offset] pair it skips ahead to the first posting that
     * is not before that pair in the direction of iteration.
     *
     * @param array $gen_doc_offset a generation, doc_offset pair. If not null,
     *     (in the ascending search case opposite for descending), the pair
     *     must be of greater than or equal generation, and if equal the
     *     next block must all have $doc_offsets larger than or equal to
     *     this value.
     */
    public function advance($gen_doc_offset = null)
    {
        if ($gen_doc_offset == null) {
            $this->plainAdvance();
            return;
        }
        $is_ascending = ($this->direction == self::ASCENDING);
        $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord();
        // nothing to do if exhausted or already at/after the requested pair
        if ($cur_gen_doc_offset == -1 ||
            $this->genDocOffsetCmp($cur_gen_doc_offset,
            $gen_doc_offset, $this->direction) >= 0) {
            return;
        }
        $advance_check = ($is_ascending) ?
            ($this->current_generation < $gen_doc_offset[0]) :
            ($this->current_generation > $gen_doc_offset[0]);
        if ($advance_check) {
            $this->advanceGeneration($gen_doc_offset[0]);
            $this->next_offset = $this->current_offset;
            /* The doc offset cached by currentGenDocOffsetWithWord() above
               belongs to the generation we just left; drop it so it is
               never paired with the new generation (or returned after the
               iterator has run out of generations).
             */
            $this->current_doc_offset = null;
        }
        if ($this->current_generation == $gen_doc_offset[0]) {
            $offset_pair = $this->nextDocIndexOffsetPair(
                $gen_doc_offset[1]);
            if ($offset_pair === false) {
                /* no suitable posting left in this generation, so move
                   to the next one and again invalidate the cached offset
                 */
                $this->advanceGeneration();
                $this->next_offset = $this->current_offset;
                $this->current_doc_offset = null;
            } else {
                list($this->current_offset, $this->current_doc_offset) =
                    $offset_pair;
                $this->next_offset = $this->current_offset;
            }
        }
        // seen_docs is measured from the end of the generation we began at
        if ($is_ascending) {
            $this->seen_docs = ($this->current_offset - $this->start_offset);
        } else {
            $this->seen_docs = ($this->last_offset - $this->current_offset);
        }
        $this->current_block_fresh = false;
    }
    /**
     * Searches the postings of the dictionary entry generation_pointer
     * currently points at for the first posting, in the direction of
     * iteration, whose DOC_MAP_INDEX is not before $doc_offset (that is,
     * >= $doc_offset when ascending, <= $doc_offset when descending).
     *
     * The search starts at next_offset (current_offset if that is null),
     * first takes steps of doubling size until it passes the target or the
     * end of the postings, and then bisects the last interval stepped over.
     *
     * @param int $doc_offset value that DOC_MAP_INDEX of postings is
     *  compared against
     * @return mixed array [posting_slice_offset, DOC_MAP_INDEX of that
     *  posting]; false if the final posting in the direction of iteration
     *  is missing or is itself still before $doc_offset
     */
    public function nextDocIndexOffsetPair($doc_offset)
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        // the last posting that would be visited in the iteration direction
        $end_offset = ($is_ascending)? $this->last_offset : $this->start_offset;
        $postings = $this->getGenerationPostings($this->generation_pointer);
        if (empty($postings[$end_offset]) ) {
            return false;
        }
        $last_doc = $postings[$end_offset]["DOC_MAP_INDEX"];
        // even the final posting is before the target, so no match exists
        if (($is_ascending && $last_doc < $doc_offset) ||
           (!$is_ascending && $last_doc > $doc_offset)) {
              return false;
        }
        $next_offset = ($this->next_offset ?? $this->current_offset);
        $last_offset = $next_offset;
        /* a missing posting defaults to $doc_offset, which makes the
           comparison below false and so stops the stepping loop
         */
        $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"] ?? $doc_offset;
        $cmp = ($is_ascending) ?
            ($next_doc < $doc_offset && $next_offset <= $end_offset):
            ($next_doc > $doc_offset && $next_offset >= $end_offset);
        $delta = ($is_ascending) ? 1 : -1;
        /* step forward with doubling step size; $last_offset trails one
           step behind and always refers to a posting before the target
         */
        while ($cmp)  {
            $last_offset = $next_offset;
            $next_offset += $delta;
            $delta *= 2;
            $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"] ?? $doc_offset;
            $cmp = ($is_ascending) ?
                ($next_doc < $doc_offset && $next_offset <= $end_offset):
                ($next_doc > $doc_offset && $next_offset >= $end_offset);
        }
        // if the final step overshot the postings, clamp to the end posting
        if (($is_ascending && $next_offset > $end_offset) ||
            (!$is_ascending && $next_offset < $end_offset)) {
            $next_offset = $end_offset;
        }
        /* bisect between $last_offset (before the target) and $next_offset
           (not before the target) until the two are adjacent
         */
        while(abs($next_offset - $last_offset) > 1) {
            $mid_offset = ($next_offset + $last_offset) >> 1;
            $mid_doc = $postings[$mid_offset]["DOC_MAP_INDEX"];
            $cmp = ($is_ascending) ?
                ($mid_doc < $doc_offset) : ($mid_doc > $doc_offset);
            if ($cmp) {
                $last_offset = $mid_offset;
            } else {
                $next_offset = $mid_offset;
                $next_doc = $mid_doc;
            }
        }
        /* re-read the doc index, as $next_doc may be the default from a
           missing posting or predate the clamp above
         */
        if (abs($next_offset - $last_offset) == 1) {
            $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"];
        }
        return [$next_offset, $next_doc];
    }
    /**
     * Forwards the iterator one group of docs. This is what's called
     * by @see advance($gen_doc_offset) if $gen_doc_offset is null.
     *
     * After updating the seen docs count and clearing the cached doc
     * offset, the iterator moves to next_offset when that lies further
     * along in the direction of iteration and is still inside
     * [start_offset, last_offset]. In every other case it switches to the
     * next generation.
     */
    public function plainAdvance()
    {
        $ascending = ($this->direction == self::ASCENDING);
        $this->advanceSeenDocs();
        $this->current_doc_offset = null;
        $has_progress = $ascending ?
            ($this->current_offset < $this->next_offset) :
            ($this->current_offset > $this->next_offset);
        if ($has_progress) {
            $this->current_offset = $this->next_offset;
            $ran_off_end = $ascending ?
                ($this->current_offset > $this->last_offset) :
                ($this->current_offset < $this->start_offset);
            if (!$ran_off_end) {
                return;
            }
        }
        // no progress possible in this generation, or we stepped out of it
        $this->advanceGeneration();
        $this->next_offset = $this->current_offset;
    }
    /**
     * Moves generation_pointer to a later entry of dictionary_info in the
     * direction of iteration and loads that entry's partition number and
     * posting offsets into the iterator.
     *
     * The pointer is always moved at least one step (provided it was still
     * in range). Stepping then continues until the partition pointed at is
     * no longer before $generation, or the pointer leaves the range of
     * available entries. In the latter case current_generation and the
     * offsets keep the values of the last entry that was in range.
     *
     * @param int $generation partition number to advance to or past; if
     *      null the current generation is used, so the iterator moves on
     *      by one entry
     */
    public function advanceGeneration($generation = null)
    {
        $target_generation = $generation ?? $this->current_generation;
        $this->generation_pointer ??= 0; //if not set set to 0
        $ascending = ($this->direction == self::ASCENDING);
        $pointer_in_range = fn() => ($ascending) ?
            ($this->generation_pointer < $this->num_generations) :
            ($this->generation_pointer >= 0);
        do {
            if ($pointer_in_range()) {
                if ($ascending) {
                    $this->generation_pointer++;
                } else {
                    $this->generation_pointer--;
                }
            }
            $landed_in_range = $pointer_in_range();
            if ($landed_in_range) {
                $entry = $this->dictionary_info[$this->generation_pointer];
                $this->current_generation = $entry['PARTITION'];
                $this->start_offset = 0;
                $this->last_offset = ($entry['NUM_DOCS'] ?? 1) - 1;
                $this->current_offset = ($ascending) ?
                    $this->start_offset : $this->last_offset;
            }
            $before_target = ($ascending) ?
                ($this->current_generation < $target_generation) :
                ($this->current_generation > $target_generation);
        } while ($before_target && $landed_in_range);
    }
    /**
     * Retrieves all the postings for the word iterator's term that belong
     * to one entry of the iterator's dictionary_info. Decoded postings are
     * stored back into that entry under 'POSTINGS' so a later call can
     * return them without reading the index again.
     *
     * @param int $generation key into dictionary_info (callers in this
     *      class pass generation_pointer); the partition actually read is
     *      the 'PARTITION' field of that entry
     * @return array of posting items, empty if the entry does not exist or
     *      no postings could be read for it
     */
    public function getGenerationPostings($generation)
    {
        if (empty($this->dictionary_info[$generation])) {
            return [];
        }
        $generation_info = $this->dictionary_info[$generation];
        if (!empty($generation_info['POSTINGS']) &&
            is_array($generation_info['POSTINGS'])) {
            return $generation_info['POSTINGS']; //already loaded
        }
        $index = IndexManager::getIndex($this->index_name);
        $postings = [];
        /* isset() rather than empty() for the offset: empty() is also true
           for 0, which would make postings stored at byte offset 0 look
           as if they were missing. A length of 0 still means nothing to
           read.
         */
        if (isset($generation_info['POSTINGS_OFFSET']) &&
            !empty($generation_info['POSTINGS_LEN'])) {
            $postings_entry = $index->getPostingsString(
                $generation_info['PARTITION'],
                $generation_info['POSTINGS_OFFSET'],
                $generation_info['POSTINGS_LEN']);
            if (!empty($postings_entry)) {
                $postings_info = $index->unpackPostings($postings_entry);
                if (!empty($postings_info)) {
                    // only the first component of the unpacked data is used
                    list($postings,) = $postings_info;
                }
            }
        }
        $this->dictionary_info[$generation]['POSTINGS'] = $postings;
        return $postings;
    }
    /**
     * Gets the generation and doc_offset of the next document that would
     * be returned by this iterator. The doc offset is remembered in
     * current_doc_offset, so repeated calls return the remembered pair
     * until that field is reset to null.
     *
     * @return mixed an array [generation, doc_offset] (doc_offset is -1 if
     *      the current posting cannot be found); -1 if the iterator has
     *      moved outside its offsets or generations, or has no dictionary
     *      entry for the current generation pointer
     */
    public function currentGenDocOffsetWithWord()
    {
        if ($this->current_doc_offset !== null) {
            return [$this->current_generation, $this->current_doc_offset];
        }
        if ($this->direction == self::ASCENDING) {
            $out_of_range = ($this->current_offset > $this->last_offset ||
                $this->generation_pointer >= $this->num_generations);
        } else {
            $out_of_range = ($this->current_offset < $this->start_offset ||
                $this->generation_pointer < -1);
        }
        if ($out_of_range) {
            return -1;
        }
        $pointer = $this->generation_pointer;
        if (empty($this->dictionary_info[$pointer])) {
            return -1;
        }
        $this->current_generation =
            $this->dictionary_info[$pointer]['PARTITION'];
        $generation_postings = $this->getGenerationPostings($pointer);
        $this->current_doc_offset =
            $generation_postings[$this->current_offset]['DOC_MAP_INDEX'] ?? -1;
        return [$this->current_generation, $this->current_doc_offset];
    }
}

ViewGit