viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\library\index_bundle_iterators;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\IndexShard;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\PartitionDocumentBundle;
use seekquarry\yioop\models\ParallelModel;

/**
 * Used to iterate through the documents associated with a word in
 * an IndexArchiveBundle. It also makes it easy to get the summaries
 * of these documents.
 *
 * A description of how words and the documents containing them are stored
 * is given in the documentation of IndexArchiveBundle.
 *
 * @author Chris Pollett
 * @see IndexArchiveBundle
 */
class WordIterator extends IndexBundleIterator
{
    /**
     * Host Key position + 1 (first char says doc, inlink or external link)
     */
    const HOST_KEY_POS = 17;
    /**
     * Length of a doc key part
     */
    const KEY_LEN = 8;
    /**
     * Word key above in our modified base 64 encoding
     * @var string
     */
    public $base64_word_key;
    /**
     * The current value of the doc_offset of current posting if known
     * @var int
     */
    public $current_doc_offset;
    /**
     * Numeric number of current shard
     * @var int
     */
    public $current_generation;
    /**
     * The current byte offset in the IndexShard (if older index)
     * @var int
     */
    public $current_offset;
    /**
     * An array of shard generation and posting list offsets, lengths, and
     * numbers of documents
     * @var array
     */
    public $dictionary_info;
    /**
     * Keeps track of whether the word_iterator list is empty because the
     * word does not appear in the index shard
     * @var bool
     */
    public $empty;
    /**
     * Model responsible for keeping track of edited and deleted search results
     * @var SearchfiltersModel
     */
    public $filter;
    /**
     * Index into dictionary_info corresponding to the current shard
     * @var int
     */
    public $generation_pointer;
    /**
     * The timestamp of the index is associated with this iterator
     * @var string
     */
    public $index_name;
    /**
     * Whether word key corresponds to a meta word
     * @var string
     */
    public $is_meta;
    /**
     * Last Offset of word occurrence in the IndexShard
     * @var int
     */
    public $last_offset;
    /**
     * The next byte offset in the IndexShard
     * @var int
     */
    public $next_offset;
    /**
     * The total number of shards that have data for this word
     * @var int
     */
    public $num_generations;
    /**
     * MAX_ITEMS_PER_PARTITION field of the word info returned by
     * IndexManager::getWordInfo; defaults to
     * PartitionDocumentBundle::MAX_ITEMS_PER_FILE when absent
     * @var int
     */
    public $max_items_per_partition;
    /**
     * AVG_ITEMS_PER_PARTITION field of the word info returned by
     * IndexManager::getWordInfo; defaults to
     * PartitionDocumentBundle::MAX_ITEMS_PER_FILE when absent
     * @var int
     */
    public $avg_items_per_partition;
    /**
     * TOTAL_NUMBER_OF_PARTITIONS field of the word info returned by
     * IndexManager::getWordInfo (0 when absent)
     * @var int
     */
    public $total_number_of_partitions;
    /**
     * TOTAL_OCCURRENCES field of the word info returned by
     * IndexManager::getWordInfo (0 when absent)
     * @var int
     */
    public $num_occurrences;
    /**
     * Threshold-exceeded flag of the word info returned by
     * IndexManager::getWordInfo (false when absent)
     * @var bool
     */
    public $threshold_exceeded;
    /**
     * ARCHIVE_FILE field of the word info returned by
     * IndexManager::getWordInfo ("" when absent)
     * @var string
     */
    public $archive_file;
    /**
     * Set to true once termInfoIteratorFields has populated the word
     * statistics, so that later calls do not look them up again
     * @var bool
     */
    public $term_info_computed;
    /**
     * TOTAL_NUM_LINKS_AND_DOCS field of the word info returned by
     * IndexManager::getWordInfo (0 when absent)
     * @var int
     */
    public $total_num_docs_and_links;
    /**
     * How url, keywords, and title words should influence relevance
     * and doc rank calculations
     * @var array
     */
    public $ranking_factors;
    /**
     * First shard generation that word info was obtained for
     * @var int
     */
    public $start_generation;
    /**
     * Starting Offset of word occurrence in the IndexShard
     * @var int
     */
    public $start_offset;
    /**
     * Whether the iterator iterates forward or backward through documents in
     * bundle
     * @var int
     */
    public $direction;
    /**
     * hash of word or phrase that the iterator iterates over
     * @var string
     */
    public $word_key;
    /**
     * Whether the latest version of each document should be searched for
     * @var boolean
     */
    public $retrieve_latest;
    /**
     * Creates a word iterator with the given parameters.
     *
     * @param string $word_key hash of word or phrase to iterate docs of
     * @param string $index_name time_stamp of the index to use
     * @param bool $raw whether the $word_key is already a raw hash (true)
     *  or is in our variant of base64 encoding and must be decoded (false)
     * @param SearchfiltersModel $filter Model responsible for keeping track
     *  of edited and deleted search results
     * @param int $results_per_block the maximum number of results that can
     *  be returned by a findDocsWithWord call
     * @param int $direction when results are accessed from $index_name in
     *  which order they should be presented. self::ASCENDING is from first
     *  added to last added, self::DESCENDING is from last added to first
     *  added. Note: this value is not saved permanently. So you
     *  could in theory open two read only versions of the same bundle but
     *  reading the results in different directions
     * @param array $ranking_factors field say how url, keywords, and
     *  title words should influence relevance and doc rank calculations
     * @param boolean $retrieve_latest whether the latest indexed instance of a
     *  document should be returned or not (might have multiple instances
     *  if crawl indexes document more than once)
     *  (@see PhraseModel::lookupSummaryOffsetGeneration())
     */
    public function __construct($word_key, $index_name, $raw = false,
        $filter = null,
        $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK,
        $direction = self::ASCENDING, $ranking_factors = [],
        $retrieve_latest = true)
    {
        if ($raw == false) {
            //get rid of our modified base64 encoding
            $word_key = L\unbase64Hash($word_key);
        }
        $this->direction = $direction;
        $this->filter = $filter;
        $this->word_key = $word_key;
        $this->is_meta = L\PhraseParser::checkMetaTerm($this->word_key);
        $this->base64_word_key = L\base64Hash($word_key);
        $this->index_name = $index_name;
        $this->termInfoIteratorFields($index_name, $word_key);
        $this->current_doc_offset = null;
        $this->results_per_block = $results_per_block;
        $this->current_block_fresh = false;
        $this->retrieve_latest = $retrieve_latest;
        $this->start_generation = ($direction == self::ASCENDING) ? 0 :
            "ACTIVE";
        // caller supplied factors win; anything missing uses the config value
        $default_factors = [
            "CLD_URL_BONUS" => C\CLD_URL_BONUS,
            "HOST_URL_BONUS" => C\HOST_URL_BONUS,
            "HOST_KEYWORD_BONUS" => C\HOST_KEYWORD_BONUS,
            "PATH_KEYWORD_BONUS" => C\PATH_KEYWORD_BONUS,
            "TITLE_BONUS" => C\TITLE_BONUS,
            "WIKI_BONUS" => C\WIKI_BONUS,
            "NUM_SLASHES_BONUS" => C\NUM_SLASHES_BONUS,
        ];
        foreach ($default_factors as $factor => $default) {
            $this->ranking_factors[$factor] = $ranking_factors[$factor] ??
                $default;
        }
        if (!$this->empty) {
            $this->reset();
        }
    }
    /**
     * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING
     * depending on the direction in which this iterator traverses the
     * underlying index archive bundle.
     *
     * @return int direction traversing underlying archive bundle
     */
    public function getDirection()
    {
        return $this->direction;
    }
    /**
     * Resets the iterator to the first document block that it could iterate
     * over
     */
    public function reset()
    {
        if (!$this->empty) {
            //we shouldn't be called when empty - but to be safe
            $this->termInfoIteratorFields($this->index_name, $this->word_key);
            $info = ($this->direction == self::ASCENDING) ?
                $this->dictionary_info[0] :
                $this->dictionary_info[$this->num_generations - 1];
            $this->current_generation = $info['PARTITION'];
            $this->start_offset = 0;
            $this->last_offset = $info['NUM_DOCS'] - 1;
        } else {
            $this->start_offset = 0;
            $this->last_offset = -1;
            $this->num_generations = -1;
        }
        if ($this->direction == self::ASCENDING) {
            $this->current_offset = $this->start_offset;
            $this->generation_pointer = 0;
        } else {
            $this->current_offset = $this->last_offset;
            /* reset pointer to the number of gens, which in reverse
               is the first one we want
             */
            $this->generation_pointer = $this->num_generations - 1;
        }
        $this->count_block = 0;
        $this->seen_docs = 0;
        $this->current_doc_offset = null;
    }
    /**
     * Used to compute fields such as $this->total_num_docs for this iterator
     * on term $word_key for index $index_name. The lookup is only done the
     * first time this method is called on an iterator; later calls return
     * immediately.
     *
     * @param string $index_name name of index to compute statistics with
     *  respect to
     * @param string $word_key term to compute statistics with respect to
     */
    protected function termInfoIteratorFields($index_name, $word_key)
    {
        if (!empty($this->term_info_computed)) {
            return;
        }
        $word_info = IndexManager::getWordInfo($index_name, $word_key, -1, -1,
            C\NUM_DISTINCT_GENERATIONS, true);
        $this->total_num_docs = $word_info['TOTAL_NUM_DOCS'] ?? 0;
        $this->total_num_docs_and_links =
            $word_info['TOTAL_NUM_LINKS_AND_DOCS'] ?? 0;
        $this->max_items_per_partition =
            $word_info['MAX_ITEMS_PER_PARTITION'] ??
            PartitionDocumentBundle::MAX_ITEMS_PER_FILE;
        $this->avg_items_per_partition =
            $word_info['AVG_ITEMS_PER_PARTITION'] ??
            PartitionDocumentBundle::MAX_ITEMS_PER_FILE;
        $this->total_number_of_partitions =
            $word_info['TOTAL_NUMBER_OF_PARTITIONS'] ?? 0;
        $this->num_docs = $word_info['TOTAL_COUNT'] ?? 0;
        $this->num_occurrences = $word_info['TOTAL_OCCURRENCES'] ?? 0;
        $this->dictionary_info = $word_info['ROWS'] ?? [];
        /* NOTE(review): this code originally only read the key
           'THESHOLD_EXCEEDED', which looks like a misspelling. Both
           spellings are accepted until the spelling used by
           IndexManager::getWordInfo has been confirmed.
         */
        $this->threshold_exceeded = $word_info['THRESHOLD_EXCEEDED'] ??
            $word_info['THESHOLD_EXCEEDED'] ?? false;
        $this->archive_file = $word_info['ARCHIVE_FILE'] ?? "";
        if (empty($this->dictionary_info)) {
            $this->empty = true;
            $this->num_generations = 0;
        } else {
            $this->num_generations = count($this->dictionary_info);
            $this->empty = ($this->num_generations == 0);
        }
        $this->term_info_computed = true;
    }
    /**
     * Hook function used by currentDocsWithWord to return the current block
     * of docs if it is not cached
     *
     * @return mixed doc ids and score if there are docs left, -1 otherwise
     */
    public function findDocsWithWord()
    {
        if ($this->empty) {
            return -1;
        }
        $ascending = ($this->direction == self::ASCENDING);
        // index into dictionary_info of the last generation to be visited
        $final_pointer = ($ascending) ? $this->num_generations - 1 : 0;
        if ($ascending) {
            if ($this->generation_pointer >= $this->num_generations ||
                ($this->generation_pointer == $final_pointer &&
                $this->current_offset > $this->last_offset)) {
                return -1;
            }
        } else {
            if ($this->generation_pointer < 0 ||
                ($this->generation_pointer == $final_pointer &&
                $this->current_offset < $this->start_offset)) {
                return -1;
            }
        }
        $pre_results = $this->getPostingsSliceResults();
        $results = [];
        $doc_key_len = self::KEY_LEN;
        foreach ($pre_results as $keys => $data) {
            $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
            if (!empty($this->filter) &&
                $this->filter->isFiltered($host_key)) {
                continue;
            }
            $key_parts = str_split($keys, $doc_key_len);
            if (!isset($key_parts[2])) {
                // a doc key needs three KEY_LEN sized parts to be usable
                continue;
            }
            $data[self::KEY] = $keys;
            // inlinks is the domain of the inlink
            list(, $data[self::HASH], $data[self::INLINKS]) = $key_parts;
            $data[self::CRAWL_TIME] = $this->index_name;
            $results[$keys] = $data;
        }
        $this->count_block = count($results);
        /* Only signal exhaustion (-1) if the generation that produced no
           postings is the last one for the direction of travel: the highest
           index when ascending, index 0 when descending.
         */
        if ($this->generation_pointer == $final_pointer &&
            empty($pre_results)) {
            $results = -1;
        }
        $this->pages = $results;
        return $results;
    }
    /**
     * Given the current_offset, results_per_block, and index used get the
     * results_per_block postings starting from current_offset in the current
     * direction (ascending or descending) for the term word iterator
     * iterates over from the index. As a side effect $this->next_offset is
     * moved past the returned slice.
     *
     * @return array doc key => posting with scoring info, as computed by
     *  getDocKeyPositionsScoringInfo; empty if no postings are left in the
     *  current generation
     */
    public function getPostingsSliceResults()
    {
        $this->next_offset = $this->current_offset;
        if ($this->direction == self::ASCENDING) {
            if ($this->current_offset < $this->start_offset) {
                $this->current_offset = $this->start_offset;
                $this->next_offset = $this->current_offset;
            }
            if ($this->next_offset > $this->last_offset) {
                return [];
            }
            $start_slice = $this->next_offset;
            $num_slice = min($this->results_per_block,
                $this->last_offset - $this->next_offset + 1);
            $this->next_offset += $num_slice;
        } else {
            if ($this->current_offset > $this->last_offset) {
                $this->current_offset = $this->last_offset;
                $this->next_offset = $this->current_offset;
            }
            if ($this->next_offset < $this->start_offset) {
                return [];
            }
            /* Take at most the postings that remain between start_offset
               and next_offset. Taking more would make $start_slice
               negative, and array_slice interprets a negative offset as
               counted from the end of the array, returning postings from
               the wrong end of the list.
             */
            $num_slice = min($this->results_per_block,
                $this->next_offset - $this->start_offset + 1);
            $this->next_offset -= $num_slice;
            $start_slice = $this->next_offset + 1;
        }
        $postings = $this->getGenerationPostings($this->generation_pointer);
        $postings = array_slice($postings, $start_slice, $num_slice);
        return $this->getDocKeyPositionsScoringInfo($postings,
            $this->current_generation);
    }
    /**
     * Get the positions file specs for the given base index folder.
     *
     * @param string $base_folder of index folder
     * @return array [file handle, file size]; [false, 0] if the positions
     *  file does not exist
     */
    public function getPositionsFile($base_folder)
    {
        $positions_filename = $base_folder . "/" .
            IndexDocumentBundle::POSITIONS_FILENAME;
        if (!file_exists($positions_filename)) {
            return [false, 0];
        }
        $fh = fopen($positions_filename, "r");
        $file_size = filesize($positions_filename);
        return [$fh, $file_size];
    }
    /**
     * Get the positions entry associated with a particular posting.
     *
     * @param array $posting posting whose POSITIONS_OFFSET, POSITIONS_LEN
     *  and FREQUENCY fields say where its position list is stored
     * @param int $positions_file_size of positions file
     * @param resource $positions_fh of positions file
     * @return array of positions; empty if the posting has no positions or
     *  they could not be read
     */
    public function getPositionsList($posting, $positions_file_size,
        $positions_fh)
    {
        if (empty($positions_fh) || $posting['POSITIONS_LEN'] <= 0 ||
            $posting['POSITIONS_LEN'] >= $positions_file_size) {
            return [];
        }
        if (fseek($positions_fh, $posting['POSITIONS_OFFSET']) < 0) {
            return [];
        }
        $encoded_positions = fread($positions_fh, $posting['POSITIONS_LEN']);
        if ($encoded_positions === false) {
            return [];
        }
        return L\decodePositionList($encoded_positions,
            $posting['FREQUENCY']);
    }
    /**
     * Add to a set of postings from a partition scoring information, position
     * list information and info about the relative weights of given position
     * based on the position list file and doc_map file.
     *
     * @param array $postings posting data to add scoring information to
     * @param int $partition which partition from the PartitionDocumentBundle
     *  postings are related to
     * @return array doc key => posting, where each posting has been
     *  augmented with fields such as self::DOC_RANK, self::RELEVANCE and
     *  self::SCORE. Postings whose doc map entry cannot be read are left out.
     */
    public function getDocKeyPositionsScoringInfo($postings, $partition)
    {
        $key_postings = [];
        $index = IndexManager::getIndex($this->index_name);
        $base_folder = $index->getPartitionBaseFolder($partition);
        $doc_map_filename = $base_folder . "/" .
            IndexDocumentBundle::DOC_MAP_FILENAME;
        $doc_map_tools = $index->doc_map_tools;
        list($fh, $file_size) = $this->getPositionsFile($base_folder);
        $number_of_partitions = $this->total_number_of_partitions;
        $num_doc_keys = $doc_map_tools->countTableEntries($doc_map_filename);
        $is_ascending = ($this->direction == self::ASCENDING);
        $num_seen_partitions = ($is_ascending) ? $partition + 1 :
            $number_of_partitions - $partition;
        $occurrences_per_doc = $this->num_occurrences /
            max($this->total_num_docs, 1);
        $nonzero_occurrences_per_doc = ($occurrences_per_doc > 0) ?
            $occurrences_per_doc : 1;
        $docid_len = IndexDocumentBundle::DOCID_LEN;
        $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN;
        $time = time();
        foreach ($postings as $posting) {
            $posting[self::GENERATION] = $partition;
            $posting[self::POSITION_LIST] = $this->getPositionsList($posting,
                $file_size, $fh);
            $doc_map_index = $posting['DOC_MAP_INDEX'];
            $entry = $doc_map_tools->findEntryAtIndexTableName(
                $doc_map_filename, $doc_map_index);
            if (!is_string($entry) || strlen($entry) < $docid_len) {
                continue;
            }
            $doc_key = substr($entry, 0, $docid_len);
            $is_text = IndexDocumentBundle::isType($doc_key, "text");
            /*
               For backward compatibility: only check for the latest
               crawled version of a page if the character right after the
               doc id is 't' (the beginning character of the term bloom
               filter string attached to doc_map entries). The length test
               avoids reading a string offset that does not exist.
             */
            $has_filter_marker = (strlen($entry) > $docid_len &&
                $entry[$docid_len] == 't');
            $values_start = ($has_filter_marker &&
                strlen($entry) >= $docid_len + $termsfilter_len + 1) ?
                $docid_len + $termsfilter_len + 1 : $docid_len;
            $values = substr($entry, $values_start);
            if ($this->retrieve_latest && $has_filter_marker && $is_text) {
                $latest = $this->latestVersionPosting($index, $posting,
                    $doc_key, $values, $partition);
                if ($latest === false) {
                    continue;
                }
                list($posting, $doc_key, $values) = $latest;
            }
            if (IndexDocumentBundle::isType($doc_key, "doc")) {
                $posting[self::IS_DOC] = true;
            }
            $doc_info = $doc_map_tools->unpack($values);
            if (empty($doc_info)) {
                continue;
            }
            $posting[self::KEY] = $doc_key;
            list($posting[self::DOC_LEN], $original_score) =
                array_values(array_shift($doc_info));
            /* a stored score between half the current time and the current
               time is treated as a timestamp rather than a quality score
             */
            $is_timestamp_score = ($original_score <= $time &&
                $original_score > ($time >> 1));
            /* The DOC_RANK calculation computes a document quality measure
               either based on time item was added (freshness) or a sum of
               signals (how early or late it was added to index), whether
               the url was a CLD or HOST, whether page was a wiki page, and
               the number of slashes in the url path
             */
            if ($is_timestamp_score) {
                $posting[self::DOC_RANK] =
                    $time / (max(1, $time - $original_score)) *
                    $this->getMaxDocQualityScore();
            } else {
                $posting[self::DOC_RANK] = $this->computeDocRank($doc_key,
                    $doc_map_index, $num_seen_partitions,
                    $number_of_partitions, $num_doc_keys,
                    $this->avg_items_per_partition,
                    $this->max_items_per_partition, $this->ranking_factors,
                    $is_ascending);
            }
            list($preface_positions, $num_description_scores) =
                array_values(array_shift($doc_info));
            $num_description_scores = intval($num_description_scores);
            // preface end positions are packed one byte each, lowest first
            $posting["PATH_KEYWORDS_END_POS"] = ($preface_positions & 255);
            $preface_positions = $preface_positions >> 8;
            $posting["TITLE_END_POS"] = ($preface_positions & 255);
            $preface_positions = $preface_positions >> 8;
            $posting["HOST_KEYWORDS_END_POS"] = ($preface_positions & 255);
            $posting[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0,
                $num_description_scores);
            if ($posting['FREQUENCY'] > 0) {
                list($bonuses, $frequency) = $this->frequencyScoring(
                    $occurrences_per_doc, $posting[self::POSITION_LIST],
                    $posting[self::DOC_LEN],
                    $posting["HOST_KEYWORDS_END_POS"],
                    $posting["TITLE_END_POS"],
                    $posting["PATH_KEYWORDS_END_POS"],
                    $posting[self::DESCRIPTION_SCORES]);
                // Divergence-from-randomness + preface score
                $posting[self::RELEVANCE] =
                    $bonuses * log(1 + 1/$nonzero_occurrences_per_doc, 2) /
                    ($bonuses + 1) +
                    ((log(1 + $occurrences_per_doc, 2) + $frequency *
                    log(1 + 1/$nonzero_occurrences_per_doc, 2)) /
                    ($frequency + 1));
            } else {
                /* this will typically be the relevance score for a meta
                   word. As these will always be frequency 1 and have no
                   position info, set close to 0. (Not zero to avoid
                   div by 0's)
                 */
                $posting[self::RELEVANCE] = 0.01;
            }
            $posting[self::SCORE] = $posting[self::DOC_RANK] +
                $posting[self::RELEVANCE];
            $posting[self::USER_RANKS] = array_slice($doc_info,
                $num_description_scores);
            $key_postings[$doc_key] = $posting;
        }
        if (!empty($fh)) {
            fclose($fh);
        }
        return $key_postings;
    }
    /**
     * Helper for getDocKeyPositionsScoringInfo: checks whether a more
     * recently indexed version of the document of $posting exists and, if it
     * does and still contains this iterator's term, rewrites the posting to
     * refer to that version.
     *
     * @param object $index index object returned by IndexManager::getIndex
     * @param array $posting posting as read from partition $partition
     * @param string $doc_key doc key of the posting's doc map entry
     * @param string $values packed doc map values of the posting's entry
     * @param int $partition partition the posting was read from
     * @return mixed false if the posting should be dropped (the latest
     *  version's doc map entry is unreadable or no longer contains the
     *  term); otherwise the array [$posting, $doc_key, $values], updated to
     *  the latest version if one different from the posting was found
     */
    protected function latestVersionPosting($index, $posting, $doc_key,
        $values, $partition)
    {
        $docid_len = IndexDocumentBundle::DOCID_LEN;
        $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN;
        $url_hash = substr($doc_key, 0, self::KEY_LEN);
        $latest_version_info = IndexManager::lookupLatestVersionPage(
            $url_hash, $this->index_name);
        if ($latest_version_info == null) {
            return [$posting, $doc_key, $values];
        }
        list($latest_partition, $latest_posting) = $latest_version_info;
        $latest_doc_map_index = $latest_posting['DOC_MAP_INDEX'];
        /*
           Ensure that the discovered latest version
           isn't the same as the current posting.
         */
        if ($partition == $latest_partition &&
            $latest_doc_map_index == $posting['DOC_MAP_INDEX']) {
            return [$posting, $doc_key, $values];
        }
        $latest_base_folder = $index->getPartitionBaseFolder(
            $latest_partition);
        $latest_doc_map_filename = $latest_base_folder . "/" .
            IndexDocumentBundle::DOC_MAP_FILENAME;
        $latest_doc_map_entry =
            $index->doc_map_tools->findEntryAtIndexTableName(
            $latest_doc_map_filename, $latest_doc_map_index);
        if (!is_string($latest_doc_map_entry) ||
            strlen($latest_doc_map_entry) < $docid_len) {
            return false;
        }
        $terms_filter = substr($latest_doc_map_entry, $docid_len + 1,
            $termsfilter_len);
        if (!IndexDocumentBundle::checkTermExists($this->word_key,
            $terms_filter)) {
            return false;
        }
        /*
           The current term id exists in the most recent
           version of the document; replace the current
           posting entries with the latest entry.
         */
        $posting[self::GENERATION] = $latest_partition;
        $posting['DOC_MAP_INDEX'] = $latest_doc_map_index;
        $doc_key = substr($latest_doc_map_entry, 0, $docid_len);
        $values = substr($latest_doc_map_entry,
            $docid_len + $termsfilter_len + 1);
        $target_posting = $this->findPartitionPosting($latest_partition,
            $latest_doc_map_index);
        if ($target_posting !== null) {
            $posting['POSITIONS_LEN'] = $target_posting['POSITIONS_LEN'];
            $posting['POSITIONS_OFFSET'] =
                $target_posting['POSITIONS_OFFSET'];
            $posting['FREQUENCY'] = $target_posting['FREQUENCY'];
            /* the offsets just copied refer to the positions file of the
               latest partition, so that is the file that has to be read
             */
            list($latest_positions_fh, $latest_positions_file_size) =
                $this->getPositionsFile($latest_base_folder);
            $posting[self::POSITION_LIST] = $this->getPositionsList($posting,
                $latest_positions_file_size, $latest_positions_fh);
            if (!empty($latest_positions_fh)) {
                fclose($latest_positions_fh);
            }
        }
        return [$posting, $doc_key, $values];
    }
    /**
     * Helper for latestVersionPosting: looks in this iterator's term
     * postings for the posting of a given doc map index in a given
     * partition.
     *
     * @param int $partition PARTITION value of the dictionary_info row to
     *  search
     * @param int $doc_map_index DOC_MAP_INDEX of the posting wanted
     * @return mixed the matching posting array, or null if this iterator has
     *  no such posting
     */
    protected function findPartitionPosting($partition, $doc_map_index)
    {
        foreach ($this->dictionary_info as $generation => $info) {
            if (!isset($info['PARTITION']) ||
                $info['PARTITION'] != $partition) {
                continue;
            }
            /* getGenerationPostings takes an index into dictionary_info,
               not a partition number
             */
            $candidates = $this->getGenerationPostings($generation);
            foreach ($candidates as $candidate) {
                if (isset($candidate['DOC_MAP_INDEX']) &&
                    $candidate['DOC_MAP_INDEX'] == $doc_map_index) {
                    return $candidate;
                }
            }
        }
        return null;
    }
    /**
     * Computes weighted frequencies of a term within a document with respect to
     * the length of the document, the positions of the term with the document
     * and the overall importance score for a given position within the document
     * Also computes the score of the posting for the host keywords,
     * title keywords, and path keywords bonuses.
     *
     * @param float $occurrences_per_doc expected number of occurrence of term
     *  per/doc. (Not used in the computation done by this method.)
     * @param array $positions positions of this iterators term in the document
     * @param int $num_words number of terms in the document
     * @param int $host_keywords_end_pos term offset into the document summary
     *  that demarks the end of the host keywords portion of the summary
     * @param int $title_end_pos absolute term offset into the document summary
     *  that demarks the end of the title portion of the summary
     * @param int $path_keywords_end_pos absolute term offset into the document
     *  summary that demarks the end of the path keywords portion of the
     *  summary
     * @param array $descriptions_scores boundaries and scores of different
     *  regions with document
     * @return array [score for host title path keywords bonuses, frequency]
     */
    public function frequencyScoring($occurrences_per_doc, $positions,
        $num_words, $host_keywords_end_pos, $title_end_pos,
        $path_keywords_end_pos, $descriptions_scores)
    {
        $num_words = max($num_words, 1);
        /*
           Amati and van Rijsbergen suggest a normalization of
           log_2(1 + l_avg/l_d) for divergence-from-randomness
           Here l_avg = average num words in a document, l_d = num words
           current document. C\MAX_DESCRIPTION_LEN is the max number
           of characters in a document. Assuming the average word is
           around 5 chars + whitespace char + punctuation, and most documents
           are summarized to close to the max character length, we
           approximate l_avg as C\MAX_DESCRIPTION_LEN/7 in the below.
         */
        $pseudo_doc_length = 7 * $num_words;
        $length_normalization = log(1 +
            C\MAX_DESCRIPTION_LEN / $pseudo_doc_length, 2);
        if (empty($descriptions_scores)) {
            /* No region scores to weight by, so every occurrence counts
               equally and there are no bonuses. Callers destructure the
               result, so it must be an array.
             */
            return [0, count($positions) * $length_normalization];
        }
        $host_bonus = $this->ranking_factors["HOST_KEYWORD_BONUS"];
        $path_bonus = $this->ranking_factors["PATH_KEYWORD_BONUS"];
        $title_bonus = $this->ranking_factors["TITLE_BONUS"];
        $len_term = strlen($this->word_key);
        /* Sum of description scores without bonus scores we add below is 1.
           So with the scores we add below is $max_doc_norm_score. The foreach
           loop that follows measures what fraction of this comes from
           $this->word_key occurrences, so will be a number less than
           $max_doc_norm_score;
         */
        $descriptions_scores = array_merge([
            ['POS' => - $path_keywords_end_pos - 1,
                'SCORE' => $host_bonus],
            ['POS' => $host_keywords_end_pos - $path_keywords_end_pos - 1,
                'SCORE' => $title_bonus],
            ['POS' => $title_end_pos - $path_keywords_end_pos - 1,
                'SCORE' => $path_bonus],
        ], $descriptions_scores);
        $num_scores = count($descriptions_scores);
        $weighted_frequency = 0;
        $bonuses = 0;
        /* $first_index is deliberately not reset between positions: each
           search resumes from the region found for the previous position
         */
        $first_index = 0;
        foreach ($positions as $position) {
            $last_index = $num_scores - 1;
            /* description score offsets are with respect to the description
               only so we subtract from the term position the offset of the
               non-description
             */
            $position -= ($path_keywords_end_pos + 1);
            // binary search for the last region starting at or before
            // $position; the midpoint is rounded up
            while ($first_index < $last_index) {
                $mid_index = intdiv($first_index + $last_index + 1, 2);
                if ($descriptions_scores[$mid_index]['POS'] > $position) {
                    $last_index = $mid_index - 1;
                } else {
                    $first_index = $mid_index;
                }
            }
            $weight = $descriptions_scores[$first_index]['SCORE'];
            $start_description_pos =
                $descriptions_scores[$first_index]['POS'];
            $end_description_pos = ($first_index == $num_scores - 1) ?
                $pseudo_doc_length :
                $descriptions_scores[$first_index + 1]['POS'];
            $len_description = max(
                abs($end_description_pos - $start_description_pos),
                $len_term, 1);
            $frequency_term = $weight * $len_term / $len_description;
            if ($position <= 0) {
                // occurrence in host, title or path keywords
                $bonuses += $weight;
            } else {
                $weighted_frequency += $frequency_term;
            }
        }
        $frequency = $weighted_frequency * $length_normalization;
        return [$bonuses, $frequency];
    }
    /**
     * Updates the seen_docs count during an advance() call
     */
    public function advanceSeenDocs()
    {
        if ($this->current_block_fresh != true) {
            if ($this->direction == self::ASCENDING) {
                $remaining_postings = $this->last_offset -
                    $this->next_offset;
                $delta_sign = 1;
            } else {
                $remaining_postings = $this->next_offset -
                    $this->start_offset + 1;
                $delta_sign = -1;
            }
            $num_docs = min($this->results_per_block, $remaining_postings);
            $this->next_offset = $this->current_offset;
            $this->next_offset += $delta_sign * $num_docs;
            if ($num_docs <= 0) {
                return;
            }
        } else {
            $num_docs = $this->count_block;
        }
        $this->current_block_fresh = false;
        $this->seen_docs += $num_docs;
    }
    /**
     * Forwards the iterator one group of docs
     * @param array $gen_doc_offset a generation, doc_offset pair. If not null,
     *  (in the ascending search case opposite for descending), the pair
     *  must be of greater than or equal generation, and if equal the
     *  next block must all have $doc_offsets larger than or equal to
     *  this value.
     */
    public function advance($gen_doc_offset = null)
    {
        if ($gen_doc_offset == null) {
            $this->plainAdvance();
            return;
        }
        $is_ascending = ($this->direction == self::ASCENDING);
        $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord();
        if ($cur_gen_doc_offset == -1 ||
            $this->genDocOffsetCmp($cur_gen_doc_offset,
            $gen_doc_offset, $this->direction) >= 0) {
            // nothing left, or already at or past the requested pair
            return;
        }
        $advance_check = ($is_ascending) ?
            ($this->current_generation < $gen_doc_offset[0]) :
            ($this->current_generation > $gen_doc_offset[0]);
        if ($advance_check) {
            $this->advanceGeneration($gen_doc_offset[0]);
            $this->next_offset = $this->current_offset;
        }
        if ($this->current_generation == $gen_doc_offset[0]) {
            $offset_pair = $this->nextDocIndexOffsetPair($gen_doc_offset[1]);
            if ($offset_pair === false) {
                $this->advanceGeneration();
            } else {
                list($this->current_offset, $this->current_doc_offset) =
                    $offset_pair;
            }
            $this->next_offset = $this->current_offset;
        }
        $this->seen_docs = ($is_ascending) ?
            ($this->current_offset - $this->start_offset) :
            ($this->last_offset - $this->current_offset);
        $this->current_block_fresh = false;
    }
    /**
     * Computes a pair [posting_slice_offset, $doc_index], such that
     * the $doc_index when shifted to make a doc_offset is greater than
     * $doc_offset and posting_slice_offset is the offset of the first
     * posting with this property.
     * @param int $doc_offset that we are trying to find a posting whose
     *  doc_index has a bigger doc_offset
     * @return mixed array [posting_slice_offset, $doc_index], or false if
     *  the current generation has no such posting
     */
    public function nextDocIndexOffsetPair($doc_offset)
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        $end_offset = ($is_ascending) ? $this->last_offset :
            $this->start_offset;
        $postings = $this->getGenerationPostings($this->generation_pointer);
        if (empty($postings[$end_offset])) {
            return false;
        }
        $last_doc = $postings[$end_offset]["DOC_MAP_INDEX"];
        if (($is_ascending && $last_doc < $doc_offset) ||
            (!$is_ascending && $last_doc > $doc_offset)) {
            // even the final posting of the generation is before the target
            return false;
        }
        $next_offset = ($this->next_offset ?? $this->current_offset);
        $last_offset = $next_offset;
        $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"] ?? $doc_offset;
        $cmp = ($is_ascending) ?
            ($next_doc < $doc_offset && $next_offset <= $end_offset) :
            ($next_doc > $doc_offset && $next_offset >= $end_offset);
        $delta = ($is_ascending) ? 1 : -1;
        // gallop: step by 1, 2, 4, ... until at or past the target
        while ($cmp) {
            $last_offset = $next_offset;
            $next_offset += $delta;
            $delta *= 2;
            $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"] ??
                $doc_offset;
            $cmp = ($is_ascending) ?
                ($next_doc < $doc_offset && $next_offset <= $end_offset) :
                ($next_doc > $doc_offset && $next_offset >= $end_offset);
        }
        if (($is_ascending && $next_offset > $end_offset) ||
            (!$is_ascending && $next_offset < $end_offset)) {
            $next_offset = $end_offset;
        }
        // then binary search between the last two offsets of the gallop
        while (abs($next_offset - $last_offset) > 1) {
            $mid_offset = ($next_offset + $last_offset) >> 1;
            $mid_doc = $postings[$mid_offset]["DOC_MAP_INDEX"];
            $cmp = ($is_ascending) ? ($mid_doc < $doc_offset) :
                ($mid_doc > $doc_offset);
            if ($cmp) {
                $last_offset = $mid_offset;
            } else {
                $next_offset = $mid_offset;
                $next_doc = $mid_doc;
            }
        }
        if (abs($next_offset - $last_offset) == 1) {
            $next_doc = $postings[$next_offset]["DOC_MAP_INDEX"];
        }
        return [$next_offset, $next_doc];
    }
    /**
     * Forwards the iterator one group of docs. This is what's called
     * by @see advance($gen_doc_offset) if $gen_doc_offset is null
     */
    public function plainAdvance()
    {
        $is_ascending = ($this->direction == self::ASCENDING);
        $this->advanceSeenDocs();
        $this->current_doc_offset = null;
        $has_moved = ($is_ascending) ?
            ($this->current_offset < $this->next_offset) :
            ($this->current_offset > $this->next_offset);
        if ($has_moved) {
            $this->current_offset = $this->next_offset;
            $past_end = ($is_ascending) ?
                ($this->current_offset > $this->last_offset) :
                ($this->current_offset < $this->start_offset);
            if (!$past_end) {
                return;
            }
        }
        // either no progress was possible or we ran off this generation
        $this->advanceGeneration();
        $this->next_offset = $this->current_offset;
    }
    /**
     * Switches which index shard is being used to return occurrences of
     * the word to the next shard containing the word
     *
     * @param int $generation the iterator always moves at least one
     *  generation in its direction of travel, and keeps moving while the
     *  current generation is still before $generation (smaller when
     *  ascending, larger when descending). If null, the current generation
     *  is used, so exactly one step is taken.
     */
    public function advanceGeneration($generation = null)
    {
        if ($generation === null) {
            $generation = $this->current_generation;
        }
        $this->generation_pointer ??= 0; //if not set set to 0
        $is_ascending = ($this->direction == self::ASCENDING);
        do {
            $gen_check = ($is_ascending) ?
                ($this->generation_pointer < $this->num_generations) :
                ($this->generation_pointer >= 0);
            if ($gen_check) {
                if ($is_ascending) {
                    $this->generation_pointer++;
                } else {
                    $this->generation_pointer--;
                }
            }
            $gen_check = ($is_ascending) ?
                ($this->generation_pointer < $this->num_generations) :
                ($this->generation_pointer >= 0);
            if ($gen_check) {
                $partition_info =
                    $this->dictionary_info[$this->generation_pointer];
                $this->current_generation = $partition_info['PARTITION'];
                $this->start_offset = 0;
                $this->last_offset = ($partition_info['NUM_DOCS'] ?? 1) - 1;
                $this->current_offset = ($is_ascending) ?
                    $this->start_offset : $this->last_offset;
            }
            $gen_check = ($is_ascending) ?
                ($this->current_generation < $generation &&
                $this->generation_pointer < $this->num_generations) :
                ($this->current_generation > $generation &&
                $this->generation_pointer >= 0);
        } while ($gen_check);
    }
    /**
     * Retrieves all the postings for the word iterator's term that belong to
     * one row of $this->dictionary_info. The postings are cached in the
     * POSTINGS field of that row so they are only read once.
     *
     * @param int $generation index into $this->dictionary_info of the row
     *  to get postings for (this is what $this->generation_pointer holds;
     *  the partition number itself is the row's PARTITION field)
     * @return array of posting items; empty if the row does not exist or
     *  has no readable postings
     */
    public function getGenerationPostings($generation)
    {
        if (empty($this->dictionary_info[$generation])) {
            return [];
        }
        $generation_info = $this->dictionary_info[$generation];
        if (!empty($generation_info['POSTINGS']) &&
            is_array($generation_info['POSTINGS'])) {
            return $generation_info['POSTINGS']; //already loaded
        }
        $postings = [];
        if (!empty($generation_info['POSTINGS_OFFSET']) &&
            !empty($generation_info['POSTINGS_LEN'])) {
            $index = IndexManager::getIndex($this->index_name);
            $postings_entry = $index->getPostingsString(
                $generation_info['PARTITION'],
                $generation_info['POSTINGS_OFFSET'],
                $generation_info['POSTINGS_LEN']);
            if (!empty($postings_entry)) {
                $postings_info = $index->unpackPostings($postings_entry);
                if (!empty($postings_info)) {
                    list($postings,) = $postings_info;
                }
            }
        }
        $this->dictionary_info[$generation]['POSTINGS'] = $postings;
        return $postings;
    }
    /**
     * Gets the doc_offset and generation for the next document that
     * would be return by this iterator
     *
     * @return mixed an array with the desired document offset
     *  and generation; -1 on fail
     */
    public function currentGenDocOffsetWithWord()
    {
        if ($this->current_doc_offset !== null) {
            return [$this->current_generation, $this->current_doc_offset];
        }
        $is_ascending = ($this->direction == self::ASCENDING);
        $out_of_range = ($is_ascending) ?
            ($this->current_offset > $this->last_offset ||
            $this->generation_pointer >= $this->num_generations) :
            ($this->current_offset < $this->start_offset ||
            $this->generation_pointer < 0);
        if ($out_of_range ||
            empty($this->dictionary_info[$this->generation_pointer])) {
            return -1;
        }
        $partition_info = $this->dictionary_info[$this->generation_pointer];
        $this->current_generation = $partition_info['PARTITION'];
        $postings = $this->getGenerationPostings($this->generation_pointer);
        $this->current_doc_offset =
            $postings[$this->current_offset]['DOC_MAP_INDEX'] ?? -1;
        return [$this->current_generation, $this->current_doc_offset];
    }
}