viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php /** * SeekQuarry/Yioop -- * Open Source Pure PHP Search Engine, Crawler, and Indexer * * Copyright (C) 2009 - 2023 Chris Pollett chris@pollett.org * * LICENSE: * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <https://www.gnu.org/licenses/>. * * END LICENSE * * @author Chris Pollett chris@pollett.org * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ * @copyright 2009 - 2023 * @filesource */ namespace seekquarry\yioop\library\index_bundle_iterators; use seekquarry\yioop\configs as C; use seekquarry\yioop\library as L; /** * Used to iterate over the documents which occur in all of a set of * iterator results * * @author Chris Pollett * @see IndexArchiveBundle */ class IntersectIterator extends IndexBundleIterator { /** * Number of seconds before timeout and stop * syncGenDocOffsetsAmongstIterators if slow */ const SYNC_TIMEOUT = 3; /** * An array of iterators whose intersection we get documents from * @var array */ public $index_bundle_iterators; /** * Number of elements in $this->index_bundle_iterators * @var int */ public $num_iterators; /** * The number of iterated docs before the restriction test * @var int */ public $seen_docs_unfiltered; /** * Index of the iterator amongst those we are intersecting to advance * next * @var int */ public $to_advance_index; /** * Associative array (term position in original query => iterator index * of an iterator for that term). This is to handle queries where the * same term occurs multiple times. For example, the rock back "The The" * * @var array */ public $word_iterator_map; /** * Number of elements in $this->word_iterator_map * @var int */ public $num_words; /** * Each element in this array corresponds to one quoted phrase in the * original query. Each element is in turn an array with elements * corresponding to a position of term in the original query followed * its length (a term might involve more than one word so the length * could be greater than one). It is also allowed that entries might * be of the form *num => * to indicates that an asterisk (a wild card that * can match any number of terms) appeared at that place in the query * @var array */ public $quote_positions; /** * A weighting factor to multiply with each doc SCORE returned from this * iterator * @var float */ public $weight; /** * Whether to run a timer that shuts down the intersect iterator if * syncGenDocOffsetsAmongstIterators takes longer than the time out period * @var bool */ public $sync_timer_on; /** * Start time for syncGenDocOffsetsAmongstIterators * @var int */ public $sync_time; /** * Which of the iterators has the current document with least index * @var int */ public $least_num_doc_index; /** * Creates an intersect iterator with the given parameters. * * @param object $index_bundle_iterators to use as a source of documents * to iterate over * @param array $word_iterator_map ssociative array ( * term position in original query => iterator index * of an iterator for that term) * @param array $quote_positions Each element in this array corresponds * to one quoted phrase in the original query. @see $quote_positions * field variable in this class for more info * @param float $weight multiplicative factor to apply to scores returned * from this iterator */ public function __construct($index_bundle_iterators, $word_iterator_map, $quote_positions = null, $weight = 1) { $this->index_bundle_iterators = $index_bundle_iterators; $this->word_iterator_map = $word_iterator_map; $this->num_words = count($word_iterator_map); $this->num_iterators = count($index_bundle_iterators); $this->num_docs = 40000000000; // a really big number $this->quote_positions = $quote_positions; $this->weight = $weight; $this->results_per_block = 1; $this->sync_timer_on = false; $this->sync_time = 0; /* We take an initial guess of the num_docs we return as the least of the num_docs of the underlying iterators. We are also setting up here that we return at most one posting at a time from each iterator */ $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; for ($i = 0; $i < $this->num_iterators; $i++) { if (empty($this->index_bundle_iterators[$i])) { continue; } $num_docs = $this->index_bundle_iterators[$i]->num_docs; if ($num_docs < $this->num_docs) { $this->num_docs = $num_docs; $this->least_num_doc_index = $i; } $this->index_bundle_iterators[$i]->setResultsPerBlock(1); } $this->total_num_docs = $this->num_docs; } /** * Returns the iterators to the first document block that it could iterate * over */ public function reset() { for ($i = 0; $i < $this->num_iterators; $i++) { if (empty($this->index_bundle_iterators[$i])) { continue; } $this->index_bundle_iterators[$i]->setResultsPerBlock(1); $this->index_bundle_iterators[$i]->reset(); } $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; } /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached * * @return mixed doc ids and rank if there are docs left, -1 otherwise */ public function findDocsWithWord() { static $test_time = 0; $status = $this->syncGenDocOffsetsAmongstIterators(); if ($status == -1) { return -1; } //next we finish computing divergence from randomness $docs = $this->index_bundle_iterators[0]->currentDocsWithWord(); $weight = $this->weight; if (is_array($docs) && count($docs) == 1) { //we get intersect docs one at a time so should be only one $keys = array_keys($docs); $key = $keys[0]; $position_lists = []; $len_lists = []; $docs[$key][self::POSITION_LIST] ??= []; $position_lists[0] = $docs[$key][self::POSITION_LIST]; $len_lists[0] = count($docs[$key][self::POSITION_LIST]); for ($i = 1; $i < $this->num_words; $i++) { if ($this->word_iterator_map[$i] < $i) { // this is the case of a repeated term $position_lists[] = $position_lists[$this->word_iterator_map[$i]] ?? []; $docs[$key][self::RELEVANCE] += $docs[$key][self::RELEVANCE]; } else { // first occurrence of term case $i_docs = $this->index_bundle_iterators[ $this->word_iterator_map[$i]]->currentDocsWithWord(); if (isset($i_docs[$key][self::POSITION_LIST]) && ($ct = count($i_docs[$key][self::POSITION_LIST]) > 0 )) { $position_lists[] = $i_docs[$key][self::POSITION_LIST]; $len_lists[] = $ct; } if (isset($i_docs[$key])) { $docs[$key][self::RELEVANCE] += ($i_docs[$key][self::RELEVANCE] ?? 0); } } } if (!empty($docs)) { // proximity is aggregated into score in phrase model $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] + $docs[$key][self::RELEVANCE]; if ($weight != 1) { $docs[$key][self::DOC_RANK] *= $weight; $docs[$key][self::RELEVANCE] *= $weight; $docs[$key][self::SCORE] *= $weight; } } } $this->count_block = (empty($docs) || $docs < 0) ? 0 : count($docs); $this->pages = $docs; return $docs; } /** * Used to check if quoted terms in search query appear exactly in * the position lists of the current document * * @param array $position_lists of search terms in the current document * @return bool whether the quoted terms in the search appear exactly */ public function checkQuotes(&$position_lists) { foreach ($this->quote_positions as $ngram_positions_within_quoted_query) { if ($this->checkQuote($position_lists, 0, "*", $ngram_positions_within_quoted_query) < 1) { return false; } } return true; } /** * Auxiliary function for @see checkQuotes used to check if quoted terms * in search query appear exactly in the position lists of the current * document * * @param array $position_lists of search terms in the current document * @param int $cur_pos to look after in any position list * @param mixed $next_pos * or int if * next_pos must be >= $cur_pos * +len_search_term. $next_pos represents the position the next * quoted term should be at * @param array $ngram_positions_within_quoted_query pairs: * $ngram_position_within_quoted_query => $len_of_ngram * @return int -1 on failure, 0 on backtrack, 1 on success */ public function checkQuote(&$position_lists, $cur_pos, $next_pos, $ngram_positions_within_quoted_query) { if ($ngram_positions_within_quoted_query == [] || $ngram_positions_within_quoted_query == null) { return 1; } $ngram_index = key($ngram_positions_within_quoted_query); $len = $ngram_positions_within_quoted_query[$ngram_index]; unset($ngram_positions_within_quoted_query[$ngram_index]); if (strcmp($len, "*") == 0) { return $this->checkQuote($position_lists, $cur_pos, "*", $ngram_positions_within_quoted_query); } $ngram_position_list = $position_lists[$ngram_index]; $is_star = (strcmp($next_pos, "*") == 0); $next_pos = ($is_star) ? $cur_pos + $len: $next_pos; while(true) { $found = false; foreach ($ngram_position_list as $occurrence_position) { if ($occurrence_position >= $next_pos) { $found = true; break; } } if (!$found) { return -1; } if ($is_star || $occurrence_position == $next_pos) { $check = $this->checkQuote($position_lists, $occurrence_position, $occurrence_position + $len, $ngram_positions_within_quoted_query); if ($check != 0) { return $check; } $next_pos = $occurrence_position + $len; } else { return 0; } } } /** * Given the position_lists of a collection of terms computes * a score for how close those words were in the given document * * @param array &$word_position_lists a 2D array item * number => position_list (locations in doc where item occurred) for * that item. * @param array &$word_len_lists length for each item of its position list * @param bool $is_doc whether this is the position list of a document * or a link * @param int $doc_len the length of the document * @return sum of inverse of all covers computed by plane sweep algorithm */ public function computeProximity(&$word_position_lists, &$word_len_lists, $is_doc, $doc_len) { $num_iterators = $this->num_iterators; if ($num_iterators < 1) { return 0; } $covers = []; $position_list = $word_position_lists; $interval = []; $num_words = count($position_list); for ($i = 0; $i < $num_words; $i++) { $min = (!empty($position_list[$i])) ? array_shift($position_list[$i]) : null; if (empty($min)) { break; } else { array_push($interval, [$min, $i]); for ($j = 0; $j < $num_words; $j++) { if (isset($position_list[$j][0]) && $min == $position_list[$j][0]) { array_shift($position_list[$j]); } } } } if (count($interval) != $num_words) { return 0; } sort($interval); $l = array_shift($interval); $r = end($interval); $stop = false; if (count($position_list[$l[1]]) == 0) { $stop = true; } while(!$stop) { $p = array_shift($position_list[$l[1]]); for ($i = 0;$i < $num_words; $i++){ if (isset($position_list[$i][0]) && $p == $position_list[$i][0]) { array_shift($position_list[$i]); } } $q = $interval[0][0]; if ($p > $r[0]) { array_push($covers, [$l[0], $r[0]]); array_push($interval, [$p, $l[1]]); } else { if ($p < $q) { array_unshift($interval, [$p, $l[1]]); } else { array_push($interval, [$p, $l[1]]); sort($interval); } } $l = array_shift($interval); $r = end($interval); if (count($position_list[$l[1]]) == 0) { $stop = true; } } array_push($covers, [$l[0],$r[0]]); $score = 0; foreach ($covers as $cover) { $score += (1/($cover[1] - $cover[0] + 1)); } $score = ($num_words * $score)/max($doc_len, 1); // this will ensure the score is less than 1 return $score; } /** * Finds the next generation and doc offset amongst all the iterators * that contains the word. It assumes that the (generation, doc offset) * pairs are ordered in an increasing fashion for the underlying iterators */ public function syncGenDocOffsetsAmongstIterators() { if ($this->sync_time === 0) { $this->sync_time = time(); } if ($this->sync_timer_on) { $time_out = self::SYNC_TIMEOUT + $this->sync_time; } else { //will probably never timeout this way so like no timer $time_out = 2 * (self::SYNC_TIMEOUT + $this->sync_time); } if (($biggest_gen_offset = $this->index_bundle_iterators[ 0]->currentGenDocOffsetWithWord()) == -1) { return -1; } $gen_doc_offset[0] = $biggest_gen_offset; $all_same = true; $direction = $this->getDirection(); /*find biggest generation/partition, offset/doc_map_index from among the sub iterators */ for ($i = 1; $i < $this->num_iterators; $i++) { if (empty($this->index_bundle_iterators[$i])) { return -1; } if ((($cur_gen_doc_offset = $this->index_bundle_iterators[ $i]->currentGenDocOffsetWithWord()) == -1) || time() > $time_out) { return -1; } $gen_doc_offset[$i] = $cur_gen_doc_offset; $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset, $biggest_gen_offset, $direction); if ($gen_doc_cmp > 0) { $biggest_gen_offset = $cur_gen_doc_offset; $all_same = false; } else if ($gen_doc_cmp < 0) { $all_same = false; } } if ($all_same) { return 1; } $last_changed = -1; $i = 0; //cycle through till get all gen/offsets in sync while($i != $last_changed) { if (time() > $time_out) { return -1; } if (!empty($gen_doc_offset[$i]) && $this->genDocOffsetCmp($gen_doc_offset[$i], $biggest_gen_offset, $direction) < 0) { $iterator = $this->index_bundle_iterators[$i]; $iterator->advance($biggest_gen_offset); if( ($cur_gen_doc_offset = $iterator->currentGenDocOffsetWithWord()) == -1) { return -1; } $gen_doc_offset[$i] = $cur_gen_doc_offset; $cmp = $this->genDocOffsetCmp($cur_gen_doc_offset, $biggest_gen_offset, $direction); if ($cmp < 0) { return -1; } else if ($cmp > 0) { $last_changed = $i; $biggest_gen_offset = $cur_gen_doc_offset; } } $i++; if ($i == $this->num_iterators) { $i = 0; $last_changed = max($last_changed, 0); } } return 1; } /** * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING * depending on the direction in which this iterator ttraverse the * underlying index archive bundle. * * @return int direction traversing underlying archive bundle */ public function getDirection() { if (!empty($this->index_bundle_iterators[0])) { return $this->index_bundle_iterators[0]->getDirection(); } return self::ASCENDING; } /** * Forwards the iterator one group of docs * @param array $gen_doc_offset a generation, doc_offset pair. If set, * the must be of greater than or equal generation, and if equal the * next block must all have $doc_offsets larger than or equal to * this value */ public function advance($gen_doc_offset = null) { $this->current_block_fresh = false; $this->seen_docs += 1; $i = $this->least_num_doc_index; $this->seen_docs_unfiltered = $this->index_bundle_iterators[$i]->seen_docs; if ($this->seen_docs_unfiltered > 0) { $this->num_docs = floor(($this->seen_docs * $this->total_num_docs) / $this->seen_docs_unfiltered); } $this->index_bundle_iterators[0]->advance($gen_doc_offset); } /** * Gets the doc_offset and generation for the next document that * would be return by this iterator * * @return mixed an array with the desired document offset * and generation; -1 on fail */ public function currentGenDocOffsetWithWord() { $this->syncGenDocOffsetsAmongstIterators(); return $this->index_bundle_iterators[0]->currentGenDocOffsetWithWord(); } /** * This method is supposed to set * the value of the result_per_block field. This field controls * the maximum number of results that can be returned in one go by * currentDocsWithWord(). This method cannot be consistently * implemented for this iterator and expect it to behave nicely * it this iterator is used together with union_iterator. So * to prevent a user for doing this, calling this method results * in a user defined error * * @param int $num the maximum number of results that can be returned by * a block */ public function setResultsPerBlock($num) { if ($num != 1) { trigger_error("Cannot set the results per block of an intersect iterator", E_USER_ERROR); } } /** * This method calculates the max relevance value for query underlying * the iterator to the document currently being iterated over * by the query * @return float maximum score for document relevance to a query */ public function getMaxRelevanceScore() { $max_relevance = 0; foreach ($this->index_bundle_iterators as $iterator) { $max_relevance += $iterator->getMaxRelevanceScore(); } return $max_score; } }