diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 22824ca2f..c52fcab88 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -314,8 +314,13 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants $doc_map_tools = $index->doc_map_tools; $entry = $doc_map_tools->findEntryAtIndexTableName($doc_map_filename, $doc_map_index); - $doc_key = substr($entry, 0, IndexDocumentBundle::DOCID_LEN); - $entry = substr($entry, IndexDocumentBundle::DOCID_LEN); + $docid_len = IndexDocumentBundle::DOCID_LEN; + $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN; + $doc_key = substr($entry, 0, $docid_len); + $entry = (strlen($entry) >= ($docid_len + $termsfilter_len + 1) && + $entry[$docid_len] == 't') ? + substr($entry, $docid_len + $termsfilter_len + 1) : + substr($entry, $docid_len); $doc_map_tools = $index->doc_map_tools; echo "Doc Key: " . L\toHexString($doc_key) . "\n"; echo "Partition: $partition\n"; diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index e2a920a92..da7cd7f41 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -31,6 +31,8 @@ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; +use seekquarry\yioop\library as L; +use seekquarry\yioop\models\ParallelModel; /** * For crawlHash @@ -56,10 +58,19 @@ class IndexManager implements CrawlConstants * @var array */ public static $index_times = []; + /** + * List of entries of the form name of url => doc_map info when cached + * @var array + */ + public static $urls_cache = []; /** * Max number of IndexArchiveBundles that can be cached */ const INDEX_CACHE_SIZE = 1000; + /** + * Max number of URLs to be cached for most recent version of a page lookup + */ + const URLS_CACHE_SIZE = 1000; /** * Returns a reference to the managed copy of an IndexArchiveBundle object * with a given timestamp or feed (for handling media feeds) @@ -367,4 +378,42 @@ class IndexManager implements CrawlConstants 
$num_docs_cache[$index_name][$term] = $total; return $total; } + + /** + * Finds posting info related to the most recent version + * of a URL in the given index, caching hits per (index, URL) pair + * + * @param string $url_hash hash of the URL to be looked up + * @param string $index_name name of the index to search + * @return array|null [partition, latest posting], null if none found + */ + public static function lookupLatestVersionPage($url_hash, $index_name) + { + // Key on index and URL: the lookup below depends on both of them + $cache_key = $index_name . ":" . $url_hash; + if (array_key_exists($cache_key, self::$urls_cache)) { + return self::$urls_cache[$cache_key]; + } + $model_for_url_hash_lookup = new ParallelModel(); + $page_versions = $model_for_url_hash_lookup-> + lookupSummaryOffsetGeneration(L\base64Hash($url_hash), + $index_name, false, true); + if (key_exists('ROWS', $page_versions) && + count($page_versions['ROWS']) > 0) { + $latest_row = end($page_versions['ROWS']); + $latest_postings_info = $latest_row['POSTINGS']; + $latest_partition = $latest_row['PARTITION']; + if (is_array($latest_postings_info) && + count($latest_postings_info) > 0) { + $latest_posting = end($latest_postings_info); + if (count(self::$urls_cache) >= self::URLS_CACHE_SIZE) { + self::$urls_cache = []; + } + self::$urls_cache[$cache_key] = [$latest_partition, + $latest_posting]; + return self::$urls_cache[$cache_key]; + } + } + return null; + } } diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 3334baf7a..ff9444def 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -441,7 +441,7 @@ class PhraseParser $term_parts = explode("-", $term ??
""); array_shift($term_parts); - foreach($term_parts as $part) { + foreach ($term_parts as $part) { $pos_lists[$part][] = $t; } $t++; @@ -1584,4 +1584,23 @@ class PhraseParser } return $result; } + + /** + * Checks if the given term is a meta word + * + * @param string $term to check + * @return bool meta term or not + */ + public static function checkMetaTerm($term) + { + $check_meta = false; + foreach (self::$meta_words_list as $meta) { + $meta_word = str_replace(':', '3A', $meta); + if (str_starts_with($term, $meta_word)) { + $check_meta = true; + break; + } + } + return $check_meta; + } } diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php index 96e560469..c58a9d757 100644 --- a/src/library/index_bundle_iterators/GroupIterator.php +++ b/src/library/index_bundle_iterators/GroupIterator.php @@ -208,7 +208,12 @@ class GroupIterator extends IndexBundleIterator $pages = -1; } } else if (!empty($new_pages)) { - $pages = array_merge($pages, $new_pages); + if (count($new_pages) == 1) { + $pages = array_merge($pages, $new_pages); + } else { + $pages = $new_pages; + $done = true; + } $count = count($pages); } if (isset($this->index_bundle_iterator->hard_query)) { diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index 92f21cbb8..e553da3d5 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -545,4 +545,17 @@ class IntersectIterator extends IndexBundleIterator an intersect iterator", E_USER_ERROR); } } + /** + * Returns the sum of maxScores for nested WordIterators + * + * @return int maxScore + */ + public function getMaxScore() + { + $maxScore = 0; + foreach ($this->index_bundle_iterators as $iterator) { + $maxScore += $iterator->getMaxScore(); + } + return $maxScore; + } } diff --git a/src/library/index_bundle_iterators/UnionIterator.php 
b/src/library/index_bundle_iterators/UnionIterator.php index 071ac78fd..f48518c2c 100644 --- a/src/library/index_bundle_iterators/UnionIterator.php +++ b/src/library/index_bundle_iterators/UnionIterator.php @@ -77,6 +77,45 @@ class UnionIterator extends IndexBundleIterator * @var int */ public $total_num_docs; + /** + * Heap of query terms whose scores are considered while finding results + * @var array + */ + public $terms_heap; + /** + * Heap of query terms whose scores are not considered while finding results + * @var array + */ + public $low_scoring_terms; + /** + * Heap of result documents + * @var array + */ + public $results_heap; + /** + * Heap constant to track the next occurrence of the term on a constituent + * iterator + */ + const NEXT_DOC = 'NEXT_DOC'; + /** + * Heap constant to track the index of a constituent iterator on + * $index_bundle_iterators + */ + const ITERATOR = 'ITERATOR'; + /** + * Heap constant to track the MaxScore of the term on a constituent + * iterator + */ + const MAX_SCORE = 'MAX_SCORE'; + /** + * Heap constant to track the doc fetched by a constituent iterator + */ + const DOC = 'DOC'; + /** + * Heap constant to track the score of a doc fetched by a constituent + * iterator + */ + const DOC_SCORE = 'DOC_SCORE'; /** * Creates a union iterator with the given parameters.
* @@ -101,6 +140,11 @@ class UnionIterator extends IndexBundleIterator $this->seen_docs_unfiltered = 0; $this->index_name = $index_name; $this->total_num_docs = $total_num_docs; + $this->low_scoring_terms = []; + for ($i = 0; $i < self::RESULTS_PER_BLOCK; $i++) { + $this->results_heap[$i][self::DOC_SCORE] = 0; + } + $this->initializeTermsHeap($this->terms_heap); for ($i = 0; $i < $this->num_iterators; $i++) { $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; /* @@ -147,6 +191,31 @@ class UnionIterator extends IndexBundleIterator $this->seen_docs_unfiltered = 0; $doc_block = $this->currentDocsWithWord(); } + /** + * Calculates the total relevance score of the result document + * + * @param array $heap of terms + * @param int $relevance_score previously calculated relevance score + * @return array + */ + public function getDocScore($heap, $relevance_score = 0) + { + $d = $this->currentGenDocOffsetWithWord(); + $doc = []; + while ($d != -1 && !empty($heap) && $heap[0][self::NEXT_DOC] === $d) { + $iterator_idx = $heap[0][self::ITERATOR]; + $iterator = $this->index_bundle_iterators[$iterator_idx]; + $docs = $iterator->findDocsWithWord(); + if (is_array($docs) && count($docs) == 1) { + $keys = array_keys($docs); + $doc = $docs[$keys[0]]; + $relevance_score += $doc[self::RELEVANCE]; + } + array_splice($heap, 0, 1); + $this->heapifyDown($heap, true); + } + return [$doc, $relevance_score]; + } /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached @@ -156,170 +225,141 @@ class UnionIterator extends IndexBundleIterator public function findDocsWithWord() { $pages = []; - $docs = []; $found_docs = false; - $results_heap = []; - $k_least_score = ['LEAST_SCORE' => 0, 'INDEX' => 0]; - $query_terms = $this->getQueryTerms(); - for ($i = 0; $i < $this->num_iterators; $i++) { - $docs = $this->index_bundle_iterators[$i]->currentDocsWithWord(); - if (is_array($docs)) { - /* - Iterate over all the documents fetched and add 
a doc to the - results' max heap only if the heap is not full / the - relevance score of the doc is greater than the current kth-best - score - */ - foreach ($docs as $doc_key => $doc) { - $doc["ITERATOR"] = $i; - $this->key_iterator_table[$doc_key] = $i; - $score = $doc[self::RELEVANCE]; - $full_heap = - (count($results_heap) == $this->results_per_block); - if ($full_heap && $score <= $k_least_score['LEAST_SCORE']) { - continue; - } else { - $next_page_index = $full_heap ? - $k_least_score['INDEX'] : - count($results_heap); - $results_heap[$next_page_index]['SCORE'] = $score; - $results_heap[$next_page_index]['DOC'] = $doc; - $this->heapifyUp($results_heap, $next_page_index); - } - /* - If the heap is full after inserting the new doc, - recompute the minimum score in the heap (which will be - replaced with the next doc that has to be inserted) - */ - if ($full_heap) { - $min_score = min($results_heap); - $k_least_score = ['LEAST_SCORE' => $min_score, - 'INDEX' => array_search($min_score, $results_heap)]; - } - } - /* - Drop query terms whose maxScores are lower than the current - kth-best score, where k is the max number of results that - can be returned - */ - if (count($results_heap) == $this->results_per_block) { - $this->compareByMaxScore($query_terms, - $k_least_score['LEAST_SCORE']); - } - $found_docs = true; - } + list($doc, $relevance_score) = $this->getDocScore($this->terms_heap); + if (!empty($this->low_scoring_terms)) { + // getDocScore() adds onto the score passed in, so take its total + list($doc_copy, $relevance_score) = $this->getDocScore( + $this->low_scoring_terms, $relevance_score); + } + + if (!empty($doc) && $relevance_score > + $this->results_heap[0][self::DOC_SCORE]) { + // Update the document's scores + $doc[self::RELEVANCE] = $relevance_score; + $score = $relevance_score + $doc[self::DOC_RANK]; + $doc[self::SCORE] = $score; + $found_docs = true; + $this->results_heap[0][self::DOC] = $doc; + $this->results_heap[0][self::DOC_SCORE] = $score; +
$this->heapifyDown($this->results_heap, false); } - if ($found_docs == false) { - $this->pages = $docs; - return $docs; + $found_top_results = $this->results_heap[0][self::DOC_SCORE] > 0; + if (!$found_docs || $found_top_results) { + $pages = ($this-> + results_heap[self::RESULTS_PER_BLOCK-1][self::DOC_SCORE] == 0) ? + -1 : $this->getResultsHeap(); } else { - // Get the top k result documents from the max heap - while (!empty($results_heap)) { - $pages[] = $this->extractMaxScoringDoc($results_heap)['DOC']; - } + $pages = [$doc]; } - $this->count_block_unfiltered = count($pages); $this->pages = $pages; - $this->count_block = count($pages); + if (is_array($pages)) { + $this->count_block_unfiltered = count($pages); + $this->count_block = count($pages); + } return $pages; } /** - * Compares each of the query terms' maxScores with the current - * least score in the max heap of result documents (i.e., the current - * kth-best score). If the term's maxScore is <= the current least score - * in the top k results, remove the word iterator associated with that - * term, as it will never make it to the top k documents. 
+ * Gets the docs in the results min heap sorted in + * descending order by score * - * @param array $query_terms on this union iterator - * @param int $least_score current kth-best score + * @return mixed array of result docs if any, -1 otherwise */ - public function compareByMaxScore(&$query_terms, $least_score) + public function getResultsHeap() { - foreach ($query_terms as $query_term => $term_info) { - if ($term_info['MAX_SCORE'] > 0 && - $term_info['MAX_SCORE'] <= $least_score) { - $iterator_index = $term_info['ITERATOR']; - $iterator = $this->index_bundle_iterators[$iterator_index]; - if ($iterator instanceof IntersectIterator) { - $word_iterators = $iterator->index_bundle_iterators; - for ($j = 0; $j < count($word_iterators); $j++) { - if ($word_iterators[$j]->word_key == $query_term) { - array_splice($this-> - index_bundle_iterators[$iterator_index], $j, 1); - unset($query_terms[$query_term]); - break; - } - } - } else { - if ($iterator->word_key == $query_term) { - array_splice($this->index_bundle_iterators, - $iterator_index, 1); - unset($query_terms[$query_term]); - } - } + $pages = []; + while (!empty($this->results_heap)) { + $doc = $this->extractMinScoringDoc($this->results_heap); + if ($doc[self::DOC_SCORE] > 0) { + array_unshift($pages, $doc[self::DOC]); } } + // Re-initialize the results heap for the next set of docs + $this->results_heap = []; + for ($i = 0; $i < self::RESULTS_PER_BLOCK; $i++) { + $this->results_heap[$i][self::DOC_SCORE] = 0; + } + if (empty($pages)) { + $pages = -1; + } + return $pages; } /** - * Gets the top-scoring document in the max heap of result documents. 
+ * Compare between elements for heapify operations * - * @param array $heap of result docs - * @return object top-scoring document + * @param array $i first element + * @param array $j second element + * @param boolean $is_terms_heap basis for comparison + * @return boolean result of comparison */ - public function extractMaxScoringDoc(&$heap) + public function compareElements($i, $j, $is_terms_heap) { - $top_doc = $heap[0]; - $last_index = count($heap) - 1; - $heap[0] = $heap[$last_index]; - unset($heap[$last_index]); - $this->heapifyDown($heap, 0); - return $top_doc; + $is_ascending = $this->getDirection(); + if (!$is_terms_heap) { + return $i[self::DOC_SCORE] > $j[self::DOC_SCORE]; + } + if ($is_ascending) { + if ($i[self::NEXT_DOC] == -1) { + return true; + } else if ($j[self::NEXT_DOC] == -1) { + return false; + } + return $i[self::NEXT_DOC][0] > $j[self::NEXT_DOC][0] || + ($i[self::NEXT_DOC][0] == $j[self::NEXT_DOC][0] && + $i[self::NEXT_DOC][1] > $j[self::NEXT_DOC][1]); + } else { + return $j[self::NEXT_DOC][0] > $i[self::NEXT_DOC][0] || + ($j[self::NEXT_DOC][0] == $i[self::NEXT_DOC][0] && + $j[self::NEXT_DOC][1] > $i[self::NEXT_DOC][1]); + } } /** - * Reheaps the given heap using bubble down operations (after extracting - * the root document from the heap). 
+ * Performs reheap using bubble-down operation * - * @param array $heap of result docs - * @param int $index to begin heapifyDown operation + * @param array $heap to be reheaped + * @param boolean $is_terms_heap to check comparison condition */ - public function heapifyDown(&$heap, $index) + public function heapifyDown(&$heap, $is_terms_heap) { + $index = 0; $heap_size = count($heap); while ($index < $heap_size) { $left = $index * 2 + 1; $right = $index * 2 + 2; - $top_doc = $index; - if ($left < $heap_size && $heap[$left] > $heap[$top_doc]) { - $top_doc = $left; + $least_doc = $index; + if ($left < $heap_size && + $this->compareElements($heap[$least_doc], $heap[$left], + $is_terms_heap)) { + $least_doc = $left; } - if ($right < $heap_size && $heap[$right] > $heap[$top_doc]) { - $top_doc = $right; + if ($right < $heap_size && + $this->compareElements($heap[$least_doc], $heap[$right], + $is_terms_heap)) { + $least_doc = $right; } - if ($top_doc != $index) { - $temp_doc = $heap[$top_doc]; - $heap[$top_doc] = $heap[$index]; + if ($least_doc != $index) { + $temp_doc = $heap[$least_doc]; + $heap[$least_doc] = $heap[$index]; $heap[$index] = $temp_doc; - $index = $top_doc; + $index = $least_doc; } else { break; } } } /** - * Reheaps the given heap using bubble up operations (after inserting a new - * document into the heap). 
+ * Performs reheap using bubble-up operation * - * @param array $heap of result docs - * @param int $index to begin heapifyUp operation + * @param array $heap to be reheaped + * @param boolean $is_terms_heap to check comparison condition */ - public function heapifyUp(&$heap, $index) + public function heapifyUp(&$heap, $is_terms_heap) { - if ($index == 0) { - return; - } + $index = count($heap) - 1; while ($index > 0) { $parent_index = floor(($index - 1) / 2); - if ($heap[$parent_index] >= $heap[$index]) { + if ($this->compareElements($heap[$index], $heap[$parent_index], + $is_terms_heap)) { break; } $temp_doc = $heap[$parent_index]; @@ -328,46 +368,44 @@ class UnionIterator extends IndexBundleIterator $index = $parent_index; } } - /** - * This method fetches all the query terms associated with the nested - * word iterators on the current union iterator instance. + * Gets the lowest-scoring document in the min heap of result documents. + * + * @param array $heap of result docs + * @return object lowest-scoring document + */ + public function extractMinScoringDoc(&$heap) + { + $lowest_doc = $heap[0]; + $last_index = count($heap) - 1; + $heap[0] = $heap[$last_index]; + unset($heap[$last_index]); + $this->heapifyDown($heap, false); + return $lowest_doc; + } + /** + * This method creates a heap out of all the query terms + * associated with the nested word iterators on the current + * union iterator instance. 
* - * @return array of query terms + * @param array $terms heap */ - public function getQueryTerms() + public function initializeTermsHeap(&$terms) { - static $query_terms = []; - if (!empty($query_terms)) { - return $query_terms; + if (!empty($terms)) { + return; } for ($i = 0; $i < $this->num_iterators; $i++) { $iterator = $this->index_bundle_iterators[$i]; - if ($iterator instanceof IntersectIterator) { - $word_iterators = $iterator->index_bundle_iterators; - } else { - $word_iterators = [$iterator]; - } - foreach ($word_iterators as $word_iterator) { - if (property_exists($word_iterator, 'word_key')) { - $word_key = $word_iterator->word_key; - $check_meta = false; - foreach (PhraseParser::$meta_words_list as $meta) { - $meta_word = str_replace(':', '3A', $meta); - if (str_starts_with($word_key, $meta_word)) { - $check_meta = true; - break; - } - } - if (!$check_meta) { - $max_score = $word_iterator->getMaxScore(); - $query_terms[$word_key] = ['ITERATOR' => $i, - 'MAX_SCORE' => $max_score]; - } - } - } + $max_score = $iterator->getMaxScore(); + $position = $iterator->currentGenDocOffsetWithWord(); + $terms[] = [ + self::ITERATOR => $i, + self::MAX_SCORE => $max_score, + self::NEXT_DOC => $position + ]; + $this->heapifyUp($terms, true); } - return $query_terms; } /** * Forwards the iterator one group of docs @@ -381,9 +419,35 @@ class UnionIterator extends IndexBundleIterator $this->advanceSeenDocs(); $this->seen_docs_unfiltered += $this->count_block_unfiltered; $total_num_docs = 0; - for ($i = 0; $i < $this->num_iterators; $i++) { - $total_num_docs += $this->index_bundle_iterators[$i]->num_docs; - $this->index_bundle_iterators[$i]->advance($gen_doc_offset); + $d = $this->currentGenDocOffsetWithWord(); + $score_k = $this->results_heap[0][self::DOC_SCORE]; + while ($d != -1 && $this->terms_heap[0][self::NEXT_DOC] === $d) { + $iterator_idx = $this->terms_heap[0][self::ITERATOR]; + $iterator = $this->index_bundle_iterators[$iterator_idx]; + $total_num_docs += 
$iterator->num_docs; + $iterator->advance($gen_doc_offset); + $next_doc = $iterator->currentGenDocOffsetWithWord(); + $this->terms_heap[0][self::NEXT_DOC] = $next_doc; + if ($score_k > $this->terms_heap[0][self::MAX_SCORE]) { + $this->low_scoring_terms[] = $this->terms_heap[0]; + $this->heapifyUp($this->low_scoring_terms, true); + array_splice($this->terms_heap, 0, 1); + } + $this->heapifyDown($this->terms_heap, true); + } + $d = []; + $d[self::NEXT_DOC] = $this->currentGenDocOffsetWithWord(); + if (!empty($this->low_scoring_terms)) { + while ($d[self::NEXT_DOC] != -1 && + $this->compareElements($d, $this->low_scoring_terms[0], true)) { + $lowest_doc = $this->low_scoring_terms[0]; + $iterator_idx = $lowest_doc[self::ITERATOR]; + $iterator = $this->index_bundle_iterators[$iterator_idx]; + $iterator->advance($gen_doc_offset); + $next_doc = $iterator->currentGenDocOffsetWithWord(); + $this->low_scoring_terms[0][self::NEXT_DOC] = $next_doc; + $this->heapifyDown($this->low_scoring_terms, true); + } } if ($this->seen_docs_unfiltered > 0) { $this->num_docs = @@ -413,16 +477,13 @@ class UnionIterator extends IndexBundleIterator /** * This method is supposed to get the doc_offset and generation * for the next document that would be return by - * this iterator. As the union iterator as written returns a block - * of size at least the number of iterators in it, and this iterator - * is intended to be used when results_per_block is 1, we generate - * a user defined error. + * this iterator. * - * @return mixed the desired document offset and generation (actually, - * triggers error). + * @return mixed the desired document offset and generation. */ public function currentGenDocOffsetWithWord() { - trigger_error("Cannot get the doc offset and generation with word of - a union iterator", E_USER_ERROR); + return !empty($this->terms_heap) ? + (key_exists(self::NEXT_DOC, $this->terms_heap[0]) ? 
+ $this->terms_heap[0][self::NEXT_DOC] : -1) : -1; } } diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index a3051bf20..3592c8250 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -242,10 +242,10 @@ class WordIterator extends IndexBundleIterator //get rid of our modified base64 encoding $word_key = L\unbase64Hash($word_key); } - $this->is_meta = (strpos(substr($word_key, 9), ":") !== false); $this->direction = $direction; $this->filter = $filter; $this->word_key = $word_key; + $this->is_meta = L\PhraseParser::checkMetaTerm($this->word_key); $this->base64_word_key = L\base64Hash($word_key); $this->index_name = $index_name; $this->termInfoIteratorFields($index_name, $word_key); @@ -570,83 +570,76 @@ class WordIterator extends IndexBundleIterator substr($entry, $docid_len); if ($this->retrieve_latest && $entry[$docid_len] == 't') { $url_hash = substr($doc_key, 0, 8); - $model_for_url_hash_lookup = new ParallelModel(); - $page_versions = $model_for_url_hash_lookup-> - lookupSummaryOffsetGeneration(L\base64Hash($url_hash), - $this->index_name, false, true); - if (key_exists('ROWS', $page_versions) && - count($page_versions['ROWS']) > 0) { - $latest_postings_info = - end($page_versions['ROWS'])['POSTINGS']; - $latest_partition = - end($page_versions['ROWS'])['PARTITION']; - if (is_array($latest_postings_info) && - count($latest_postings_info) > 0) { - $latest_posting = end($latest_postings_info); - /** - * Ensure that the discovered latest version - * isn't the same as the current posting. - */ - if ($partition != $latest_partition || - $latest_posting['DOC_MAP_INDEX'] != - $doc_map_index) { - $latest_base_folder = $index-> - getPartitionBaseFolder($latest_partition); - $latest_doc_map_filename = $latest_base_folder . - "/" . 
IndexDocumentBundle::DOC_MAP_FILENAME; - $latest_doc_map_index = - $latest_posting['DOC_MAP_INDEX']; - $latest_doc_map_entry = - $doc_map_tools->findEntryAtIndexTableName( - $latest_doc_map_filename,$latest_doc_map_index); - if (strlen($latest_doc_map_entry) < $docid_len) { - continue; - } - $latest_doc_key = substr($latest_doc_map_entry, 0, - $docid_len); - $terms_filter = substr($latest_doc_map_entry, - $docid_len + 1, $termsfilter_len); - if (!$this->checkTermExists($this->word_key, - $terms_filter)) { - continue; - } else { - /** - * The current term id exists in the most recent - * version of the document; replace the current - * posting entries with the latest entry. - */ - $posting[self::GENERATION] = $latest_partition; - $posting['DOC_MAP_INDEX'] = - $latest_doc_map_index; - $doc_key = $latest_doc_key; - $values = substr($latest_doc_map_entry, - $docid_len + $termsfilter_len + 1); - $latest_term_postings = $this-> - getGenerationPostings($latest_partition); - $target_posting = - array_filter($latest_term_postings, + $latest_version_info = + IndexManager::lookupLatestVersionPage($url_hash, + $this->index_name); + if ($latest_version_info != null) { + $latest_partition = $latest_version_info[0]; + $latest_posting = $latest_version_info[1]; + /** + * Ensure that the discovered latest version + * isn't the same as the current posting. + */ + if ($partition != $latest_partition || + $latest_posting['DOC_MAP_INDEX'] != + $doc_map_index) { + $latest_base_folder = $index-> + getPartitionBaseFolder($latest_partition); + $latest_doc_map_filename = $latest_base_folder . + "/" . 
IndexDocumentBundle::DOC_MAP_FILENAME; + $latest_doc_map_index = + $latest_posting['DOC_MAP_INDEX']; + $latest_doc_map_entry = + $doc_map_tools->findEntryAtIndexTableName( + $latest_doc_map_filename, + $latest_doc_map_index); + if (strlen($latest_doc_map_entry) < $docid_len) { + continue; + } + $latest_doc_key = substr($latest_doc_map_entry, 0, + $docid_len); + $terms_filter = substr($latest_doc_map_entry, + $docid_len + 1, $termsfilter_len); + if (!$this->checkTermExists($this->word_key, + $terms_filter)) { + continue; + } else { + /** + * The current term id exists in the most recent + * version of the document; replace the current + * posting entries with the latest entry. + */ + $posting[self::GENERATION] = $latest_partition; + $posting['DOC_MAP_INDEX'] = + $latest_doc_map_index; + $doc_key = $latest_doc_key; + $values = substr($latest_doc_map_entry, + $docid_len + $termsfilter_len + 1); + $latest_term_postings = $this-> + getGenerationPostings($latest_partition); + $target_posting = + array_filter($latest_term_postings, function ($p) use ($latest_doc_map_index) { return $p['DOC_MAP_INDEX'] == $latest_doc_map_index; }); - if (count($target_posting) > 0) { - $posting['POSITIONS_LEN'] = - $target_posting[0]['POSITIONS_LEN']; - $posting['POSITIONS_OFFSET'] = - $target_posting[0]['POSITIONS_OFFSET']; - $posting['FREQUENCY'] = - $target_posting[0]['FREQUENCY']; - $latest_base_folder = $index-> - getPartitionBaseFolder($partition); - list($latest_positions_fh, + if (count($target_posting) > 0) { + // array_filter preserves keys; positions are in latest partition + $match = reset($target_posting); + $posting['POSITIONS_LEN'] = $match['POSITIONS_LEN']; + $posting['POSITIONS_OFFSET'] = + $match['POSITIONS_OFFSET']; + $posting['FREQUENCY'] = $match['FREQUENCY']; + $latest_base_folder = $index-> + getPartitionBaseFolder($latest_partition); + list($latest_positions_fh, $latest_positions_file_size) = $this-> - getPositionsFile($latest_base_folder); - $posting[self::POSITION_LIST] = - $this->getPositionsList($posting, -
$latest_positions_file_size, - $latest_positions_fh); - } + getPositionsFile($latest_base_folder); + $posting[self::POSITION_LIST] = + $this->getPositionsList($posting, + $latest_positions_file_size, + $latest_positions_fh); } } } @@ -798,6 +791,9 @@ class WordIterator extends IndexBundleIterator */ public function getMaxScore() { + if ($this->is_meta) { + return 0.01; + } $max_score = $this->getMaxDocQualityScore() + $this->getMaxRelevanceScore(); return $max_score;