Changes to heap logic in UnionIterator, fix for ArcTool, url caching for latest version lookup

Gargi Sheguri [2023-11-30 17:Nov:th]

Changes to heap logic in UnionIterator, fix for ArcTool, url caching for latest version lookup

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
src/executables/ArcTool.php
src/library/IndexManager.php
src/library/PhraseParser.php
src/library/index_bundle_iterators/GroupIterator.php
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/UnionIterator.php
src/library/index_bundle_iterators/WordIterator.php

diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 22824ca2f..c52fcab88 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -314,8 +314,13 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants
         $doc_map_tools = $index->doc_map_tools;
         $entry = $doc_map_tools->findEntryAtIndexTableName($doc_map_filename,
             $doc_map_index);
-        $doc_key = substr($entry, 0, IndexDocumentBundle::DOCID_LEN);
-        $entry = substr($entry, IndexDocumentBundle::DOCID_LEN);
+        $docid_len = IndexDocumentBundle::DOCID_LEN;
+        $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN;
+        $doc_key = substr($entry, 0, $docid_len);
+        $entry = (strlen($entry) >= ($docid_len + $termsfilter_len + 1) &&
+            $entry[$docid_len] == 't') ?
+            substr($entry, $docid_len + $termsfilter_len + 1) :
+            substr($entry, $docid_len);
         $doc_map_tools = $index->doc_map_tools;
         echo "Doc Key: " . L\toHexString($doc_key) . "\n";
         echo "Partition: $partition\n";
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index e2a920a92..da7cd7f41 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -31,6 +31,8 @@
 namespace seekquarry\yioop\library;

 use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\models\ParallelModel;

 /**
  * For crawlHash
@@ -56,10 +58,19 @@ class IndexManager implements CrawlConstants
      * @var array
      */
     public static $index_times = [];
+    /**
+     * Cached lookupLatestVersionPage results, each [partition, posting]
+     * @var array
+     */
+    public static $urls_cache = [];
     /**
      * Max number of IndexArchiveBundles that can be cached
      */
     const INDEX_CACHE_SIZE = 1000;
+    /**
+     * Max number of URLs to be cached for most recent version of a page lookup
+     */
+    const URLS_CACHE_SIZE = 1000;
     /**
      * Returns a reference to the managed copy of an IndexArchiveBundle object
      * with a given timestamp or feed (for handling media feeds)
@@ -367,4 +378,42 @@ class IndexManager implements CrawlConstants
         $num_docs_cache[$index_name][$term] = $total;
         return $total;
     }
+
+    /**
+     * Finds posting info related to the most recent version
+     * of a URL in the given index; successful lookups are cached
+     *
+     * @param string $url_hash hash of the URL to be looked up
+     * @param string $index_name current index
+     * @return array|null [partition, latest posting], null if not found
+     */
+    public static function lookupLatestVersionPage($url_hash, $index_name)
+    {
+        // key on the index as well: the lookup below depends on $index_name
+        $cache_key = $index_name . ":" . $url_hash;
+        if (array_key_exists($cache_key, self::$urls_cache)) {
+            return self::$urls_cache[$cache_key];
+        }
+        $model_for_url_hash_lookup = new ParallelModel();
+        $page_versions = $model_for_url_hash_lookup->
+            lookupSummaryOffsetGeneration(L\base64Hash($url_hash),
+            $index_name, false, true);
+        if (key_exists('ROWS', $page_versions) &&
+            count($page_versions['ROWS']) > 0) {
+            $latest_row = end($page_versions['ROWS']);
+            $latest_postings_info = $latest_row['POSTINGS'];
+            $latest_partition = $latest_row['PARTITION'];
+            if (is_array($latest_postings_info) &&
+                count($latest_postings_info) > 0) {
+                $latest_posting = end($latest_postings_info);
+                if (count(self::$urls_cache) >= self::URLS_CACHE_SIZE) {
+                    self::$urls_cache = [];
+                }
+                self::$urls_cache[$cache_key] = [$latest_partition,
+                    $latest_posting];
+                return self::$urls_cache[$cache_key];
+            }
+        }
+        return null;
+    }
 }
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 3334baf7a..ff9444def 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -441,7 +441,7 @@ class PhraseParser

                 $term_parts = explode("-", $term ?? "");
                 array_shift($term_parts);
-                foreach($term_parts as $part) {
+                foreach ($term_parts as $part) {
                     $pos_lists[$part][] = $t;
                 }
                 $t++;
@@ -1584,4 +1584,23 @@ class PhraseParser
         }
         return $result;
     }
+
+    /**
+     * Tests whether a term begins with one of the known meta word prefixes
+     * (each prefix is compared with its ':' replaced by '3A')
+     *
+     * @param string $term term to test
+     * @return bool true if $term starts with a meta word prefix, else false
+     */
+    public static function checkMetaTerm($term)
+    {
+        foreach (self::$meta_words_list as $meta_prefix) {
+            $encoded_prefix = str_replace(':', '3A', $meta_prefix);
+            if (str_starts_with($term, $encoded_prefix)) {
+                // first matching prefix settles the answer
+                return true;
+            }
+        }
+        return false;
+    }
 }
diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php
index 96e560469..c58a9d757 100644
--- a/src/library/index_bundle_iterators/GroupIterator.php
+++ b/src/library/index_bundle_iterators/GroupIterator.php
@@ -208,7 +208,12 @@ class GroupIterator extends IndexBundleIterator
                     $pages = -1;
                 }
             } else if (!empty($new_pages)) {
-                $pages = array_merge($pages, $new_pages);
+                if (count($new_pages) == 1) {
+                    $pages = array_merge($pages, $new_pages);
+                } else {
+                    $pages = $new_pages;
+                    $done = true;
+                }
                 $count = count($pages);
             }
             if (isset($this->index_bundle_iterator->hard_query)) {
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index 92f21cbb8..e553da3d5 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -545,4 +545,17 @@ class IntersectIterator extends IndexBundleIterator
                 an intersect iterator", E_USER_ERROR);
         }
      }
+     /**
+      * Totals the maxScores reported by this iterator's constituent iterators
+      *
+      * @return int sum of the constituents' maxScores (0 if there are none)
+      */
+    public function getMaxScore()
+    {
+        // fold over the constituents, seeding the running total with 0
+        return array_reduce($this->index_bundle_iterators,
+            function ($running_total, $constituent) {
+                return $running_total + $constituent->getMaxScore();
+            }, 0);
+    }
 }
diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php
index 071ac78fd..f48518c2c 100644
--- a/src/library/index_bundle_iterators/UnionIterator.php
+++ b/src/library/index_bundle_iterators/UnionIterator.php
@@ -77,6 +77,45 @@ class UnionIterator extends IndexBundleIterator
      * @var int
      */
     public $total_num_docs;
+    /**
+     * Heap of query terms whose scores are considered while finding results
+     * @var array
+     */
+    public $terms_heap;
+    /**
+     * Heap of query terms whose scores are not considered while finding results
+     * @var array
+     */
+    public $low_scoring_terms;
+    /**
+     * Heap of result documents
+     * @var array
+     */
+    public $results_heap;
+    /**
+     * Heap constant to track the next occurrence of the term on a constituent
+     * iterator
+     */
+    const NEXT_DOC = 'NEXT_DOC';
+    /**
+     * Heap constant to track the index of a constituent iterator on
+     * $index_bundle_iterators
+     */
+    const ITERATOR = 'ITERATOR';
+    /**
+     * Heap constant to track the MaxScore of the term on a constituent
+     * iterator
+     */
+    const MAX_SCORE = 'MAX_SCORE';
+    /**
+     * Heap constant to track the doc fetched by a constituent iterator
+     */
+    const DOC = 'DOC';
+    /**
+     * Heap constant to track the score of a doc fetched by a constituent
+     * iterator
+     */
+    const DOC_SCORE = 'DOC_SCORE';
     /**
      * Creates a union iterator with the given parameters.
      *
@@ -101,6 +140,11 @@ class UnionIterator extends IndexBundleIterator
         $this->seen_docs_unfiltered = 0;
         $this->index_name = $index_name;
         $this->total_num_docs = $total_num_docs;
+        $this->low_scoring_terms = [];
+        for ($i = 0; $i < self::RESULTS_PER_BLOCK; $i++) {
+            $this->results_heap[$i][self::DOC_SCORE] = 0;
+        }
+        $this->initializeTermsHeap($this->terms_heap);
         for ($i = 0; $i < $this->num_iterators; $i++) {
             $this->num_docs += $this->index_bundle_iterators[$i]->num_docs;
             /*
@@ -147,6 +191,31 @@ class UnionIterator extends IndexBundleIterator
         $this->seen_docs_unfiltered = 0;
         $doc_block = $this->currentDocsWithWord();
     }
+    /**
+     * Totals the current doc's relevance over the $heap terms that are on it
+     *
+     * @param array $heap of terms (a copy is consumed, caller's is untouched)
+     * @param int $relevance_score previously calculated relevance score
+     * @return array [last matching doc found ([] if none), total relevance]
+     */
+    public function getDocScore($heap, $relevance_score = 0)
+    {
+        $d = $this->currentGenDocOffsetWithWord();
+        $doc = [];
+        while ($d != -1 && !empty($heap) && $heap[0][self::NEXT_DOC] === $d) {
+            $iterator = $this->index_bundle_iterators[$heap[0][self::ITERATOR]];
+            $docs = $iterator->findDocsWithWord();
+            if (is_array($docs) && count($docs) == 1) {
+                $doc = reset($docs);
+                $relevance_score += $doc[self::RELEVANCE];
+            }
+            // remove the root by moving the last entry into its place;
+            // shifting the whole array would break the parent/child layout
+            $heap[0] = end($heap);
+            array_pop($heap);
+            $this->heapifyDown($heap, true);
+        }
+        return [$doc, $relevance_score];
+    }
     /**
      * Hook function used by currentDocsWithWord to return the current block
      * of docs if it is not cached
@@ -156,170 +225,141 @@ class UnionIterator extends IndexBundleIterator
     public function findDocsWithWord()
     {
         $pages = [];
-        $docs = [];
         $found_docs = false;
-        $results_heap = [];
-        $k_least_score = ['LEAST_SCORE' => 0, 'INDEX' => 0];
-        $query_terms = $this->getQueryTerms();
-        for ($i = 0; $i < $this->num_iterators; $i++) {
-            $docs =  $this->index_bundle_iterators[$i]->currentDocsWithWord();
-            if (is_array($docs)) {
-                /*
-                 Iterate over all the documents fetched and add a doc to the
-                 results' max heap only if the heap is not full / the
-                 relevance score of the doc is greater than the current kth-best
-                 score
-                 */
-                foreach ($docs as $doc_key => $doc) {
-                    $doc["ITERATOR"] = $i;
-                    $this->key_iterator_table[$doc_key] = $i;
-                    $score = $doc[self::RELEVANCE];
-                    $full_heap =
-                        (count($results_heap) == $this->results_per_block);
-                    if ($full_heap && $score <= $k_least_score['LEAST_SCORE']) {
-                        continue;
-                    } else {
-                        $next_page_index = $full_heap ?
-                            $k_least_score['INDEX'] :
-                            count($results_heap);
-                        $results_heap[$next_page_index]['SCORE'] = $score;
-                        $results_heap[$next_page_index]['DOC'] = $doc;
-                        $this->heapifyUp($results_heap, $next_page_index);
-                    }
-                    /*
-                     If the heap is full after inserting the new doc,
-                     recompute the minimum score in the heap (which will be
-                     replaced with the next doc that has to be inserted)
-                     */
-                    if ($full_heap) {
-                        $min_score = min($results_heap);
-                        $k_least_score = ['LEAST_SCORE' => $min_score,
-                            'INDEX' => array_search($min_score, $results_heap)];
-                    }
-                }
-                /*
-                 Drop query terms whose maxScores are lower than the current
-                 kth-best score, where k is the max number of results that
-                 can be returned
-                 */
-                if (count($results_heap) == $this->results_per_block) {
-                    $this->compareByMaxScore($query_terms,
-                        $k_least_score['LEAST_SCORE']);
-                }
-                $found_docs = true;
-            }
+        list($doc, $relevance_score) = $this->getDocScore($this->terms_heap);
+        if (!empty($this->low_scoring_terms)) {
+            // getDocScore returns the running total it was seeded with, so
+            // assign it rather than add it (adding would double count)
+            list($doc_copy, $relevance_score) =
+                $this->getDocScore($this->low_scoring_terms, $relevance_score);
+
+        if (!empty($doc) && $relevance_score >
+            $this->results_heap[0][self::DOC_SCORE]) {
+            // Update the document's scores
+            $doc[self::RELEVANCE] = $relevance_score;
+            $score = $relevance_score + $doc[self::DOC_RANK];
+            $doc[self::SCORE] = $score;
+            $found_docs = true;
+            $this->results_heap[0][self::DOC] = $doc;
+            $this->results_heap[0][self::DOC_SCORE] = $score;
+            $this->heapifyDown($this->results_heap, false);
         }
-        if ($found_docs == false) {
-            $this->pages = $docs;
-            return $docs;
+        $found_top_results = $this->results_heap[0][self::DOC_SCORE] > 0;
+        if (!$found_docs || $found_top_results) {
+            $pages = ($this->
+                results_heap[self::RESULTS_PER_BLOCK-1][self::DOC_SCORE] == 0) ?
+                -1 : $this->getResultsHeap();
         } else {
-            // Get the top k result documents from the max heap
-            while (!empty($results_heap)) {
-                $pages[] = $this->extractMaxScoringDoc($results_heap)['DOC'];
-            }
+            $pages = [$doc];
         }
-        $this->count_block_unfiltered = count($pages);
         $this->pages = $pages;
-        $this->count_block = count($pages);
+        if (is_array($pages)) {
+            $this->count_block_unfiltered = count($pages);
+            $this->count_block = count($pages);
+        }
         return $pages;
     }
     /**
-     * Compares each of the query terms' maxScores with the current
-     * least score in the max heap of result documents (i.e., the current
-     * kth-best score). If the term's maxScore is <= the current least score
-     * in the top k results, remove the word iterator associated with that
-     * term, as it will never make it to the top k documents.
+     * Gets the docs in the results min heap sorted in
+     * descending order by score
      *
-     * @param array $query_terms on this union iterator
-     * @param int $least_score current kth-best score
+     * @return mixed array of result docs if any, -1 otherwise
      */
-    public function compareByMaxScore(&$query_terms, $least_score)
+    public function getResultsHeap()
     {
-        foreach ($query_terms as $query_term => $term_info) {
-            if ($term_info['MAX_SCORE'] > 0 &&
-                $term_info['MAX_SCORE'] <= $least_score) {
-                $iterator_index = $term_info['ITERATOR'];
-                $iterator = $this->index_bundle_iterators[$iterator_index];
-                if ($iterator instanceof IntersectIterator) {
-                    $word_iterators = $iterator->index_bundle_iterators;
-                    for ($j = 0; $j < count($word_iterators); $j++) {
-                        if ($word_iterators[$j]->word_key == $query_term) {
-                            array_splice($this->
-                            index_bundle_iterators[$iterator_index], $j, 1);
-                            unset($query_terms[$query_term]);
-                            break;
-                        }
-                    }
-                } else {
-                    if ($iterator->word_key == $query_term) {
-                        array_splice($this->index_bundle_iterators,
-                            $iterator_index, 1);
-                        unset($query_terms[$query_term]);
-                    }
-                }
+        $pages = [];
+        while (!empty($this->results_heap)) {
+            $doc = $this->extractMinScoringDoc($this->results_heap);
+            if ($doc[self::DOC_SCORE] > 0) {
+                array_unshift($pages, $doc[self::DOC]);
             }
         }
+        // Re-initialize the results heap for the next set of docs
+        $this->results_heap = [];
+        for ($i = 0; $i < self::RESULTS_PER_BLOCK; $i++) {
+            $this->results_heap[$i][self::DOC_SCORE] = 0;
+        }
+        if (empty($pages)) {
+            $pages = -1;
+        }
+        return $pages;
     }
     /**
-     * Gets the top-scoring document in the max heap of result documents.
+     * Compare between elements for heapify operations
      *
-     * @param array $heap of result docs
-     * @return object top-scoring document
+     * @param array $i first element
+     * @param array $j second element
+     * @param boolean $is_terms_heap basis for comparison
+     * @return boolean result of comparison
      */
-    public function extractMaxScoringDoc(&$heap)
+    public function compareElements($i, $j, $is_terms_heap)
     {
-        $top_doc = $heap[0];
-        $last_index = count($heap) - 1;
-        $heap[0] = $heap[$last_index];
-        unset($heap[$last_index]);
-        $this->heapifyDown($heap, 0);
-        return $top_doc;
+        $is_ascending = $this->getDirection();
+        if (!$is_terms_heap) {
+            return $i[self::DOC_SCORE] > $j[self::DOC_SCORE];
+        }
+        if ($is_ascending) {
+            if ($i[self::NEXT_DOC] == -1) {
+                return true;
+            } else if ($j[self::NEXT_DOC] == -1) {
+                return false;
+            }
+            return $i[self::NEXT_DOC][0] > $j[self::NEXT_DOC][0] ||
+                ($i[self::NEXT_DOC][0] == $j[self::NEXT_DOC][0] &&
+                    $i[self::NEXT_DOC][1] > $j[self::NEXT_DOC][1]);
+        } else {
+            return $j[self::NEXT_DOC][0] > $i[self::NEXT_DOC][0] ||
+                ($j[self::NEXT_DOC][0] == $i[self::NEXT_DOC][0] &&
+                    $j[self::NEXT_DOC][1] > $i[self::NEXT_DOC][1]);
+        }
     }
     /**
-     * Reheaps the given heap using bubble down operations (after extracting
-     * the root document from the heap).
+     * Performs reheap using bubble-down operation
      *
-     * @param array $heap of result docs
-     * @param int $index to begin heapifyDown operation
+     * @param array $heap to be reheaped
+     * @param boolean $is_terms_heap to check comparison condition
      */
-    public function heapifyDown(&$heap, $index)
+    public function heapifyDown(&$heap, $is_terms_heap)
     {
+        $index = 0;
         $heap_size = count($heap);
         while ($index < $heap_size) {
             $left = $index * 2 + 1;
             $right = $index * 2 + 2;
-            $top_doc = $index;
-            if ($left < $heap_size && $heap[$left] > $heap[$top_doc]) {
-                $top_doc = $left;
+            $least_doc = $index;
+            if ($left < $heap_size &&
+                $this->compareElements($heap[$least_doc], $heap[$left],
+                    $is_terms_heap)) {
+                $least_doc = $left;
             }
-            if ($right < $heap_size && $heap[$right] > $heap[$top_doc]) {
-                $top_doc = $right;
+            if ($right < $heap_size &&
+                $this->compareElements($heap[$least_doc], $heap[$right],
+                    $is_terms_heap)) {
+                $least_doc = $right;
             }
-            if ($top_doc != $index) {
-                $temp_doc = $heap[$top_doc];
-                $heap[$top_doc] = $heap[$index];
+            if ($least_doc != $index) {
+                $temp_doc = $heap[$least_doc];
+                $heap[$least_doc] = $heap[$index];
                 $heap[$index] = $temp_doc;
-                $index = $top_doc;
+                $index = $least_doc;
             } else {
                 break;
             }
         }
     }
     /**
-     * Reheaps the given heap using bubble up operations (after inserting a new
-     * document into the heap).
+     * Performs reheap using bubble-up operation
      *
-     * @param array $heap of result docs
-     * @param int $index to begin heapifyUp operation
+     * @param array $heap to be reheaped
+     * @param boolean $is_terms_heap to check comparison condition
      */
-    public function heapifyUp(&$heap, $index)
+    public function heapifyUp(&$heap, $is_terms_heap)
     {
-        if ($index == 0) {
-            return;
-        }
+        $index = count($heap) - 1;
         while ($index > 0) {
             $parent_index = floor(($index - 1) / 2);
-            if ($heap[$parent_index] >= $heap[$index]) {
+            if ($this->compareElements($heap[$index], $heap[$parent_index],
+                $is_terms_heap)) {
                 break;
             }
             $temp_doc = $heap[$parent_index];
@@ -328,46 +368,44 @@ class UnionIterator extends IndexBundleIterator
             $index = $parent_index;
         }
     }
-
     /**
-     * This method fetches all the query terms associated with the nested
-     * word iterators on the current union iterator instance.
+     * Gets the lowest-scoring document in the min heap of result documents.
+     *
+     * @param array $heap of result docs
+     * @return object lowest-scoring document
+     */
+    public function extractMinScoringDoc(&$heap)
+    {
+        $lowest_doc = $heap[0];
+        $last_index = count($heap) - 1;
+        $heap[0] = $heap[$last_index];
+        unset($heap[$last_index]);
+        $this->heapifyDown($heap, false);
+        return $lowest_doc;
+    }
+    /**
+     * This method creates a heap out of all the query terms
+     * associated with the nested word iterators on the current
+     * union iterator instance.
      *
-     * @return array of query terms
+     * @param array $terms heap
      */
-    public function getQueryTerms()
+    public function initializeTermsHeap(&$terms)
     {
-        static $query_terms = [];
-        if (!empty($query_terms)) {
-            return $query_terms;
+        if (!empty($terms)) {
+            return;
         }
         for ($i = 0; $i < $this->num_iterators; $i++) {
             $iterator =  $this->index_bundle_iterators[$i];
-            if ($iterator instanceof IntersectIterator) {
-                $word_iterators = $iterator->index_bundle_iterators;
-            } else {
-                $word_iterators = [$iterator];
-            }
-            foreach ($word_iterators as $word_iterator) {
-                if (property_exists($word_iterator, 'word_key')) {
-                    $word_key = $word_iterator->word_key;
-                    $check_meta = false;
-                    foreach (PhraseParser::$meta_words_list as $meta) {
-                        $meta_word = str_replace(':', '3A', $meta);
-                        if (str_starts_with($word_key, $meta_word)) {
-                            $check_meta = true;
-                            break;
-                        }
-                    }
-                    if (!$check_meta) {
-                        $max_score = $word_iterator->getMaxScore();
-                        $query_terms[$word_key] = ['ITERATOR' => $i,
-                            'MAX_SCORE' => $max_score];
-                    }
-                }
-            }
+            $max_score = $iterator->getMaxScore();
+            $position = $iterator->currentGenDocOffsetWithWord();
+            $terms[] = [
+                self::ITERATOR => $i,
+                self::MAX_SCORE => $max_score,
+                self::NEXT_DOC => $position
+            ];
+            $this->heapifyUp($terms, true);
         }
-        return $query_terms;
     }
     /**
      * Forwards the iterator one group of docs
@@ -381,9 +419,35 @@ class UnionIterator extends IndexBundleIterator
         $this->advanceSeenDocs();
         $this->seen_docs_unfiltered += $this->count_block_unfiltered;
         $total_num_docs = 0;
-        for ($i = 0; $i < $this->num_iterators; $i++) {
-            $total_num_docs += $this->index_bundle_iterators[$i]->num_docs;
-            $this->index_bundle_iterators[$i]->advance($gen_doc_offset);
+        $d = $this->currentGenDocOffsetWithWord();
+        $score_k = $this->results_heap[0][self::DOC_SCORE];
+        while ($d != -1 && $this->terms_heap[0][self::NEXT_DOC] === $d) {
+            $iterator_idx = $this->terms_heap[0][self::ITERATOR];
+            $iterator = $this->index_bundle_iterators[$iterator_idx];
+            $total_num_docs += $iterator->num_docs;
+            $iterator->advance($gen_doc_offset);
+            $next_doc = $iterator->currentGenDocOffsetWithWord();
+            $this->terms_heap[0][self::NEXT_DOC] = $next_doc;
+            if ($score_k > $this->terms_heap[0][self::MAX_SCORE]) {
+                 $this->low_scoring_terms[] = $this->terms_heap[0];
+                 $this->heapifyUp($this->low_scoring_terms, true);
+                 array_splice($this->terms_heap, 0, 1);
+            }
+            $this->heapifyDown($this->terms_heap, true);
+        }
+        $d = [];
+        $d[self::NEXT_DOC] = $this->currentGenDocOffsetWithWord();
+        if (!empty($this->low_scoring_terms)) {
+            while ($d[self::NEXT_DOC] != -1 &&
+                $this->compareElements($d, $this->low_scoring_terms[0], true)) {
+                $lowest_doc = $this->low_scoring_terms[0];
+                $iterator_idx = $lowest_doc[self::ITERATOR];
+                $iterator = $this->index_bundle_iterators[$iterator_idx];
+                $iterator->advance($gen_doc_offset);
+                $next_doc = $iterator->currentGenDocOffsetWithWord();
+                $this->low_scoring_terms[0][self::NEXT_DOC] = $next_doc;
+                $this->heapifyDown($this->low_scoring_terms, true);
+            }
         }
         if ($this->seen_docs_unfiltered > 0) {
             $this->num_docs =
@@ -413,16 +477,13 @@ class UnionIterator extends IndexBundleIterator
     /**
      * This method is supposed to get the doc_offset and generation
      * for the next document that would be return by
-     * this iterator. As the union iterator as written returns a block
-     * of size at least the number of iterators in it, and this iterator
-     * is intended to be used when results_per_block is 1, we generate
-     * a user defined error.
+     * this iterator.
      *
-     * @return mixed the desired document offset and generation (actually,
-     * triggers error).
+     * @return mixed the desired document offset and generation.
      */
     public function currentGenDocOffsetWithWord() {
-        trigger_error("Cannot get the doc offset and generation with word of
-            a union iterator", E_USER_ERROR);
+        return !empty($this->terms_heap) ?
+            (key_exists(self::NEXT_DOC, $this->terms_heap[0]) ?
+                $this->terms_heap[0][self::NEXT_DOC] : -1) : -1;
     }
 }
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index a3051bf20..3592c8250 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -242,10 +242,10 @@ class WordIterator extends IndexBundleIterator
             //get rid of our modified base64 encoding
             $word_key = L\unbase64Hash($word_key);
         }
-        $this->is_meta = (strpos(substr($word_key, 9), ":") !== false);
         $this->direction = $direction;
         $this->filter = $filter;
         $this->word_key = $word_key;
+        $this->is_meta = L\PhraseParser::checkMetaTerm($this->word_key);
         $this->base64_word_key = L\base64Hash($word_key);
         $this->index_name = $index_name;
         $this->termInfoIteratorFields($index_name, $word_key);
@@ -570,83 +570,76 @@ class WordIterator extends IndexBundleIterator
                 substr($entry, $docid_len);
             if ($this->retrieve_latest && $entry[$docid_len] == 't') {
                 $url_hash = substr($doc_key, 0, 8);
-                $model_for_url_hash_lookup = new ParallelModel();
-                $page_versions = $model_for_url_hash_lookup->
-                    lookupSummaryOffsetGeneration(L\base64Hash($url_hash),
-                    $this->index_name, false, true);
-                if (key_exists('ROWS', $page_versions) &&
-                    count($page_versions['ROWS']) > 0) {
-                    $latest_postings_info =
-                        end($page_versions['ROWS'])['POSTINGS'];
-                    $latest_partition =
-                        end($page_versions['ROWS'])['PARTITION'];
-                    if (is_array($latest_postings_info) &&
-                        count($latest_postings_info) > 0) {
-                        $latest_posting = end($latest_postings_info);
-                        /**
-                         * Ensure that the discovered latest version
-                         * isn't the same as the current posting.
-                         */
-                        if ($partition != $latest_partition ||
-                            $latest_posting['DOC_MAP_INDEX'] !=
-                            $doc_map_index) {
-                            $latest_base_folder = $index->
-                                getPartitionBaseFolder($latest_partition);
-                            $latest_doc_map_filename = $latest_base_folder .
-                                "/" . IndexDocumentBundle::DOC_MAP_FILENAME;
-                            $latest_doc_map_index =
-                                $latest_posting['DOC_MAP_INDEX'];
-                            $latest_doc_map_entry =
-                                $doc_map_tools->findEntryAtIndexTableName(
-                                $latest_doc_map_filename,$latest_doc_map_index);
-                            if (strlen($latest_doc_map_entry) < $docid_len) {
-                                continue;
-                            }
-                            $latest_doc_key = substr($latest_doc_map_entry, 0,
-                                $docid_len);
-                            $terms_filter = substr($latest_doc_map_entry,
-                                $docid_len + 1, $termsfilter_len);
-                            if (!$this->checkTermExists($this->word_key,
-                                $terms_filter)) {
-                                continue;
-                            } else {
-                                /**
-                                 * The current term id exists in the most recent
-                                 * version of the document; replace the current
-                                 * posting entries with the latest entry.
-                                 */
-                                $posting[self::GENERATION] = $latest_partition;
-                                $posting['DOC_MAP_INDEX'] =
-                                    $latest_doc_map_index;
-                                $doc_key = $latest_doc_key;
-                                $values = substr($latest_doc_map_entry,
-                                    $docid_len + $termsfilter_len + 1);
-                                $latest_term_postings = $this->
-                                    getGenerationPostings($latest_partition);
-                                $target_posting =
-                                    array_filter($latest_term_postings,
+                $latest_version_info =
+                    IndexManager::lookupLatestVersionPage($url_hash,
+                    $this->index_name);
+                if ($latest_version_info != null) {
+                    $latest_partition = $latest_version_info[0];
+                    $latest_posting = $latest_version_info[1];
+                    /**
+                     * Ensure that the discovered latest version
+                     * isn't the same as the current posting.
+                     */
+                    if ($partition != $latest_partition ||
+                        $latest_posting['DOC_MAP_INDEX'] !=
+                        $doc_map_index) {
+                        $latest_base_folder = $index->
+                        getPartitionBaseFolder($latest_partition);
+                        $latest_doc_map_filename = $latest_base_folder .
+                            "/" . IndexDocumentBundle::DOC_MAP_FILENAME;
+                        $latest_doc_map_index =
+                            $latest_posting['DOC_MAP_INDEX'];
+                        $latest_doc_map_entry =
+                            $doc_map_tools->findEntryAtIndexTableName(
+                                $latest_doc_map_filename,
+                                $latest_doc_map_index);
+                        if (strlen($latest_doc_map_entry) < $docid_len) {
+                            continue;
+                        }
+                        $latest_doc_key = substr($latest_doc_map_entry, 0,
+                            $docid_len);
+                        $terms_filter = substr($latest_doc_map_entry,
+                            $docid_len + 1, $termsfilter_len);
+                        if (!$this->checkTermExists($this->word_key,
+                            $terms_filter)) {
+                            continue;
+                        } else {
+                            /**
+                             * The current term id exists in the most recent
+                             * version of the document; replace the current
+                             * posting entries with the latest entry.
+                             */
+                            $posting[self::GENERATION] = $latest_partition;
+                            $posting['DOC_MAP_INDEX'] =
+                                $latest_doc_map_index;
+                            $doc_key = $latest_doc_key;
+                            $values = substr($latest_doc_map_entry,
+                                $docid_len + $termsfilter_len + 1);
+                            $latest_term_postings = $this->
+                                getGenerationPostings($latest_partition);
+                            $target_posting =
+                                array_filter($latest_term_postings,
                                     function ($p) use ($latest_doc_map_index)
                                     {
                                         return $p['DOC_MAP_INDEX'] ==
                                             $latest_doc_map_index;
                                     });
-                                if (count($target_posting) > 0) {
-                                    $posting['POSITIONS_LEN'] =
-                                        $target_posting[0]['POSITIONS_LEN'];
-                                    $posting['POSITIONS_OFFSET'] =
-                                        $target_posting[0]['POSITIONS_OFFSET'];
-                                    $posting['FREQUENCY'] =
-                                        $target_posting[0]['FREQUENCY'];
-                                    $latest_base_folder = $index->
-                                        getPartitionBaseFolder($partition);
-                                    list($latest_positions_fh,
+                            if (count($target_posting) > 0) {
+                                $posting['POSITIONS_LEN'] =
+                                    $target_posting[0]['POSITIONS_LEN'];
+                                $posting['POSITIONS_OFFSET'] =
+                                    $target_posting[0]['POSITIONS_OFFSET'];
+                                $posting['FREQUENCY'] =
+                                    $target_posting[0]['FREQUENCY'];
+                                $latest_base_folder = $index->
+                                    getPartitionBaseFolder($partition);
+                                list($latest_positions_fh,
                                     $latest_positions_file_size) = $this->
-                                        getPositionsFile($latest_base_folder);
-                                    $posting[self::POSITION_LIST] =
-                                        $this->getPositionsList($posting,
-                                            $latest_positions_file_size,
-                                            $latest_positions_fh);
-                                }
+                                getPositionsFile($latest_base_folder);
+                                $posting[self::POSITION_LIST] =
+                                    $this->getPositionsList($posting,
+                                        $latest_positions_file_size,
+                                        $latest_positions_fh);
                             }
                         }
                     }
@@ -798,6 +791,9 @@ class WordIterator extends IndexBundleIterator
      */
     public function getMaxScore()
     {
+        if ($this->is_meta) {
+            return 0.01;
+        }
         $max_score = $this->getMaxDocQualityScore() +
                 $this->getMaxRelevanceScore();
         return $max_score;

ViewGit