Switch to term at a time disjunctive processing, a=chris

Chris Pollett [2023-12-09 01:Dec:th]

Switch to term at a time disjunctive processing, a=chris

Filename
src/library/CrawlConstants.php
src/library/index_bundle_iterators/GroupIterator.php
src/library/index_bundle_iterators/IndexBundleIterator.php
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/NetworkIterator.php
src/library/index_bundle_iterators/UnionIterator.php
src/library/index_bundle_iterators/WordIterator.php
src/models/PhraseModel.php

diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index 79e58dbe7..5401c77ad 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -264,7 +264,7 @@ interface CrawlConstants
     const SCRAPER_INFO = 'eq';
     const SEQUENCE_NUMBER = 'er';
     const FETCHER_QUEUE_SERVER_RATIO = 'es';
-    const NEXT_DOC = 'et';
+    const GEN_OFFSET = 'et';
     const ITERATOR = 'eu';
     const MAX_SCORE = 'ev';
 }
diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php
index c58a9d757..03b3825e6 100644
--- a/src/library/index_bundle_iterators/GroupIterator.php
+++ b/src/library/index_bundle_iterators/GroupIterator.php
@@ -208,18 +208,9 @@ class GroupIterator extends IndexBundleIterator
                     $pages = -1;
                 }
             } else if (!empty($new_pages)) {
-                if (count($new_pages) == 1) {
-                    $pages = array_merge($pages, $new_pages);
-                } else {
-                    $pages = $new_pages;
-                    $done = true;
-                }
+                $pages += $new_pages;
                 $count = count($pages);
             }
-            if (isset($this->index_bundle_iterator->hard_query)) {
-                $this->results_per_block =
-                    $this->index_bundle_iterator->hard_query;
-            }
             if ($count < $this->results_per_block && !$done) {
                 $this->index_bundle_iterator->advance();
             } else {
diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php
index dfb27e68d..e1554322d 100644
--- a/src/library/index_bundle_iterators/IndexBundleIterator.php
+++ b/src/library/index_bundle_iterators/IndexBundleIterator.php
@@ -132,26 +132,6 @@ abstract class IndexBundleIterator implements CrawlConstants
      {
          return 5;
      }
-     /**
-      * This method calculates the max relevance value for query underlying
-      * the iterator to the document currently being iterated over
-      * by the query
-      * @return float maximum score for document relevance to a query
-      */
-     public function getMaxRelevanceScore()
-     {
-         return 0.01;
-     }
-     /**
-      * This method calculates the maximum overall score value for any document
-      * returned by this iterator. It should be overriden in subclasses as
-      * makes sense
-      * @return float maximum score
-      */
-    public function getMaxScore()
-    {
-        return $this->getMaxDocQualityScore() + $this->getMaxRelevanceScore();
-    }
     /**
      * Returns a string representation of a plan by which the current iterator
      * finds its results
@@ -347,13 +327,13 @@ abstract class IndexBundleIterator implements CrawlConstants
         $remaining_partitions =  ($is_ascending) ?
             $number_of_partitions - $num_seen_partitions :
             $num_seen_partitions - 1;
-        $pre_rank_and_bonuses = ($remaining_partitions *
+        $rank_and_bonuses = ($remaining_partitions *
             $this->avg_items_per_partition)/
             (($number_of_partitions + 1) *
             ($avg_items_per_partition + 1)) +
             $last_partition_pos / $max_items_per_partition;
         if (IndexDocumentBundle::isAHostDocId($doc_key)) {
-            $pre_rank_and_bonuses +=
+            $rank_and_bonuses +=
                 (IndexDocumentBundle::isACldDocId($doc_key)) ?
                 $cld_bonus : $host_bonus;
         }
@@ -374,12 +354,11 @@ abstract class IndexBundleIterator implements CrawlConstants
             IndexDocumentBundle::DOCID_PART_LEN << 1] ?? 0) & 96;
         if ($doc_id_format != 96) {
             if (IndexDocumentBundle::isAWikipediaPage($doc_key)) {
-                $pre_rank_and_bonuses += $wiki_bonus;
+                $rank_and_bonuses += $wiki_bonus;
             }
-            $pre_rank_and_bonuses  += $num_slashes_bonus /
+            $rank_and_bonuses  += $num_slashes_bonus /
                 (IndexDocumentBundle::findNumSlashes($doc_key) + 1);
         }
-        return $this->getMaxDocQualityScore() *
-            $pre_rank_and_bonuses / $max_pre_rank_and_bonuses;
+        return $rank_and_bonuses;
     }
 }
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index 613ca0011..8e232a0dc 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -313,90 +313,6 @@ class IntersectIterator extends IndexBundleIterator
             }
         }
     }
-    /**
-     * Given the position_lists of a collection of terms computes
-     * a score for how close those words were in the given document
-     *
-     * @param array &$word_position_lists a 2D array item
-     *      number => position_list (locations in doc where item occurred) for
-     *      that item.
-     * @param array &$word_len_lists length for each item of its position list
-     * @param bool $is_doc whether this is the position list of a document
-     *     or a link
-     * @param int $doc_len the length of the document
-     * @return sum of inverse of all covers computed by plane sweep algorithm
-     */
-    public function computeProximity(&$word_position_lists, &$word_len_lists,
-        $is_doc, $doc_len)
-    {
-        $num_iterators = $this->num_iterators;
-        if ($num_iterators < 1) {
-            return 0;
-        }
-        $covers = [];
-        $position_list = $word_position_lists;
-        $interval = [];
-        $num_words = count($position_list);
-        for ($i = 0; $i < $num_words; $i++) {
-            $min = (!empty($position_list[$i])) ?
-                array_shift($position_list[$i]) : null;
-            if (empty($min)) {
-                break;
-            } else {
-                array_push($interval, [$min, $i]);
-                for ($j = 0; $j < $num_words; $j++) {
-                    if (isset($position_list[$j][0]) &&
-                        $min == $position_list[$j][0]) {
-                        array_shift($position_list[$j]);
-                    }
-                }
-            }
-        }
-        if (count($interval) != $num_words) {
-            return 0;
-        }
-        sort($interval);
-        $l = array_shift($interval);
-        $r = end($interval);
-        $stop = false;
-        if (count($position_list[$l[1]]) == 0) {
-            $stop = true;
-        }
-        while(!$stop) {
-            $p = array_shift($position_list[$l[1]]);
-            for ($i = 0;$i < $num_words; $i++){
-                if (isset($position_list[$i][0]) &&
-                    $p == $position_list[$i][0]) {
-                    array_shift($position_list[$i]);
-                }
-            }
-            $q = $interval[0][0];
-            if ($p > $r[0]) {
-                array_push($covers, [$l[0], $r[0]]);
-                array_push($interval, [$p, $l[1]]);
-            } else {
-                if ($p < $q) {
-                    array_unshift($interval, [$p, $l[1]]);
-                } else {
-                    array_push($interval, [$p, $l[1]]);
-                    sort($interval);
-                }
-            }
-            $l = array_shift($interval);
-            $r = end($interval);
-            if (count($position_list[$l[1]]) == 0) {
-                $stop = true;
-            }
-        }
-        array_push($covers, [$l[0],$r[0]]);
-        $score = 0;
-        foreach ($covers as $cover) {
-            $score += (1/($cover[1] - $cover[0] + 1));
-        }
-        $score = ($num_words * $score)/max($doc_len, 1);
-            // this will ensure the score is less than 1
-        return $score;
-    }
     /**
      * Finds the next generation and doc offset amongst all the iterators
      * that contains the word. It assumes that the (generation, doc offset)
@@ -545,18 +461,4 @@ class IntersectIterator extends IndexBundleIterator
                 an intersect iterator", E_USER_ERROR);
         }
      }
-     /**
-      * This method calculates the max relevance value for query underlying
-      * the iterator to the document currently being iterated over
-      * by the query
-      * @return float maximum score for document relevance to a query
-      */
-     public function getMaxRelevanceScore()
-    {
-        $max_relevance = 0;
-        foreach ($this->index_bundle_iterators as $iterator) {
-            $max_relevance += $iterator->getMaxRelevanceScore();
-        }
-        return $max_relevance;
-    }
 }
diff --git a/src/library/index_bundle_iterators/NetworkIterator.php b/src/library/index_bundle_iterators/NetworkIterator.php
index fd9382e06..8510fd0a9 100644
--- a/src/library/index_bundle_iterators/NetworkIterator.php
+++ b/src/library/index_bundle_iterators/NetworkIterator.php
@@ -86,12 +86,6 @@ class NetworkIterator extends IndexBundleIterator
      * @var int
      */
     public $num_downloaded;
-    /**
-     * Used to keep track of the original desired number of results to be
-     * returned in one find docs call versus the number actually retrieved.
-     * @var int
-     */
-    public $hard_query;
     /**
      * Flags used to keep track of whether a given machine has more search
      * result data. Array of booleans
@@ -140,7 +134,6 @@ class NetworkIterator extends IndexBundleIterator
         $this->next_results_per_server =
             self::serverAdjustedResultsPerBlock($num_servers,
             $this->results_per_block);
-        $this->hard_query = false;
         $this->base_query = "q=" . urlencode($query).
             "&f=serial&network=false&raw=1&its=$timestamp&guess=false";
         foreach (["cld_url_bonus" => C\CLD_URL_BONUS,
@@ -181,7 +174,6 @@ class NetworkIterator extends IndexBundleIterator
             self::serverAdjustedResultsPerBlock($num_servers,
             $this->results_per_block);
         $count = count($this->queue_servers);
-        $this->hard_query = false;
         for ($i = 0; $i < $count; $i++) {
             $this->more_flags[$i] = true;
         }
@@ -306,9 +298,6 @@ class NetworkIterator extends IndexBundleIterator
             }
         }
         $machine_times = substr( $machine_times, 0, -strlen("<br>"));
-        if (isset($pre_result["HARD_QUERY"])) {
-            $this->hard_query  = $pre_result["HARD_QUERY"];
-        }
         if ($num_with_results > 0) {
             $this->next_results_per_server =
                 self::serverAdjustedResultsPerBlock($num_with_results,
diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php
index 82e4340cc..4ea3975fe 100644
--- a/src/library/index_bundle_iterators/UnionIterator.php
+++ b/src/library/index_bundle_iterators/UnionIterator.php
@@ -63,11 +63,6 @@ class UnionIterator extends IndexBundleIterator
      * @var int
      */
     public $seen_docs_unfiltered;
-    /**
-     * stores a mapping between seen doc keys and which iterator they came from
-     * @var array
-     */
-    public $key_iterator_table;
     /**
      * The timestamp of the index associated with this iterator
      * @var string
@@ -78,21 +73,6 @@ class UnionIterator extends IndexBundleIterator
      * @var int
      */
     public $total_num_docs;
-    /**
-     * Heap of query terms whose scores are considered while finding results
-     * @var array
-     */
-    public $terms_heap;
-    /**
-     * Heap of query terms whose scores are not considered while finding results
-     * @var array
-     */
-    public $low_scoring_terms;
-    /**
-     * Heap of result documents
-     * @var array
-     */
-    public $results_heap;
     /**
      * Creates a union iterator with the given parameters.
      *
@@ -104,33 +84,34 @@ class UnionIterator extends IndexBundleIterator
     public function __construct($index_bundle_iterators,
         $index_name, $total_num_docs)
     {
-        $this->index_bundle_iterators = $index_bundle_iterators;
         /*
             estimate number of results by sum of all iterator counts,
             then improve estimate as iterate
         */
-        $this->num_iterators = count($index_bundle_iterators);
+        $num_iterators = count($index_bundle_iterators);
+        $this->num_iterators = $num_iterators;
         $this->num_docs = 0;
         /*
             result_per_block is at most the sum of
             results_per_block of things we are iterating. Value
             is already init'd in base class.
          */
-        $this->results_per_block = C\MIN_RESULTS_TO_GROUP;
-        $this->key_iterator_table = [];
+        $this->results_per_block = intval(C\MIN_RESULTS_TO_GROUP);
         $this->seen_docs = 0;
         $this->seen_docs_unfiltered = 0;
         $this->index_name = $index_name;
         $this->total_num_docs = $total_num_docs;
-        $this->low_scoring_terms = [];
-        for ($i = 0; $i < $this->results_per_block; $i++) {
-            $this->results_heap[$i][self::SCORE] = 0;
-        }
-        $this->initializeTermsHeap();
-        for ($i = 0; $i < $this->num_iterators; $i++) {
-            $this->index_bundle_iterators[$i]->setResultsPerBlock(1);
-            $this->num_docs += $this->index_bundle_iterators[$i]->num_docs;
-            $this->seen_docs += $this->index_bundle_iterators[$i]->seen_docs;
+        $num_smaller = array_fill(0, $num_iterators, 0);
+        for ($i = 0; $i < $num_iterators; $i++) {
+            $index_bundle_iterators[$i]->setResultsPerBlock(1);
+            $num_docs = $index_bundle_iterators[$i]->num_docs;
+            $this->num_docs += $num_docs;
+            for ($j = 0; $j < $i; $j++) {
+                if ($num_docs < $index_bundle_iterators[$j]->num_docs) {
+                    $num_smaller[$j]++;
+                }
+            }
+            $this->seen_docs += $index_bundle_iterators[$i]->seen_docs;
             if (isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)){
                 $this->seen_docs_unfiltered +=
                     $this->index_bundle_iterators[$i]->seen_docs_unfiltered;
@@ -138,11 +119,17 @@ class UnionIterator extends IndexBundleIterator
                 $this->seen_docs_unfiltered += $this->seen_docs;
             }
         }
-        $doc_block = $this->currentDocsWithWord();
+        asort($num_smaller);
+        $i = 0;
+        foreach ($num_smaller as $index => $count) {
+            $this->index_bundle_iterators[$i] =
+                $index_bundle_iterators[$index];
+            $i++;
+        }
     }
     /**
      * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING
-     * depending on the direction in which this iterator ttraverse the
+     * depending on the direction in which this iterator traverses the
      * underlying index archive bundle.
      *
      * @return int direction traversing underlying archive bundle
@@ -167,31 +154,6 @@ class UnionIterator extends IndexBundleIterator
         $this->seen_docs_unfiltered = 0;
         $doc_block = $this->currentDocsWithWord();
     }
-    /**
-     * Calculates the total relevance score of the result document
-     *
-     * @param array $heap of terms
-     * @param int $relevance_score previously calculated relevance score
-     * @return array
-     */
-    public function getDocScore($heap, $relevance_score = 0)
-    {
-        $d = $this->currentGenDocOffsetWithWord();
-        $doc = [];
-        while ($d != -1 && !empty($heap) && $heap[0][self::NEXT_DOC] === $d) {
-            $iterator_idx = $heap[0][self::ITERATOR];
-            $iterator = $this->index_bundle_iterators[$iterator_idx];
-            $docs = $iterator->findDocsWithWord();
-            if (is_array($docs) && count($docs) == 1) {
-                $keys = array_keys($docs);
-                $doc = $docs[$keys[0]];
-                $relevance_score += $doc[self::RELEVANCE];
-            }
-            array_splice($heap, 0, 1);
-            $this->heapifyDown($heap, true);
-        }
-        return [$doc, $relevance_score];
-    }
     /**
      * Hook function used by currentDocsWithWord to return the current block
      * of docs if it is not cached
@@ -202,185 +164,118 @@ class UnionIterator extends IndexBundleIterator
     {
         $pages = [];
         $found_docs = false;
-        list($doc, $relevance_score) = $this->getDocScore($this->terms_heap);
-        if (!empty($this->low_scoring_terms)) {
-            list($doc_copy, $additional_score) =
-                $this->getDocScore($this->low_scoring_terms, $relevance_score);
-            $relevance_score += $additional_score;
-        }
-        if (!empty($doc) && $relevance_score >
-            $this->results_heap[0][self::SCORE]) {
-            // Update the document's scores
-            $doc[self::RELEVANCE] = $relevance_score;
-            $doc[self::SCORE] = $relevance_score + $doc[self::DOC_RANK];
-            $found_docs = true;
-            $this->results_heap[0] = [self::DOC_INFO => $doc,
-                self::SCORE => $doc[self::SCORE]];
-            $this->heapifyDown($this->results_heap, false);
-        }
-        $found_top_results = $this->results_heap[0][self::SCORE] > 0;
-        if (!$found_docs || $found_top_results) {
-            $pages = ($this->results_heap[
-                $this->results_per_block - 1][self::SCORE] == 0) ?
-                -1 : $this->getResultsHeap();
-        } else {
-            $pages = [$doc];
-        }
-        $this->pages = $pages;
-        if (is_array($pages)) {
-            $this->count_block_unfiltered = count($pages);
-            $this->count_block = count($pages);
+        $num_iterators = $this->num_iterators;
+        $iterators = $this->index_bundle_iterators;
+        $this->count_block_unfiltered = 0;
+        $direction = $this->getDirection();
+        $max_accumulators = intval($this->results_per_block);
+        $to_accumulators = [];
+        for ($i = 0; $i < $num_iterators; $i++) {
+            $from_accumulators = $to_accumulators;
+            $max_in_pos = count($from_accumulators);
+            $to_accumulators = [];
+            $quota_left = $max_accumulators - $max_in_pos;
+            $iterator = $iterators[$i];
+            if ($quota_left == 0) {
+                for ($j = 0; $j < $max_accumulators; $j++) {
+                    $current_gen_doc_offset = $from_accumulators[$j][
+                        self::GEN_OFFSET];
+                    $iterator_offset = $iterator->currentGenDocOffsetWithWord();
+                    if ($this->genDocOffsetCmp($iterator_offset,
+                        $current_gen_doc_offset, $direction) < 0) {
+                        $iterator->advance($current_gen_doc_offset);
+                        $this->count_block_unfiltered++;
+                    }
+                    if( ($iterator_offset =
+                        $iterator->currentGenDocOffsetWithWord()) == -1) {
+                        break;
+                    }
+                    $to_accumulators[$j] = $from_accumulators[$j];
+                    $cmp = $this->genDocOffsetCmp($iterator_offset,
+                        $current_gen_doc_offset, $direction);
+                    if ($cmp == 0) {
+                        $docs = $iterator->findDocsWithWord();
+                        if (is_array($docs) && count($docs) == 1) {
+                            $keys = array_keys($docs);
+                            $doc = $docs[$keys[0]];
+                            $to_accumulators[$j][self::RELEVANCE] +=
+                                $doc[self::RELEVANCE];
+                        }
+                    }
+                }
+                for ($k = $j; $k < $max_accumulators; $k++) {
+                    $to_accumulators[$k] = $from_accumulators[$k];
+                }
+            } else {
+                $in_pos = 0;
+                $out_pos = 0;
+                while ($out_pos < $max_accumulators) {
+                    if(($iterator_offset =
+                        $iterator->currentGenDocOffsetWithWord()) == -1) {
+                        break;
+                    }
+                    if ($in_pos < $max_in_pos) {
+                        $current_gen_doc_offset = $from_accumulators[$in_pos][
+                            self::GEN_OFFSET];
+                        $cmp = $this->genDocOffsetCmp($iterator_offset,
+                            $current_gen_doc_offset, $direction);
+                    } else {
+                        $cmp = -1;
+                    }
+                    if ($cmp < 0) {
+                        $remaining_in_accumulator =
+                            $max_in_pos - $in_pos - 1;
+                        if ($max_accumulators - $out_pos
+                            > $remaining_in_accumulator) {
+                            $docs = $iterator->findDocsWithWord();
+                            if (is_array($docs) && count($docs) == 1) {
+                                $keys = array_keys($docs);
+                                $doc = $docs[$keys[0]];
+                                $to_accumulators[$out_pos] = $doc;
+                                $to_accumulators[$out_pos][self::GEN_OFFSET] =
+                                    $iterator_offset;
+                                $out_pos++;
+                            }
+                            $iterator->advance();
+                        } else {
+                            $to_accumulators[$out_pos++] =
+                                $from_accumulators[$in_pos++];
+                        }
+                    } else if($cmp == 0) {
+                        $to_accumulators[$out_pos] =
+                            $from_accumulators[$in_pos];
+                        $docs = $iterator->findDocsWithWord();
+                        if (is_array($docs) && count($docs) == 1) {
+                            $keys = array_keys($docs);
+                            $doc = $docs[$keys[0]];
+                            $to_accumulators[$out_pos][self::RELEVANCE] +=
+                                $doc[self::RELEVANCE];
+                        }
+                        $out_pos++;
+                        $in_pos++;
+                    } else {
+                        $to_accumulators[$out_pos++] =
+                            $from_accumulators[$in_pos++];
+                    }
+                    $this->count_block_unfiltered++;
+                }
+            }
         }
-        return $pages;
-    }
-    /**
-     * Gets the docs in the results min heap sorted in
-     * descending order by score
-     *
-     * @return mixed array of result docs if any, -1 otherwise
-     */
-    public function getResultsHeap()
-    {
         $pages = [];
-        while (!empty($this->results_heap)) {
-            $doc = $this->extractMinScoringDoc($this->results_heap);
-            if ($doc[self::SCORE] > 0) {
-                array_unshift($pages, $doc[self::DOC_INFO]);
+        foreach ($to_accumulators as $accumulator) {
+            if (!empty($accumulator[self::KEY])) {
+                $accumulator[self::SCORE] = $accumulator[self::DOC_RANK] +
+                    $accumulator[self::RELEVANCE];
+                $pages[$accumulator[self::KEY]] = $accumulator;
             }
         }
-        // Re-initialize the results heap for the next set of docs
-        $initial_heap_item = [self::SCORE => 0, self::DOC_INFO => null];
-        $this->results_heap = array_fill(0, $this->results_per_block,
-            $initial_heap_item);
         if (empty($pages)) {
-            $pages = -1;
+            return -1;
         }
+        $this->pages = $pages;
+        $this->count_block = count($pages);
         return $pages;
     }
-    /**
-     * Compare between elements for heapify operations
-     *
-     * @param array $i first element
-     * @param array $j second element
-     * @param boolean $is_terms_heap basis for comparison
-     * @return boolean result of comparison
-     */
-    public function compareElements($i, $j, $is_terms_heap)
-    {
-        $is_ascending = $this->getDirection();
-        if (!$is_terms_heap) {
-            return $i[self::SCORE] > $j[self::SCORE];
-        }
-        $i_next_doc = $i[self::NEXT_DOC];
-        $j_next_doc = $j[self::NEXT_DOC];
-        if ($is_ascending) {
-            if ($i_next_doc == -1) {
-                return true;
-            } else if ($j[self::NEXT_DOC] == -1) {
-                return false;
-            }
-            return $i_next_doc[0] > $j_next_doc[0] ||
-                ($i_next_doc[0] == $j_next_doc[0] &&
-                    $i_next_doc[1] > $j_next_doc[1]);
-        } else {
-            return $j_next_doc[0] > $i_next_doc[0] ||
-                ($j_next_doc[0] == $i_next_doc[0] &&
-                    $j_next_doc[1] > $i_next_doc[1]);
-        }
-    }
-    /**
-     * Performs reheap using bubble-down operation
-     *
-     * @param array $heap to be reheaped
-     * @param boolean $is_terms_heap to check comparison condition
-     */
-    public function heapifyDown(&$heap, $is_terms_heap)
-    {
-        $index = 0;
-        $heap_size = count($heap);
-        while ($index < $heap_size) {
-            $left = ($index << 1) + 1;
-            $right = ($index + 1) << 1;
-            $least_doc = $index;
-            if ($left < $heap_size &&
-                $this->compareElements($heap[$least_doc], $heap[$left],
-                    $is_terms_heap)) {
-                $least_doc = $left;
-            }
-            if ($right < $heap_size &&
-                $this->compareElements($heap[$least_doc], $heap[$right],
-                    $is_terms_heap)) {
-                $least_doc = $right;
-            }
-            if ($least_doc != $index) {
-                $temp_doc = $heap[$least_doc];
-                $heap[$least_doc] = $heap[$index];
-                $heap[$index] = $temp_doc;
-                $index = $least_doc;
-            } else {
-                break;
-            }
-        }
-    }
-    /**
-     * Performs reheap using bubble-up operation
-     *
-     * @param array $heap to be reheaped
-     * @param boolean $is_terms_heap to check comparison condition
-     */
-    public function heapifyUp(&$heap, $is_terms_heap)
-    {
-        $index = count($heap) - 1;
-        while ($index > 0) {
-            $parent_index = ($index - 1) >> 1;
-            if ($this->compareElements($heap[$index], $heap[$parent_index],
-                $is_terms_heap)) {
-                break;
-            }
-            $temp_doc = $heap[$parent_index];
-            $heap[$parent_index] = $heap[$index];
-            $heap[$index] = $temp_doc;
-            $index = $parent_index;
-        }
-    }
-    /**
-     * Gets the lowest-scoring document in the min heap of result documents.
-     *
-     * @param array $heap of result docs
-     * @return object lowest-scoring document
-     */
-    public function extractMinScoringDoc(&$heap)
-    {
-        $lowest_doc = $heap[0];
-        $last_index = count($heap) - 1;
-        $heap[0] = $heap[$last_index];
-        unset($heap[$last_index]);
-        $this->heapifyDown($heap, false);
-        return $lowest_doc;
-    }
-    /**
-     * This method creates a heap out of all the query terms
-     * associated with the nested word iterators on the current
-     * union iterator instance.
-     *
-     */
-    public function initializeTermsHeap()
-    {
-        if (!empty($this->terms_heap)) {
-            return;
-        }
-        $this->terms_heap = [];
-        $num_iterators = $this->num_iterators;
-        for ($i = 0; $i < $num_iterators; $i++) {
-            $iterator =  $this->index_bundle_iterators[$i];
-            $this->terms_heap[] = [
-                self::ITERATOR => $i,
-                self::MAX_SCORE => $iterator->getMaxScore(),
-                self::NEXT_DOC => $iterator->currentGenDocOffsetWithWord()
-            ];
-            $this->heapifyUp($this->terms_heap, true);
-        }
-    }
     /**
      * Forwards the iterator one group of docs
      * @param array $gen_doc_offset a generation, doc_offset pair. If set,
@@ -390,46 +285,17 @@ class UnionIterator extends IndexBundleIterator
      */
     public function advance($gen_doc_offset = null)
     {
-        $this->advanceSeenDocs();
+        $this->current_block_fresh = false;
+        $this->seen_docs += $this->count_block;
         $this->seen_docs_unfiltered += $this->count_block_unfiltered;
-        $total_num_docs = 0;
-        $d = $this->currentGenDocOffsetWithWord();
-        $score_k = $this->results_heap[0][self::SCORE];
-        while ($d != -1 && $this->terms_heap[0][self::NEXT_DOC] === $d) {
-            $iterator_idx = $this->terms_heap[0][self::ITERATOR];
-            $iterator = $this->index_bundle_iterators[$iterator_idx];
-            $total_num_docs += $iterator->num_docs;
-            $iterator->advance($gen_doc_offset);
-            $next_doc = $iterator->currentGenDocOffsetWithWord();
-            $this->terms_heap[0][self::NEXT_DOC] = $next_doc;
-            if ($score_k > $this->terms_heap[0][self::MAX_SCORE]) {
-                 $this->low_scoring_terms[] = $this->terms_heap[0];
-                 $this->heapifyUp($this->low_scoring_terms, true);
-                 array_splice($this->terms_heap, 0, 1);
-            }
-            $this->heapifyDown($this->terms_heap, true);
-        }
-        $d = [];
-        $d[self::NEXT_DOC] = $this->currentGenDocOffsetWithWord();
-        if (!empty($this->low_scoring_terms)) {
-            while ($d[self::NEXT_DOC] != -1 &&
-                $this->compareElements($d, $this->low_scoring_terms[0], true)) {
-                $lowest_doc = $this->low_scoring_terms[0];
-                $iterator_idx = $lowest_doc[self::ITERATOR];
-                $iterator = $this->index_bundle_iterators[$iterator_idx];
+        $this->num_docs =
+            floor(($this->seen_docs * $this->total_num_docs) /
+            $this->seen_docs_unfiltered);
+        if ($gen_doc_offset != null) {
+            foreach ($this->index_bundle_iterators as $iterator) {
                 $iterator->advance($gen_doc_offset);
-                $next_doc = $iterator->currentGenDocOffsetWithWord();
-                $this->low_scoring_terms[0][self::NEXT_DOC] = $next_doc;
-                $this->heapifyDown($this->low_scoring_terms, true);
             }
         }
-        if ($this->seen_docs_unfiltered > 0) {
-            $this->num_docs =
-                floor(($this->seen_docs * $total_num_docs) /
-                $this->seen_docs_unfiltered);
-        } else {
-            $this->num_docs = 0;
-        }
     }
     /**
      * This method is supposed to set
@@ -456,22 +322,14 @@ class UnionIterator extends IndexBundleIterator
      * @return mixed the desired document offset and generation.
      */
     public function currentGenDocOffsetWithWord() {
-        return !empty($this->terms_heap) ?
-            (key_exists(self::NEXT_DOC, $this->terms_heap[0]) ?
-                $this->terms_heap[0][self::NEXT_DOC] : -1) : -1;
+        $gen_doc_offset = -1;
+        $index_bundle_iterators = $this->index_bundle_iterators;
+        foreach ($index_bundle_iterators as $iterator) {
+            $gen_doc_offset = $iterator->currentGenDocOffsetWithWord();
+            if ($gen_doc_offset != -1) {
+                break;
+            }
+        }
+        return $gen_doc_offset;
     }
-    /**
-     * This method calculates the max relevance value for query underlying
-     * the iterator to the document currently being iterated over
-     * by the query
-     * @return float maximum score for document relevance to a query
-     */
-    public function getMaxRelevanceScore()
-   {
-       $max_relevance = 0;
-       foreach ($this->index_bundle_iterators as $iterator) {
-           $max_relevance += $iterator->getMaxRelevanceScore();
-       }
-       return $max_score;
-   }
 }
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 7cbde614d..0477d85d1 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -687,7 +687,8 @@ class WordIterator extends IndexBundleIterator
             $posting[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0,
                 $num_description_scores);
             if ($posting['FREQUENCY'] > 0) {
-                $frequency = $this->frequencyNormalizationScoring(
+                list($bonuses, $frequency) =
+                    $this->frequencyScoring(
                     $occurrences_per_doc,
                     $posting[self::POSITION_LIST],
                     $posting[self::DOC_LEN],
@@ -696,7 +697,9 @@ class WordIterator extends IndexBundleIterator
                     $posting["PATH_KEYWORDS_END_POS"],
                     $posting[self::DESCRIPTION_SCORES]);
                 // Divergence-from-randomness + preface score
-                $posting[self::RELEVANCE] =
+                $posting[self::RELEVANCE] = 2.5 * $bonuses *
+                    log(1 + 1/max(1, $occurrences_per_doc), 2) /
+                    ($bonuses + 1) +
                     ((log(1 + $occurrences_per_doc, 2) + $frequency *
                     log(1 + 1/max(1, $occurrences_per_doc), 2)) /
                     ($frequency + 1));
@@ -721,37 +724,11 @@ class WordIterator extends IndexBundleIterator
         return $key_postings;
     }
     /**
-     * This method calculates the max relevance value for the relevance
-     * calculation of the term to the query
-     * @return float maximum score for document relevance to a query
-     */
-    public function getMaxRelevanceScore()
-    {
-        $occurrences_per_doc = $this->num_occurrences /
-            max($this->total_num_docs, 1);
-        $max_score = 1 + log(1 + 1 / max(1, $occurrences_per_doc), 2);
-        return $max_score;
-    }
-    /**
-     * This method calculates the maximum overall score value for any document
-     * returned by this iterator.
-     * @return float maxScore
-     */
-    public function getMaxScore()
-    {
-        if ($this->is_meta) {
-            return 0.01;
-        }
-        $max_score = $this->getMaxDocQualityScore() +
-                $this->getMaxRelevanceScore();
-        return $max_score;
-    }
-    /**
-     * Normalizes the frequencies of a term within a document with respect to
+     * Computes weighted frequencies of a term within a document with respect to
      * the length of the document, the positions of the term with the document
      * and the overall importance score for a given position within the document
      * Also computes the score of the posting for the host keywords,
-     * title keywords, and path keywords.
+     * title keywords, and path keywords (returned as bonuses).
      *
      * @param float $occurrences_per_doc expected number of occurrence of term
      *  per/doc.
@@ -765,10 +742,9 @@ class WordIterator extends IndexBundleIterator
      *  summary that demarks the end of the title portion of the summary
      * @param array $descriptions_scores boundaries and scores of different
      *  regions with document
-     * @return array [normalized frequency, score for host name, title,
-     *     and path keywords]
+     * @return array [bonus score for host, title, path keywords; frequency]
      */
-    public function frequencyNormalizationScoring(
+    public function frequencyScoring(
         $occurrences_per_doc, $positions, $num_words, $host_keywords_end_pos,
         $title_end_pos, $path_keywords_end_pos, $descriptions_scores)
     {
@@ -793,7 +769,6 @@ class WordIterator extends IndexBundleIterator
         $path_bonus = $this->ranking_factors["PATH_KEYWORD_BONUS"];
         $title_bonus = $this->ranking_factors["TITLE_BONUS"];
         $len_term = strlen($this->word_key);
-        $max_doc_norm_score = $host_bonus + $path_bonus + $title_bonus + 1;
         $first_index = 0;
         $old_pos = 0;
         /*
@@ -813,6 +788,7 @@ class WordIterator extends IndexBundleIterator
            ], $descriptions_scores);
         $num_scores = count($descriptions_scores);
         $weighted_frequency = 0;
+        $bonuses = 0;
         foreach ($positions as $position) {
             $last_index = $num_scores - 1;
             /* description score offsets are with respect to the description
@@ -828,17 +804,21 @@ class WordIterator extends IndexBundleIterator
                     $first_index = $mid_index;
                 }
             }
-            $weight = $descriptions_scores[$first_index]['SCORE'];
+            $weight = $descriptions_scores[$first_index]['SCORE'];
             $start_description_pos = $descriptions_scores[$first_index]['POS'];
             $len_description = ($first_index == $num_scores - 1) ?
                 $pseudo_doc_length - $start_description_pos :
                 $descriptions_scores[$first_index + 1]['POS'] -
                 $start_description_pos;
-            $weighted_frequency += $weight * $len_term / $len_description;
+            $frequency_term = $weight * $len_term / $len_description;
+            if ($position <= 0) {
+                $bonuses += $frequency_term;
+            } else {
+                $weighted_frequency += $frequency_term;
+            }
         }
-        $frequency = ($weighted_frequency/$max_doc_norm_score) * $num_words *
-            $length_normalization;
-        return $frequency;
+        $frequency = $weighted_frequency * $length_normalization;
+        return [$bonuses, $frequency];
     }
     /**
      * Updates the seen_docs count during an advance() call
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index b2b501087..e545c9ebf 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -622,8 +622,9 @@ class PhraseModel extends ParallelModel
         }
         foreach ($split_terms as $term) {
             if (!in_array($term, $special_words)) {
-                $search_terms = array_merge($search_terms,
+                $term = implode(" ",
                     PhraseParser::segmentSegment($term, $locale_tag));
+                $search_terms[] = $term;
             }
         }
         $phrase = '';

ViewGit