Refactor out computeDocRank so can be used by DocIterator as well

Chris Pollett [2023-12-01 02:Dec:st]

Refactor out computeDocRank so can be used by DocIterator as well

Filename
src/library/CrawlConstants.php
src/library/index_bundle_iterators/DisjointIterator.php
src/library/index_bundle_iterators/DocIterator.php
src/library/index_bundle_iterators/IndexBundleIterator.php
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/NegationIterator.php
src/library/index_bundle_iterators/UnionIterator.php
src/library/index_bundle_iterators/WordIterator.php
src/models/PhraseModel.php

diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index 0353ea6ce..79e58dbe7 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -264,4 +264,7 @@ interface CrawlConstants
     const SCRAPER_INFO = 'eq';
     const SEQUENCE_NUMBER = 'er';
     const FETCHER_QUEUE_SERVER_RATIO = 'es';
+    const NEXT_DOC = 'et';
+    const ITERATOR = 'eu';
+    const MAX_SCORE = 'ev';
 }
diff --git a/src/library/index_bundle_iterators/DisjointIterator.php b/src/library/index_bundle_iterators/DisjointIterator.php
deleted file mode 100644
index 83bdebab6..000000000
--- a/src/library/index_bundle_iterators/DisjointIterator.php
+++ /dev/null
@@ -1,262 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- * Copyright (C) 2009 - 2023 Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- * END LICENSE
- *
- * @author Chris Pollett chris@pollett.org
- * @license https://www.gnu.org/licenses/ GPL3
- * @link https://www.seekquarry.com/
- * @copyright 2009 - 2023
- * @filesource
- */
-namespace seekquarry\yioop\library\index_bundle_iterators;
-
-/**
- * Used to iterate over the documents which occur in a set of disjoint iterators
- * all belonging to the same index
- *
- * @author Chris Pollett
- * @see IndexArchiveBundle
- */
-class DisjointIterator extends IndexBundleIterator
-{
-    /**
-     * An array of iterators whose intersection we  get documents from
-     * @var array
-     */
-    public $index_bundle_iterators;
-    /**
-     * Number of elements in $this->index_bundle_iterators
-     * @var int
-     */
-    public $num_iterators;
-    /**
-     * The number of iterated docs before the restriction test
-     * @var int
-     */
-    public $seen_docs_unfiltered;
-    /**
-     * Index of the iterator amongst those we are disjoint unioning of
-     * least gen_doc_offset
-     * @var int
-     */
-    public $least_offset_index;
-    /**
-     * Creates an disjoint union iterator with the given parameters.
-     *
-     * @param object $index_bundle_iterators to use as a source of documents
-     *     to iterate over
-     */
-    public function __construct($index_bundle_iterators)
-    {
-        $this->index_bundle_iterators = $index_bundle_iterators;
-        $this->num_iterators = count($index_bundle_iterators);
-        $this->num_docs = 0;
-        $this->results_per_block = 1;
-        /*
-             We take an initial guess of the num_docs we return as the sum
-             of the num_docs of the underlying iterators. We are also setting
-             up here that we return at most one posting at a time from each
-             iterator
-        */
-        $this->seen_docs = 0;
-        $this->seen_docs_unfiltered = 0;
-        for ($i = 0; $i < $this->num_iterators; $i++) {
-            $this->num_docs += $this->index_bundle_iterators[$i]->num_docs;
-            $this->index_bundle_iterators[$i]->setResultsPerBlock(1);
-            $this->seen_docs += $this->index_bundle_iterators[$i]->seen_docs;
-            if (isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)){
-                $this->seen_docs_unfiltered +=
-                    $this->index_bundle_iterators[$i]->seen_docs_unfiltered;
-            } else {
-                $this->seen_docs_unfiltered += $this->seen_docs;
-            }
-        }
-        $this->leastGenDocOffsetsAmongstIterators();
-    }
-    /**
-     * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING
-     * depending on the direction in which this iterator ttraverse the
-     * underlying index archive bundle.
-     *
-     * @return int direction traversing underlying archive bundle
-     */
-    public function getDirection()
-    {
-        if (!empty($this->index_bundle_iterators[0])) {
-            return $this->index_bundle_iterators[0]->getDirection();
-        }
-        return self::ASCENDING;
-    }
-    /**
-     * Returns the iterators to the first document block that it could iterate
-     * over
-     */
-    public function reset()
-    {
-        for ($i = 0; $i < $this->num_iterators; $i++) {
-            $this->index_bundle_iterators[$i]->setResultsPerBlock(1);
-            $this->index_bundle_iterators[$i]->reset();
-        }
-
-        $this->seen_docs = 0;
-        $this->seen_docs_unfiltered = 0;
-        $this->leastGenDocOffsetsAmongstIterators();
-    }
-    /**
-     * Hook function used by currentDocsWithWord to return the current block
-     * of docs if it is not cached
-     *
-     * @return mixed doc ids and rank if there are docs left, -1 otherwise
-     */
-    public function findDocsWithWord()
-    {
-        $least_offset = $this->leastGenDocOffsetsAmongstIterators();
-        if ($least_offset == -1) {
-            return -1;
-        }
-        //next we finish computing the score
-        $docs = $this->index_bundle_iterators[
-            $this->least_offset_index]->currentDocsWithWord();
-        $this->count_block = 0;
-        if (is_array($docs)) {
-            $this->count_block = count($docs);
-        }
-        $this->pages = $docs;
-        return $docs;
-    }
-    /**
-     * Gets the doc_offset and generation for the next document that
-     * would be return by this iterator
-     *
-     * @return mixed an array with the desired document offset
-     * and generation; -1 on fail
-     */
-    public function currentGenDocOffsetWithWord() {
-        if ($this->num_iterators <= 0) {
-            return -1;
-        }
-        return $this->leastGenDocOffsetsAmongstIterators();
-    }
-    /**
-     * Finds the next generation and doc offset amongst all the iterators
-     * that is of least value
-     */
-    public function leastGenDocOffsetsAmongstIterators()
-    {
-        $least_gen_offset = -1;
-        $this->least_offset_index = 0;
-        $direction = $this->getDirection();
-        for ($i = 0; $i < $this->num_iterators; $i++) {
-            $cur_gen_doc_offset =
-                $this->index_bundle_iterators[
-                    $i]->currentGenDocOffsetWithWord();
-            if ($least_gen_offset == -1 && is_array($cur_gen_doc_offset)) {
-                $least_gen_offset = $cur_gen_doc_offset;
-                $this->least_offset_index = $i;
-                continue;
-            } else if ($cur_gen_doc_offset == -1) {
-                continue;
-            }
-            $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset,
-                $least_gen_offset, $direction);
-            if ($gen_doc_cmp < 0) {
-                $least_gen_offset = $cur_gen_doc_offset;
-                $this->least_offset_index = $i;
-            }
-        }
-        return $least_gen_offset;
-    }
-    /**
-     * Forwards the iterator one group of docs
-     * @param array $gen_doc_offset a generation, doc_offset pair. If set,
-     *     the must be of greater than or equal generation, and if equal the
-     *     next block must all have $doc_offsets larger than or equal to
-     *     this value
-     */
-    public function advance($gen_doc_offset = null)
-    {
-        $no_change = true;
-        //num_docs can change when advance() called so that's why we recompute
-        $total_num_docs = 0;
-        if ($gen_doc_offset !== null) {
-            $direction = $this->getDirection();
-            for ($i = 0; $i < $this->num_iterators; $i++) {
-                $cur_gen_doc_offset = $this->index_bundle_iterators[
-                    $i]->currentGenDocOffsetWithWord();
-                if ($this->genDocOffsetCmp($cur_gen_doc_offset,
-                    $gen_doc_offset, $direction) < 0) {
-                    if ($no_change) {
-                        $this->current_block_fresh = false;
-                        $this->seen_docs += 1;
-                        $this->seen_docs_unfiltered = 0;
-                        $no_change = false;
-                    }
-                    $this->seen_docs_unfiltered +=
-                        $this->index_bundle_iterators[$i]->seen_docs;
-                    $total_num_docs +=
-                        $this->index_bundle_iterators[$i]->num_docs;
-                    $this->index_bundle_iterators[$i]->advance($gen_doc_offset);
-                }
-            }
-        } else {
-            if (!$this->current_block_fresh) {
-                $this->leastGenDocOffsetsAmongstIterators();
-            }
-            $this->current_block_fresh = false;
-            $this->seen_docs += 1;
-            $this->seen_docs_unfiltered = 0;
-            $least= $this->least_offset_index;
-            if (!isset($this->index_bundle_iterators[$least])) {
-                return;
-            }
-            $this->seen_docs_unfiltered +=
-                $this->index_bundle_iterators[$least]->seen_docs;
-            $total_num_docs += $this->index_bundle_iterators[$least]->num_docs;
-            $this->index_bundle_iterators[$least]->advance();
-        }
-        if ($this->seen_docs_unfiltered > 0) {
-            $this->num_docs =
-                floor(($this->seen_docs * $total_num_docs) /
-                $this->seen_docs_unfiltered);
-        }
-    }
-    /**
-     * This method is supposed to set
-     * the value of the result_per_block field. This field controls
-     * the maximum number of results that can be returned in one go by
-     * currentDocsWithWord(). This method cannot be consistently
-     * implemented for this iterator and expect it to behave nicely
-     * it this iterator is used together with union_iterator or
-     * intersect_iterator. So to prevent a user for doing this, calling this
-     * method results in a user defined error
-     *
-     * @param int $num the maximum number of results that can be returned by
-     *     a block
-     */
-     public function setResultsPerBlock($num) {
-        if ($num != 1) {
-            trigger_error("Cannot set the results per block of
-                a phrase iterator", E_USER_ERROR);
-        }
-     }
-}
diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php
index 9b9d36b60..bb59a63a3 100755
--- a/src/library/index_bundle_iterators/DocIterator.php
+++ b/src/library/index_bundle_iterators/DocIterator.php
@@ -98,6 +98,12 @@ class DocIterator extends IndexBundleIterator
      * @var int
      */
     public $current_offset;
+    /**
+     * How url, keywords, and title words should influence relevance
+     * and doc rank calculations
+     * @var array
+     */
+    public $ranking_factors;
     /**
      * An array of shard docids_lens
      * @var array
@@ -135,16 +141,17 @@ class DocIterator extends IndexBundleIterator
      *  added. Note: this value is not saved permanently. So you
      *  could in theory open two read only versions of the same bundle but
      *  reading the results in different directions
-     * @param int $results_per_block the maximum number of results that can
-     *  be returned by a findDocsWithWord call
+     * @param array $ranking_factors field says url being a host, cld,
+     *  or having a lot of slashes should affect its doc rank calculations
      */
     public function __construct($index_name, $filter = null,
         $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK,
-        $direction = self::ASCENDING)
+        $direction = self::ASCENDING, $ranking_factors = [])
     {
         $this->filter = $filter;
         $this->index_name =  $index_name;
         $this->direction = $direction;
+        $this->ranking_factors = $ranking_factors;
         $this->index_version = IndexManager::getVersion($index_name);
         $index = IndexManager::getIndex($index_name, $direction);
         if (empty($index)) {
@@ -285,9 +292,15 @@ class DocIterator extends IndexBundleIterator
                 $doc_id = $doc_keys[$this->next_offset];
                 $doc_info = $doc_map_tools->unpack($doc_map[$doc_id]);
                 $item = [self::GENERATION => $this->current_generation];
-                list($item[self::DOC_LEN], $item[self::SCORE]) =
+                $item[self::DOC_RANK] = $this->computeDocRank($doc_id,
+                    $this->next_offset, $this->current_generation,
+                    $this->num_generations, $this->last_offset,
+                    $this->last_offset, $this->last_offset,
+                    $this->ranking_factors, $is_ascending);
+                list($item[self::DOC_LEN], ) =
                     array_values(array_shift($doc_info));
-                list($item['TITLE_LENGTH'], $num_description_scores) =
+                $item[self::SCORE] = $item[self::DOC_RANK];
+                list(, $num_description_scores) =
                     array_values(array_shift($doc_info));
                 $item[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0,
                     $num_description_scores);
diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php
index 50c89a417..dfb27e68d 100644
--- a/src/library/index_bundle_iterators/IndexBundleIterator.php
+++ b/src/library/index_bundle_iterators/IndexBundleIterator.php
@@ -32,8 +32,12 @@ namespace seekquarry\yioop\library\index_bundle_iterators;

 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\library as L;
-use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\IndexDocumentBundle;
+use seekquarry\yioop\library\PhraseParser;
+use seekquarry\yioop\library\PartitionDocumentBundle;
+
+

 /** For toHexString and Yioop constants*/
 require_once __DIR__."/../Utility.php";
@@ -99,7 +103,7 @@ abstract class IndexBundleIterator implements CrawlConstants
      *     next block must all have $doc_offsets larger than or equal to
      *     this value
      */
-    abstract function advance($gen_doc_offset = null);
+    public abstract function advance($gen_doc_offset = null);
     /**
      * Gets the doc_offset and generation for the next document that
      * would be return by this iterator
@@ -107,14 +111,47 @@ abstract class IndexBundleIterator implements CrawlConstants
      * @return mixed an array with the desired document offset
      * and generation; -1 on fail
      */
-    abstract function currentGenDocOffsetWithWord();
+    public abstract function currentGenDocOffsetWithWord();
     /**
      * Hook function used by currentDocsWithWord to return the current block
      * of docs if it is not cached
      *
      * @return mixed doc ids and score if there are docs left, -1 otherwise
      */
-     abstract function findDocsWithWord();
+     public abstract function findDocsWithWord();
+     /**
+      * Returns the maximum value of the query-independent Doc Quality
+      * score (DOC_RANK) a document returned by this iterator can have.
+      * Currently, a maximum of 5 is hard-coded for all documents;
+      * computeDocRank() multiplies its result by this value. This will
+      * likely need to be revisited in the future.
+      *
+      * @return int|float maximum score for document quality (5 here)
+      */
+     public function getMaxDocQualityScore()
+     {
+         return 5;
+     }
+     /**
+      * Returns the maximum relevance score a document returned by this
+      * iterator can have for the query underlying the iterator.
+      * This base implementation returns the small constant 0.01.
+      * @return float maximum score for document relevance to a query
+      */
+     public function getMaxRelevanceScore()
+     {
+         return 0.01;
+     }
+     /**
+      * Calculates the maximum overall score for any document returned by
+      * this iterator as the sum of getMaxDocQualityScore() and
+      * getMaxRelevanceScore(). Subclasses should override it where sensible.
+      * @return float maximum score
+      */
+    public function getMaxScore()
+    {
+        return $this->getMaxDocQualityScore() + $this->getMaxRelevanceScore();
+    }
     /**
      * Returns a string representation of a plan by which the current iterator
      * finds its results
@@ -281,4 +318,94 @@ abstract class IndexBundleIterator implements CrawlConstants
      {
         $this->results_per_block = $num;
      }
+    /**
+     * Computes DOC_RANK, a query-independent document quality score, as
+     * a sum of signals: how early or late the document was added to the
+     * index, whether its url was a CLD or HOST, whether the page was a
+     * wiki page, and the number of slashes in the url path. The sum is
+     * divided by the largest total the bonuses can contribute and then
+     * scaled by getMaxDocQualityScore().
+     *
+     * @param string $doc_key doc id of the document to compute the rank of
+     * @param int $doc_map_index position of the document within its
+     *  partition
+     * @param int $num_seen_partitions number of partitions already seen
+     * @param int $number_of_partitions total number of partitions
+     * @param int $num_doc_keys number of doc keys in the document's
+     *  partition
+     * @param int $avg_items_per_partition average number of items in a
+     *  partition
+     * @param int $max_items_per_partition maximum number of items in a
+     *  partition
+     * @param array $ranking_factors bonus weights keyed by CLD_URL_BONUS,
+     *  HOST_URL_BONUS, WIKI_BONUS, and NUM_SLASHES_BONUS; a missing key
+     *  defaults to 1
+     * @param bool $is_ascending whether the index is traversed in
+     *  ascending order
+     * @return float document quality score for $doc_key
+     */
+    public function computeDocRank($doc_key, $doc_map_index = 1,
+        $num_seen_partitions = 0, $number_of_partitions = 1,
+        $num_doc_keys = PartitionDocumentBundle::MAX_ITEMS_PER_FILE,
+        $avg_items_per_partition = PartitionDocumentBundle::MAX_ITEMS_PER_FILE,
+        $max_items_per_partition = PartitionDocumentBundle::MAX_ITEMS_PER_FILE,
+        $ranking_factors = [], $is_ascending = true)
+    {
+        $cld_bonus = $ranking_factors["CLD_URL_BONUS"] ?? 1;
+        $host_bonus = $ranking_factors["HOST_URL_BONUS"] ?? 1;
+        $wiki_bonus = $ranking_factors["WIKI_BONUS"] ?? 1;
+        $num_slashes_bonus = $ranking_factors["NUM_SLASHES_BONUS"] ?? 1;
+        /*
+           Normalizing constant: one term for each bonus weight that can
+           be added below, plus 1.
+         */
+        $max_pre_rank_and_bonuses = $cld_bonus + $host_bonus +
+            $wiki_bonus + $num_slashes_bonus + 1;
+        $last_partition_pos =  ($is_ascending) ?
+            $num_doc_keys - $doc_map_index :
+            $doc_map_index;
+        $remaining_partitions =  ($is_ascending) ?
+            $number_of_partitions - $num_seen_partitions :
+            $num_seen_partitions - 1;
+        /*
+           Only the arguments passed in are used here (not properties
+           of $this), so the result depends solely on what the caller
+           supplies. max(1, ...) avoids a division by zero if a caller
+           passes 0 for $max_items_per_partition.
+         */
+        $pre_rank_and_bonuses = ($remaining_partitions *
+            $avg_items_per_partition)/
+            (($number_of_partitions + 1) *
+            ($avg_items_per_partition + 1)) +
+            $last_partition_pos / max(1, $max_items_per_partition);
+        if (IndexDocumentBundle::isAHostDocId($doc_key)) {
+            $pre_rank_and_bonuses +=
+                (IndexDocumentBundle::isACldDocId($doc_key)) ?
+                $cld_bonus : $host_bonus;
+        }
+        /**
+         * For backward compatibility: new bonuses should only be added
+         * for  doc_ids following the new letter_code format. Since all
+         * old formats use letters (b, t, etc.) to denote the doc
+         * type, the ASCII values for these letters are all > 96 (i.e.,
+         * bits 6 and 7 of the doc_id's 9th byte are both true).
+         * Since all new letter_code formats use bits 4, 5, 6, 7 to
+         * represent  the doc type as int values mapped between 0-8,
+         * there is no value in a doc_id's 9th byte that can have both
+         * bits 6 and 7  set to true.
+         * This difference can be used to check whether $doc_key follows
+         * the old or new letter_code format.
+         */
+        $doc_id_format = ord($doc_key[
+            IndexDocumentBundle::DOCID_PART_LEN << 1] ?? 0) & 96;
+        if ($doc_id_format != 96) {
+            if (IndexDocumentBundle::isAWikipediaPage($doc_key)) {
+                $pre_rank_and_bonuses += $wiki_bonus;
+            }
+            $pre_rank_and_bonuses  += $num_slashes_bonus /
+                (IndexDocumentBundle::findNumSlashes($doc_key) + 1);
+        }
+        return $this->getMaxDocQualityScore() *
+            $pre_rank_and_bonuses / $max_pre_rank_and_bonuses;
+    }
 }
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index e553da3d5..2acfd225f 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -546,16 +546,17 @@ class IntersectIterator extends IndexBundleIterator
         }
      }
      /**
-      * Returns the sum of maxScores for nested WordIterators
-      *
-      * @return int maxScore
+      * Returns the maximum relevance score a document in the intersection
+      * can have for the underlying query, computed as the sum of the
+      * maximum relevance scores of the constituent iterators
+      * @return float maximum score for document relevance to a query
       */
-    public function getMaxScore()
+    public function getMaxRelevanceScore()
     {
-        $maxScore = 0;
+        $max_relevance = 0;
         foreach ($this->index_bundle_iterators as $iterator) {
-            $maxScore += $iterator->getMaxScore();
+            $max_relevance += $iterator->getMaxRelevanceScore();
         }
-        return $maxScore;
+        return $max_relevance;
     }
 }
diff --git a/src/library/index_bundle_iterators/NegationIterator.php b/src/library/index_bundle_iterators/NegationIterator.php
index aef26eb6f..8d8c7c772 100644
--- a/src/library/index_bundle_iterators/NegationIterator.php
+++ b/src/library/index_bundle_iterators/NegationIterator.php
@@ -119,7 +119,7 @@ class NegationIterator extends IndexBundleIterator
             //we get intersect docs one at a time so should be only one
             $keys = array_keys($docs);
             $key = $keys[0];
-            $docs[$key][self::RELEVANCE] = 1;
+            $docs[$key][self::RELEVANCE] = 0.01;
             $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] +
                  $docs[$key][self::RELEVANCE];
         }
diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php
index f48518c2c..f966d3f8c 100644
--- a/src/library/index_bundle_iterators/UnionIterator.php
+++ b/src/library/index_bundle_iterators/UnionIterator.php
@@ -92,30 +92,6 @@ class UnionIterator extends IndexBundleIterator
      * @var array
      */
     public $results_heap;
-    /**
-     * Heap constant to track the next occurrence of the term on a consituent
-     * iterator
-     */
-    const NEXT_DOC = 'NEXT_DOC';
-    /**
-     * Heap constant to track the index of a consituent iterator on
-     * $index_bundle_iterators
-     */
-    const ITERATOR = 'ITERATOR';
-    /**
-     * Heap constant to track the MaxScore of the term on a consituent
-     * iterator
-     */
-    const MAX_SCORE = 'MAX_SCORE';
-    /**
-     * Heap constant to track the doc fetched by a consituent iterator
-     */
-    const DOC = 'DOC';
-    /**
-     * Heap constant to track the score of a doc fetched by a consituent
-     * iterator
-     */
-    const DOC_SCORE = 'DOC_SCORE';
     /**
      * Creates a union iterator with the given parameters.
      *
@@ -142,7 +118,7 @@ class UnionIterator extends IndexBundleIterator
         $this->total_num_docs = $total_num_docs;
         $this->low_scoring_terms = [];
         for ($i = 0; $i < self::RESULTS_PER_BLOCK; $i++) {
-            $this->results_heap[$i][self::DOC_SCORE] = 0;
+            $this->results_heap[$i][self::SCORE] = 0;
         }
         $this->initializeTermsHeap($this->terms_heap);
         for ($i = 0; $i < $this->num_iterators; $i++) {
@@ -234,20 +210,19 @@ class UnionIterator extends IndexBundleIterator
         }

         if (!empty($doc) && $relevance_score >
-            $this->results_heap[0][self::DOC_SCORE]) {
+            $this->results_heap[0][self::SCORE]) {
             // Update the document's scores
             $doc[self::RELEVANCE] = $relevance_score;
-            $score = $relevance_score + $doc[self::DOC_RANK];
-            $doc[self::SCORE] = $score;
+            $doc[self::SCORE] = $relevance_score + $doc[self::DOC_RANK];
             $found_docs = true;
-            $this->results_heap[0][self::DOC] = $doc;
-            $this->results_heap[0][self::DOC_SCORE] = $score;
+            $this->results_heap[0] = [self::DOC_INFO => $doc,
+                self::SCORE => $doc[self::SCORE]];
             $this->heapifyDown($this->results_heap, false);
         }
-        $found_top_results = $this->results_heap[0][self::DOC_SCORE] > 0;
+        $found_top_results = $this->results_heap[0][self::SCORE] > 0;
         if (!$found_docs || $found_top_results) {
-            $pages = ($this->
-                results_heap[self::RESULTS_PER_BLOCK-1][self::DOC_SCORE] == 0) ?
+            $pages = ($this->results_heap[
+                self::RESULTS_PER_BLOCK - 1][self::SCORE] == 0) ?
                 -1 : $this->getResultsHeap();
         } else {
             $pages = [$doc];
@@ -270,15 +245,14 @@ class UnionIterator extends IndexBundleIterator
         $pages = [];
         while (!empty($this->results_heap)) {
             $doc = $this->extractMinScoringDoc($this->results_heap);
-            if ($doc[self::DOC_SCORE] > 0) {
-                array_unshift($pages, $doc[self::DOC]);
+            if ($doc[self::SCORE] > 0) {
+                array_unshift($pages, $doc[self::DOC_INFO]);
             }
         }
         // Re-initialize the results heap for the next set of docs
-        $this->results_heap = [];
-        for ($i = 0; $i < self::RESULTS_PER_BLOCK; $i++) {
-            $this->results_heap[$i][self::DOC_SCORE] = 0;
-        }
+        $initial_heap_item = [self::SCORE => 0, self::DOC_INFO => null];
+        $this->results_heap = array_fill(0, self::RESULTS_PER_BLOCK,
+            $initial_heap_item);
         if (empty($pages)) {
             $pages = -1;
         }
@@ -296,21 +270,23 @@ class UnionIterator extends IndexBundleIterator
     {
         $is_ascending = $this->getDirection();
         if (!$is_terms_heap) {
-            return $i[self::DOC_SCORE] > $j[self::DOC_SCORE];
+            return $i[self::SCORE] > $j[self::SCORE];
         }
+        $i_next_doc = $i[self::NEXT_DOC];
+        $j_next_doc = $j[self::NEXT_DOC];
         if ($is_ascending) {
-            if ($i[self::NEXT_DOC] == -1) {
+            if ($i_next_doc == -1) {
                 return true;
             } else if ($j[self::NEXT_DOC] == -1) {
                 return false;
             }
-            return $i[self::NEXT_DOC][0] > $j[self::NEXT_DOC][0] ||
-                ($i[self::NEXT_DOC][0] == $j[self::NEXT_DOC][0] &&
-                    $i[self::NEXT_DOC][1] > $j[self::NEXT_DOC][1]);
+            return $i_next_doc[0] > $j_next_doc[0] ||
+                ($i_next_doc[0] == $j_next_doc[0] &&
+                    $i_next_doc[1] > $j_next_doc[1]);
         } else {
-            return $j[self::NEXT_DOC][0] > $i[self::NEXT_DOC][0] ||
-                ($j[self::NEXT_DOC][0] == $i[self::NEXT_DOC][0] &&
-                    $j[self::NEXT_DOC][1] > $i[self::NEXT_DOC][1]);
+            return $j_next_doc[0] > $i_next_doc[0] ||
+                ($j_next_doc[0] == $i_next_doc[0] &&
+                    $j_next_doc[1] > $i_next_doc[1]);
         }
     }
     /**
@@ -324,8 +300,8 @@ class UnionIterator extends IndexBundleIterator
         $index = 0;
         $heap_size = count($heap);
         while ($index < $heap_size) {
-            $left = $index * 2 + 1;
-            $right = $index * 2 + 2;
+            $left = ($index << 1) + 1;
+            $right = ($index + 1) << 1;
             $least_doc = $index;
             if ($left < $heap_size &&
                 $this->compareElements($heap[$least_doc], $heap[$left],
@@ -357,7 +333,7 @@ class UnionIterator extends IndexBundleIterator
     {
         $index = count($heap) - 1;
         while ($index > 0) {
-            $parent_index = floor(($index - 1) / 2);
+            $parent_index = ($index - 1) >> 1;
             if ($this->compareElements($heap[$index], $heap[$parent_index],
                 $is_terms_heap)) {
                 break;
@@ -395,14 +371,13 @@ class UnionIterator extends IndexBundleIterator
         if (!empty($terms)) {
             return;
         }
-        for ($i = 0; $i < $this->num_iterators; $i++) {
+        $num_iterators = $this->num_iterators;
+        for ($i = 0; $i < $num_iterators; $i++) {
             $iterator =  $this->index_bundle_iterators[$i];
-            $max_score = $iterator->getMaxScore();
-            $position = $iterator->currentGenDocOffsetWithWord();
             $terms[] = [
                 self::ITERATOR => $i,
-                self::MAX_SCORE => $max_score,
-                self::NEXT_DOC => $position
+                self::MAX_SCORE => $iterator->getMaxScore(),
+                self::NEXT_DOC => $iterator->currentGenDocOffsetWithWord()
             ];
             $this->heapifyUp($terms, true);
         }
@@ -420,7 +395,7 @@ class UnionIterator extends IndexBundleIterator
         $this->seen_docs_unfiltered += $this->count_block_unfiltered;
         $total_num_docs = 0;
         $d = $this->currentGenDocOffsetWithWord();
-        $score_k = $this->results_heap[0][self::DOC_SCORE];
+        $score_k = $this->results_heap[0][self::SCORE];
         while ($d != -1 && $this->terms_heap[0][self::NEXT_DOC] === $d) {
             $iterator_idx = $this->terms_heap[0][self::ITERATOR];
             $iterator = $this->index_bundle_iterators[$iterator_idx];
@@ -486,4 +461,18 @@ class UnionIterator extends IndexBundleIterator
             (key_exists(self::NEXT_DOC, $this->terms_heap[0]) ?
                 $this->terms_heap[0][self::NEXT_DOC] : -1) : -1;
     }
+    /**
+     * Returns the maximum relevance score a document returned by this
+     * iterator can have for the underlying query, computed as the sum of
+     * the maximum relevance scores of the constituent iterators
+     * @return float maximum score for document relevance to a query
+     */
+    public function getMaxRelevanceScore()
+    {
+        $max_relevance = 0;
+        foreach ($this->index_bundle_iterators as $iterator) {
+            $max_relevance += $iterator->getMaxRelevanceScore();
+        }
+        return $max_relevance;
+    }
 }
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 3592c8250..746934639 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -574,8 +574,8 @@ class WordIterator extends IndexBundleIterator
                     IndexManager::lookupLatestVersionPage($url_hash,
                     $this->index_name);
                 if ($latest_version_info != null) {
-                    $latest_partition = $latest_version_info[0];
-                    $latest_posting = $latest_version_info[1];
+                    list($latest_partition, $latest_posting) =
+                        $latest_version_info;
                     /**
                      * Ensure that the discovered latest version
                      * isn't the same as the current posting.
@@ -584,7 +584,7 @@ class WordIterator extends IndexBundleIterator
                         $latest_posting['DOC_MAP_INDEX'] !=
                         $doc_map_index) {
                         $latest_base_folder = $index->
-                        getPartitionBaseFolder($latest_partition);
+                            getPartitionBaseFolder($latest_partition);
                         $latest_doc_map_filename = $latest_base_folder .
                             "/" . IndexDocumentBundle::DOC_MAP_FILENAME;
                         $latest_doc_map_index =
@@ -670,53 +670,11 @@ class WordIterator extends IndexBundleIterator
                     (max(1, $time - $original_score)) *
                     $this->getMaxDocQualityScore();
             } else {
-                $cld_bonus = $this->ranking_factors["CLD_URL_BONUS"];
-                $host_bonus = $this->ranking_factors["HOST_URL_BONUS"];
-                $wiki_bonus = $this->ranking_factors["WIKI_BONUS"];
-                $num_slashes_bonus =
-                    $this->ranking_factors["NUM_SLASHES_BONUS"];
-                $max_pre_rank_and_bonuses = $cld_bonus + $host_bonus +
-                    $wiki_bonus + $wiki_bonus + 1;
-                $last_partition_pos =  ($is_ascending) ?
-                    $num_doc_keys - $doc_map_index :
-                    $doc_map_index;
-                $remaining_partitions =  ($is_ascending) ?
-                    $number_of_partitions - $num_seen_partitions :
-                    $num_seen_partitions - 1;
-                $pre_rank_and_bonuses = ($remaining_partitions *
-                    $this->avg_items_per_partition)/
-                    (($number_of_partitions + 1) *
-                    ($this->avg_items_per_partition + 1)) +
-                    $last_partition_pos / $this->max_items_per_partition;
-                if (IndexDocumentBundle::isAHostDocId($doc_key)) {
-                    $pre_rank_and_bonuses +=
-                        (IndexDocumentBundle::isACldDocId($doc_key)) ?
-                        $cld_bonus : $host_bonus;
-                }
-                /**
-                 * For backward compatibility: new bonuses should only be added
-                 * for  doc_ids following the new letter_code format. Since all
-                 * old formats use letters (b, t, etc.) to denote the doc
-                 * type, the ASCII values for these letters are all > 96 (i.e.,
-                 * bits 6 and 7 of the doc_id's 9th byte are both true).
-                 * Since all new letter_code formats use bits 4, 5, 6, 7 to
-                 * represent  the doc type as int values mapped between 0-8,
-                 * there is no value in a doc_id's 9th byte that can have both
-                 * bits 6 and 7  set to true.
-                 * This difference can be used to check whether $doc_key follows
-                 * the old or new letter_code format.
-                 */
-                $doc_id_format = ord($doc_key[
-                    IndexDocumentBundle::DOCID_PART_LEN << 1] ?? 0) & 96;
-                if ($doc_id_format != 96) {
-                    if (IndexDocumentBundle::isAWikipediaPage($doc_key)) {
-                        $pre_rank_and_bonuses += $wiki_bonus;
-                    }
-                    $pre_rank_and_bonuses  += $num_slashes_bonus /
-                        (IndexDocumentBundle::findNumSlashes($doc_key) + 1);
-                }
-                $posting[self::DOC_RANK] = $this->getMaxDocQualityScore() *
-                    $pre_rank_and_bonuses / $max_pre_rank_and_bonuses;
+                $posting[self::DOC_RANK] = $this->computeDocRank($doc_key,
+                    $doc_map_index, $num_seen_partitions, $number_of_partitions,
+                    $num_doc_keys, $this->avg_items_per_partition,
+                    $this->max_items_per_partition,
+                    $this->ranking_factors, $is_ascending);
             }
             list($preface_positions, $num_description_scores) =
                 array_values(array_shift($doc_info));
@@ -763,30 +721,20 @@ class WordIterator extends IndexBundleIterator
         return $key_postings;
     }
     /**
-     * This method calculates the maxScore value for the relevance calculation
-     * of the term to the query
+     * This method calculates the maximum relevance score that the term of
+     * this iterator can contribute to a document's score for the query
      * @return float maximum score for document relevance to a query
      */
     public function getMaxRelevanceScore()
     {
         $occurrences_per_doc = $this->num_occurrences /
             max($this->total_num_docs, 1);
-        $max_score = 1 + log(1 + 1/max(1, $occurrences_per_doc), 2);
-        return $max_score;
-    }
-    /**
-     * This method calculates the maxScore value for the Doc Quality calculation
-     * for a document and a query
-     * @return float maximum score for document quality
-     */
-    public function getMaxDocQualityScore()
-    {
-        $max_score = 5;
+        $max_score = 1 + log(1 + 1 / max(1, $occurrences_per_doc), 2);
         return $max_score;
     }
     /**
-     * This method calculates the maxScore value for the Doc Quality calculation
-     * for a document and a query
+     * This method calculates the maximum overall score value for any document
+     * returned by this iterator.
      * @return float maxScore
      */
     public function getMaxScore()
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index d55566c25..6eb1b2e16 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -1905,7 +1905,7 @@ class PhraseModel extends ParallelModel
                         }
                         $word_iterators[$i] = new I\DocIterator(
                             $actual_index_name, $filter, $to_retrieve,
-                            $direction);
+                            $direction, $ranking_factors);
                         $min_group_override = true;
                     } else {
                         $distinct_key = $distinct_word_keys[$i];

ViewGit