Tweaks to disjunctive query handling

Chris Pollett [2023-11-27 22:Nov:th]

Tweaks to disjunctive query handling

Filename
src/configs/Config.php
src/configs/PublicHelpPages.php
src/controllers/SearchController.php
src/controllers/components/CrawlComponent.php
src/data/public_default.db
src/examples/SearchApi.php
src/executables/QueryTool.php
src/library/CrawlConstants.php
src/library/IndexDocumentBundle.php
src/library/IndexShard.php
src/library/index_bundle_iterators/GroupIterator.php
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/NegationIterator.php
src/library/index_bundle_iterators/NetworkIterator.php
src/library/index_bundle_iterators/UnionIterator.php
src/library/index_bundle_iterators/WordIterator.php
src/models/PhraseModel.php
src/models/ProfileModel.php
src/views/elements/PageoptionsElement.php
src/views/elements/SearchElement.php
tests/BPlusTreeTest.php
tests/IndexDocumentBundleTest.php
tests/WordIteratorTest.php

diff --git a/src/configs/Config.php b/src/configs/Config.php
index 557fc8cff..b3c56e6ef 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -573,12 +573,6 @@ if (file_exists(WORK_DIRECTORY . PROFILE_FILE_NAME)) {
      *  each
      */
     nsdefine('TITLE_BONUS', 5);
-    /**
-     *  Phrase Proximity scores are normalized between 0 and 1, this is
-     *  the weighting factor to multiply that basic score before adding it
-     *  to the overall score used for ranking
-     */
-    nsdefine('PROXIMITY_BONUS', 20);
     /**
      *  Bonus to add to relevance score if the url path contains the search
      *  term. If the path has 1 term bonus would be 3 if 3 terms then 1
diff --git a/src/configs/PublicHelpPages.php b/src/configs/PublicHelpPages.php
index fd2893d54..80533cd9f 100644
--- a/src/configs/PublicHelpPages.php
+++ b/src/configs/PublicHelpPages.php
@@ -43404,7 +43404,6 @@ END_HEAD_VARSThe score used to rank a page is computed as the document rank scor
 ; &#039;&#039;&#039;Host Keyword Bonus&#039;&#039;&#039; :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of host name keywords is the fraction of this bonus that will be added to the relevance score.
 ; &#039;&#039;&#039;Title Bonus&#039;&#039;&#039; :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of words in the title is the fraction of this bonus that will be added to the relevance score.
 ; &#039;&#039;&#039;Path Bonus&#039;&#039;&#039; :  Potential bonus to add to relevance score. The number of occurrences of search term divided by the number of words in the path portion of the url is the fraction of this bonus that will be added to the relevance score.
-; &#039;&#039;&#039;Proximity Bonus&#039;&#039;&#039; : Proximity scores for multi-term queries are normalized between 0 and 1, this is the weighting factor to multiply that basic score before adding it to the overall score used for ranking
 ; &#039;&#039;&#039;CLD Url Bonus&#039;&#039;&#039; : Bonus to add to doc rank score if the url is a company level domain.
 ; &#039;&#039;&#039;Host Url Bonus&#039;&#039;&#039; : Bonus to add to doc rank score if the url is a a hostname.
 ; &#039;&#039;&#039;User Rank  Bonus&#039;&#039;&#039; : User rank scores (created by making a classifier) for a document are normalized between 0 and 1, this is the weighting factor to multiply that basic score before adding it to the overall score used for ranking.
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index f886acfe6..424118f24 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -336,7 +336,6 @@ class SearchController extends Controller implements CrawlConstants
             "host_url_bonus" => C\HOST_URL_BONUS,
             "host_keyword_bonus" => C\HOST_KEYWORD_BONUS,
             "path_keyword_bonus" => C\PATH_KEYWORD_BONUS,
-            "proximity_bonus" => C\PROXIMITY_BONUS,
             "title_bonus" => C\TITLE_BONUS,
             "user_rank_bonus" => C\USER_RANK_BONUS,
             "wiki_bonus" => C\WIKI_BONUS,
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 32b9740a3..f84b20285 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1703,7 +1703,6 @@ class CrawlComponent extends Component implements CrawlConstants
         }
         $bonuses = ['HOST_KEYWORD_BONUS' => 6,
             'TITLE_BONUS' => 5, 'PATH_KEYWORD_BONUS' => 3,
-            'PROXIMITY_BONUS' => 20,
             'CLD_URL_BONUS' => 2, 'HOST_URL_BONUS' => 0.5,
             'WIKI_BONUS' => 0.5, 'NUM_SLASHES_BONUS' => 0.5,
             'MIN_RESULTS_TO_GROUP' => C\MIN_RESULTS_TO_GROUP,
diff --git a/src/data/public_default.db b/src/data/public_default.db
index 3498429fc..cdff7bb0d 100644
Binary files a/src/data/public_default.db and b/src/data/public_default.db differ
diff --git a/src/examples/SearchApi.php b/src/examples/SearchApi.php
index c91810d28..820470461 100644
--- a/src/examples/SearchApi.php
+++ b/src/examples/SearchApi.php
@@ -147,7 +147,6 @@ function outputQueryData($data)
             wordwrap(trim($page[CrawlConstants::DESCRIPTION]))."\n";
         echo "Rank: ".$page[CrawlConstants::DOC_RANK]."\n";
         echo "Relevance: ".$page[CrawlConstants::RELEVANCE]."\n";
-        echo "Proximity: ".$page[CrawlConstants::PROXIMITY]."\n";
         echo "Score: ".$page[CrawlConstants::SCORE]."\n";
         echo "============\n\n";
     }
diff --git a/src/executables/QueryTool.php b/src/executables/QueryTool.php
index fa0aec820..22106463a 100755
--- a/src/executables/QueryTool.php
+++ b/src/executables/QueryTool.php
@@ -115,7 +115,6 @@ class QueryTool implements CrawlConstants
                 "\n";
             echo "Rank: " . $page[self::DOC_RANK] . "\n";
             echo "Relevance: " . $page[self::RELEVANCE] . "\n";
-            echo "Proximity: " . $page[self::PROXIMITY] . "\n";
             echo "Score: " . $page[self::SCORE] . "\n";
             if (!empty($page[self::PINNED])) {
                 echo "This was a pinned result\n";
@@ -124,7 +123,6 @@ class QueryTool implements CrawlConstants
         }
         $data['ELAPSED_TIME'] = L\changeInMicrotime($start_time);
         echo "QUERY STATISTICS\n";
-
         echo "============\n";
         echo "ELAPSED TIME: ".$data['ELAPSED_TIME']."\n";
         if (isset($data['LIMIT'])) {
diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index 37b2ddf52..0353ea6ce 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -183,7 +183,6 @@ interface CrawlConstants
     const INDEXING_PLUGINS = 'bl';
     const DOMAIN_WEIGHTS = 'bm';
     const POSITION_LIST = 'bn';
-    const PROXIMITY = 'bo';
     const LOCATION = 'bp';
     const INDEXED_FILE_TYPES = 'bq';
     const PAGE_RANGE_REQUEST = 'br';
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index eb1fc3075..c22d888a4 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -54,7 +54,7 @@ class IndexDocumentBundle implements CrawlConstants
      * The version of this IndexDocumentBundle. The lowest format number is
      * 3.0 as prior inverted index/document stores used IndexArchiveBundle's
      */
-    const DEFAULT_VERSION = "3.2";
+    const DEFAULT_VERSION = "3.3";
     /**
      * Default values for the configuration parameters of an
      * IndexDocumentBundle
@@ -83,16 +83,21 @@ class IndexDocumentBundle implements CrawlConstants
     /**
      * Length of terms' bloom filter string in bytes
      */
-    const WORDSFILTER_LEN = 125;
+    const TERMSFILTER_LEN = 125;
+    /**
+     * Number of terms from a doc to store in term filter (this
+     * would typically be the top terms according to some metric)
+     */
+    const NUM_TERMS_FILTER = 300;
     /**
      * Length of terms' bloom filter string in bits
      */
-    const WORDSFILTER_BITS_LEN = 1000;
+    const TERMSFILTER_BITS_LEN = 1000;
     /**
      * Number of hash functions to use while constructing the
      * terms' bloom filter string
      */
-    const WORDSFILTER_HASHFN_COUNT = 3;
+    const TERMSFILTER_HASHFN_COUNT = 3;
     /**
      * Partition i in an IndexDocumentBundle has a subfolder i
      * within self::POSITIONS_DOC_MAP_FOLDER. Within this subfolder i,
@@ -971,15 +976,15 @@ class IndexDocumentBundle implements CrawlConstants
         if (empty($doc_id)) {
             return "";
         }
-        $words = array_keys($word_lists);
-        if (count($words) > 300) {
-            $words = array_slice($words, 0, 300);
+        $terms = array_keys($word_lists);
+        if (count($terms) > self::NUM_TERMS_FILTER) {
+            $terms = array_slice($terms, 0, self::NUM_TERMS_FILTER);
         }
-        $words_filter = $this->storeWords($words);
+        $terms_filter = $this->storeTerms($terms);
         $this->addScoresDocMap($doc_id, $num_words,
             $url_info[self::SCORE], $host_keywords_end_pos, $title_end_pos,
             $path_keywords_end_pos, $description_scores,
-            $user_ranks, $words_filter);
+            $user_ranks, $terms_filter);
         $this->addTermPostingLists(0, $num_words,
             $word_lists, $meta_ids, $this->doc_map_counter);
         $this->doc_map_counter++;
@@ -1085,7 +1090,7 @@ class IndexDocumentBundle implements CrawlConstants
      */
     public function addScoresDocMap($doc_id, $num_words, $score,
         $host_keywords_end_pos, $title_end_pos, $path_keywords_end_pos,
-        $description_scores, $user_ranks, $words_filter = "")
+        $description_scores, $user_ranks, $terms_filter = "")
     {
         $num_description_scores = count($description_scores);
         $preface_positions =
@@ -1104,36 +1109,59 @@ class IndexDocumentBundle implements CrawlConstants
         /* the doc_map entry is prepended with a string representing
          * the bloom filter of terms in the document
          */
-        $entry = $words_filter . $entry;
+        $entry = $terms_filter . $entry;
         $this->doc_map_tools->add($this->doc_map, $doc_id, $entry,
             PackedTableTools::ADD_MEM_TABLE_STRING);
     }
     /**
-     * Creates a bloom filter string made up of the top 300 most
+     * Creates a bloom filter string made up of the 300 most
      * important terms in the current document. This filter is used
      * later to check if a term belongs to the document.
      *
-     * @param array $words terms in document
+     * @param array $terms terms in document
      * @return string term bloom filter, prepended with 't'
      * to check for backward compatibility
      */
-    public static function storeWords($words)
+    public static function storeTerms($terms)
     {
-        $hash_functions = self::WORDSFILTER_HASHFN_COUNT;
-        $size = self::WORDSFILTER_BITS_LEN;
-        $words_filter = str_repeat('0', self::WORDSFILTER_LEN);
-        foreach ($words as $word) {
-            $word = canonicalTerm($word);
+        $hash_functions = self::TERMSFILTER_HASHFN_COUNT;
+        $size = self::TERMSFILTER_BITS_LEN;
+        $terms_filter = str_repeat(chr(0), self::TERMSFILTER_LEN);
+        foreach ($terms as $term) {
+            $term = canonicalTerm($term);
             for ($i = 0; $i < $hash_functions; $i++) {
-                $hash = crc32($word . $i) % $size;
-                $byte = (int)($hash / 8);
-                $bit = $hash % 8;
-                $ascii_char = ord($words_filter[$byte]);
+                $hash = crc32($term . $i) % $size;
+                $byte = (int)($hash >> 3);
+                $bit = $hash & 7;
+                $ascii_char = ord($terms_filter[$byte]);
                 $ascii_char |= (1 << $bit);
-                $words_filter[$byte] = chr($ascii_char);
+                $terms_filter[$byte] = chr($ascii_char);
             }
         }
-        return 't' . $words_filter;
+        return 't' . $terms_filter;
+    }
+    /**
+     * Tests a term against a doc_map entry's term bloom filter. A true
+     * result can be a false positive; $term is hashed as given.
+     *
+     * @param string $term to look up (in the form storeTerms() hashed)
+     * @param string $terms_filter term bloom filter, without the 't' marker
+     * @return boolean false if definitely absent, true if possibly present
+     */
+    public function checkTermExists($term, $terms_filter)
+    {
+        $hash_functions = self::TERMSFILTER_HASHFN_COUNT;
+        $size = self::TERMSFILTER_BITS_LEN;
+        for ($i = 0; $i < $hash_functions; $i++) {
+            $hash = crc32($term . $i) % $size;
+            // byte and bit must be derived exactly as in storeTerms()
+            $byte = $hash >> 3;
+            $bit = $hash & 7;
+            if ((ord($terms_filter[$byte]) & (1 << $bit)) == 0) {
+                return false;
+            }
+        }
+        return true;
     }
     /**
      * Adds posting records associated to a document to the posting lists for
@@ -1221,7 +1249,7 @@ class IndexDocumentBundle implements CrawlConstants
      */
     public static function isACldDocId($key)
     {
-        return (ord($key[self::DOCID_PART_LEN << 1] ?? '\0') & 128) > 0;
+        return (ord($key[self::DOCID_PART_LEN << 1] ?? chr(0)) & 128) > 0;
     }
     /**
      * Checks if a doc_id $key is that of a Wikipedia page.
@@ -1229,7 +1257,7 @@ class IndexDocumentBundle implements CrawlConstants
      */
     public static function isAWikipediaPage($key)
     {
-        return (ord($key[self::DOCID_PART_LEN << 1] ?? '\0') & 4) > 0;
+        return (ord($key[self::DOCID_PART_LEN << 1] ?? chr(0)) & 4) > 0;
     }
     /**
      * Finds number of '/' in the url after the hostname represented by doc_id
@@ -1238,7 +1266,7 @@ class IndexDocumentBundle implements CrawlConstants
      */
     public static function findNumSlashes($key)
     {
-        return (ord($key[self::DOCID_PART_LEN << 1] ?? '\0') & 3);
+        return (ord($key[self::DOCID_PART_LEN << 1] ?? chr(0)) & 3);
     }

     /**
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index 1df0ab8f2..c89d74120 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -867,8 +867,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         }
         $item[self::DOC_LEN] = $doc_len;
         $item[self::IS_DOC] = $is_doc;
-        $item[self::PROXIMITY] =
-            $this->computeProximity($position_list, $is_doc);
         $item[self::DESCRIPTION_SCORES] = [];
         $doc_id_len = ($num_keys > 3) ? self::DOC_ID_LEN : $num_keys  *
             $doc_key_len; /* original format allowed shorter doc ids,
@@ -1025,20 +1023,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         }
         return $count;
     }
-    /**
-     * Returns a proximity score for a single term based on its location in
-     * doc.
-     *
-     * @param array $position_list locations of term within item
-     * @param bool $is_doc whether the item is a document or not
-     * @return int a score for proximity
-     */
-    public function computeProximity($position_list, $is_doc) {
-        return (!$is_doc) ? self::LINK_WEIGHT :
-            ((isset($position_list[0]) &&
-            $position_list[0] < C\AD_HOC_TITLE_LENGTH) ?
-            self::TITLE_WEIGHT : self::DESCRIPTION_WEIGHT);
-    }
     /**
      * Computes BM25F relevance and a score for the supplied item based
      * on the supplied parameters.
diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php
index 98c9eb6e6..96e560469 100644
--- a/src/library/index_bundle_iterators/GroupIterator.php
+++ b/src/library/index_bundle_iterators/GroupIterator.php
@@ -378,7 +378,7 @@ class GroupIterator extends IndexBundleIterator
     }
     /**
      * For a collection of pages each with the same url, computes the page
-     * with the max score, as well as the max of the ranks, proximity, and
+     * with the max score, as well as the max of the ranks and
      * relevance scores.
      * Stores this information in the first element of the array of pages.
      * This process is described in detail at:
@@ -393,22 +393,18 @@ class GroupIterator extends IndexBundleIterator
     {
         $max_rank = 0;
         $max_relevance = 0;
-        $max_proximity = 0;
         $domain_weights = [];
         foreach ($pre_hash_page as $hash_page) {
             if (isset($hash_page[self::SCORE])) {
                 $max_rank = max($max_rank, $hash_page[self::DOC_RANK]);
                 $max_relevance = max($max_relevance,
                     $hash_page[self::RELEVANCE]);
-                $max_proximity = max($max_proximity,
-                    ($hash_page[self::PROXIMITY] ?? 1));
             }
         }
         $pre_hash_page[0][self::SCORE] = $max_rank + $max_relevance;
         $pre_hash_page[0][self::DOC_RANK] = $max_rank;
         $pre_hash_page[0][self::HASH_URL_COUNT] = count($pre_hash_page);
         $pre_hash_page[0][self::RELEVANCE] = $max_relevance;
-        $pre_hash_page[0][self::PROXIMITY] = $max_proximity;
     }
     /**
      * Forwards the iterator one group of docs
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index e8beae9d5..92f21cbb8 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -225,27 +225,13 @@ class IntersectIterator extends IndexBundleIterator
                     }
                 }
             }
-            if (count($position_lists) > 1) {
-                if ($this->quote_positions === null ||
-                    $this->checkQuotes($position_lists)) {
-                    $docs[$key][self::PROXIMITY] =
-                        $this->computeProximity($position_lists, $len_lists,
-                            ($docs[$key][self::IS_DOC] ?? false),
-                            $docs[$key][self::DOC_LEN]);
-                } else {
-                    $docs = [];
-                }
-            } else {
-                 $docs[$key][self::PROXIMITY] = 0;
-            }
-            if ($docs != []) {
+            if (!empty($docs)) {
                 // proximity is aggregated into score in phrase model
                 $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] +
                      $docs[$key][self::RELEVANCE];
                 if ($weight != 1) {
                     $docs[$key][self::DOC_RANK] *= $weight;
                     $docs[$key][self::RELEVANCE] *= $weight;
-                    $docs[$key][self::PROXIMITY] *= $weight;
                     $docs[$key][self::SCORE] *= $weight;
                 }
             }
diff --git a/src/library/index_bundle_iterators/NegationIterator.php b/src/library/index_bundle_iterators/NegationIterator.php
index 0496e5deb..aef26eb6f 100644
--- a/src/library/index_bundle_iterators/NegationIterator.php
+++ b/src/library/index_bundle_iterators/NegationIterator.php
@@ -113,16 +113,15 @@ class NegationIterator extends IndexBundleIterator
         if ($status == -1) {
             return -1;
         }
-        //next we finish computing BM25F
+        //next we finish computing a score
         $docs = $this->index_bundle_iterators[0]->currentDocsWithWord();
         if (is_array($docs) && count($docs) == 1) {
             //we get intersect docs one at a time so should be only one
             $keys = array_keys($docs);
             $key = $keys[0];
             $docs[$key][self::RELEVANCE] = 1;
-            $docs[$key][self::PROXIMITY] = 1;
-            $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] *
-                 $docs[$key][self::RELEVANCE] * $docs[$key][self::PROXIMITY];
+            $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] +
+                 $docs[$key][self::RELEVANCE];
         }
         $this->count_block = count($docs);
         $this->pages = $docs;
diff --git a/src/library/index_bundle_iterators/NetworkIterator.php b/src/library/index_bundle_iterators/NetworkIterator.php
index 640cd1fa4..fd9382e06 100644
--- a/src/library/index_bundle_iterators/NetworkIterator.php
+++ b/src/library/index_bundle_iterators/NetworkIterator.php
@@ -147,7 +147,6 @@ class NetworkIterator extends IndexBundleIterator
             "host_url_bonus" => C\HOST_URL_BONUS,
             "host_keyword_bonus" => C\HOST_KEYWORD_BONUS,
             "path_keyword_bonus" => C\PATH_KEYWORD_BONUS,
-            "proximity_bonus" => C\PROXIMITY_BONUS,
             "title_bonus" => C\TITLE_BONUS,
             "user_rank_bonus" => C\USER_RANK_BONUS,
             "wiki_bonus" => C\WIKI_BONUS,
diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php
index 746aba238..071ac78fd 100644
--- a/src/library/index_bundle_iterators/UnionIterator.php
+++ b/src/library/index_bundle_iterators/UnionIterator.php
@@ -236,7 +236,8 @@ class UnionIterator extends IndexBundleIterator
     public function compareByMaxScore(&$query_terms, $least_score)
     {
         foreach ($query_terms as $query_term => $term_info) {
-            if ($term_info['MAX_SCORE'] <= $least_score) {
+            if ($term_info['MAX_SCORE'] > 0 &&
+                $term_info['MAX_SCORE'] <= $least_score) {
                 $iterator_index = $term_info['ITERATOR'];
                 $iterator = $this->index_bundle_iterators[$iterator_index];
                 if ($iterator instanceof IntersectIterator) {
@@ -317,7 +318,7 @@ class UnionIterator extends IndexBundleIterator
             return;
         }
         while ($index > 0) {
-            $parent_index = floor(($index-1) / 2);
+            $parent_index = floor(($index - 1) / 2);
             if ($heap[$parent_index] >= $heap[$index]) {
                 break;
             }
@@ -336,7 +337,10 @@ class UnionIterator extends IndexBundleIterator
      */
     public function getQueryTerms()
     {
-        $query_terms = [];
+        static $query_terms = [];
+        if (!empty($query_terms)) {
+            return $query_terms;
+        }
         for ($i = 0; $i < $this->num_iterators; $i++) {
             $iterator =  $this->index_bundle_iterators[$i];
             if ($iterator instanceof IntersectIterator) {
@@ -356,7 +360,7 @@ class UnionIterator extends IndexBundleIterator
                         }
                     }
                     if (!$check_meta) {
-                        $max_score = $this->getMaxScoreForTerm($word_key);
+                        $max_score = $word_iterator->getMaxScore();
                         $query_terms[$word_key] = ['ITERATOR' => $i,
                             'MAX_SCORE' => $max_score];
                     }
@@ -365,21 +369,6 @@ class UnionIterator extends IndexBundleIterator
         }
         return $query_terms;
     }
-
-    /**
-     * This method calculates the maxScore value for the term supplied.
-     *
-     * @param string $term to find score of
-     * @return float maxScore
-     */
-    public function getMaxScoreForTerm($term)
-    {
-        $score = IndexManager::discountedNumDocsTerm($term,
-            $this->index_name, false);
-        $max_score = $score == 0 ? 0.0 : 2.2 *
-            log($this->total_num_docs / $score);
-        return $max_score;
-    }
     /**
      * Forwards the iterator one group of docs
      * @param array $gen_doc_offset a generation, doc_offset pair. If set,
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 9f75215fe..a3051bf20 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -209,7 +209,7 @@ class WordIterator extends IndexBundleIterator
      * Whether the latest version of each document should be searched for
      * @var boolean
      */
-    public $latest_version;
+    public $retrieve_latest;
     /**
      * Creates a word iterator with the given parameters.
      *
@@ -228,14 +228,15 @@ class WordIterator extends IndexBundleIterator
      *      reading the results in different directions
      * @param array $ranking_factors field say how url, keywords, and
      *      title words should influence relevance and doc rank calculations
-     * @param boolean $latest_version whether the latest version of a url
-     *      is being searched for or not
+     * @param boolean $retrieve_latest whether the latest indexed instance of a
+     *      document should be returned or not (might have multiple instances
+     *      if crawl indexes document more than once)
      *      (@see PhraseModel::lookupSummaryOffsetGeneration())
      */
     public function __construct($word_key, $index_name, $raw = false,
         $filter = null, $results_per_block =
-        IndexBundleIterator::RESULTS_PER_BLOCK, $direction=self::ASCENDING,
-        $ranking_factors = [], $latest_version = false)
+        IndexBundleIterator::RESULTS_PER_BLOCK, $direction = self::ASCENDING,
+        $ranking_factors = [], $retrieve_latest = true)
     {
         if ($raw == false) {
             //get rid of our modified base64 encoding
@@ -251,7 +252,7 @@ class WordIterator extends IndexBundleIterator
         $this->current_doc_offset = null;
         $this->results_per_block = $results_per_block;
         $this->current_block_fresh = false;
-        $this->latest_version = $latest_version;
+        $this->retrieve_latest = $retrieve_latest;
         $this->start_generation = ($direction == self::ASCENDING) ? 0 :
             "ACTIVE";
         foreach (["CLD_URL_BONUS" => C\CLD_URL_BONUS,
@@ -552,7 +553,7 @@ class WordIterator extends IndexBundleIterator
             $entry = $doc_map_tools->findEntryAtIndexTableName(
                 $doc_map_filename, $doc_map_index);
             $docid_len = IndexDocumentBundle::DOCID_LEN;
-            $wordsfilter_len = IndexDocumentBundle::WORDSFILTER_LEN;
+            $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN;
             if (strlen($entry) < $docid_len) {
                 continue;
             }
@@ -563,11 +564,11 @@ class WordIterator extends IndexBundleIterator
              * (the beginning character of the term bloom
              * filter string attached to doc_map entries).
              */
-            $values = (strlen($entry) >= ($docid_len + $wordsfilter_len + 1) &&
+            $values = (strlen($entry) >= ($docid_len + $termsfilter_len + 1) &&
                 $entry[$docid_len] == 't') ?
-                substr($entry, $docid_len + $wordsfilter_len + 1) :
+                substr($entry, $docid_len + $termsfilter_len + 1) :
                 substr($entry, $docid_len);
-            if ($this->latest_version && $entry[$docid_len] == 't') {
+            if ($this->retrieve_latest && $entry[$docid_len] == 't') {
                 $url_hash = substr($doc_key, 0, 8);
                 $model_for_url_hash_lookup = new ParallelModel();
                 $page_versions = $model_for_url_hash_lookup->
@@ -590,22 +591,23 @@ class WordIterator extends IndexBundleIterator
                             $latest_posting['DOC_MAP_INDEX'] !=
                             $doc_map_index) {
                             $latest_base_folder = $index->
-                            getPartitionBaseFolder($latest_partition);
+                                getPartitionBaseFolder($latest_partition);
                             $latest_doc_map_filename = $latest_base_folder .
                                 "/" . IndexDocumentBundle::DOC_MAP_FILENAME;
                             $latest_doc_map_index =
                                 $latest_posting['DOC_MAP_INDEX'];
-                            $latest_doc_map_entry = $doc_map_tools->
-                            findEntryAtIndexTableName($latest_doc_map_filename,
-                                $latest_doc_map_index);
+                            $latest_doc_map_entry =
+                                $doc_map_tools->findEntryAtIndexTableName(
+                                $latest_doc_map_filename,$latest_doc_map_index);
                             if (strlen($latest_doc_map_entry) < $docid_len) {
                                 continue;
                             }
                             $latest_doc_key = substr($latest_doc_map_entry, 0,
                                 $docid_len);
-                            $words_filter = substr($latest_doc_map_entry,
-                                $docid_len + 1, $wordsfilter_len);
-                            if (!$this->checkTermExists($words_filter)) {
+                            $terms_filter = substr($latest_doc_map_entry,
+                                $docid_len + 1, $termsfilter_len);
+                            if (!$this->checkTermExists($this->word_key,
+                                $terms_filter)) {
                                 continue;
                             } else {
                                 /**
@@ -613,13 +615,12 @@ class WordIterator extends IndexBundleIterator
                                  * version of the document; replace the current
                                  * posting entries with the latest entry.
                                  */
-                                $posting[self::GENERATION] =
-                                    $latest_partition;
+                                $posting[self::GENERATION] = $latest_partition;
                                 $posting['DOC_MAP_INDEX'] =
                                     $latest_doc_map_index;
                                 $doc_key = $latest_doc_key;
                                 $values = substr($latest_doc_map_entry,
-                                    $docid_len + $wordsfilter_len + 1);
+                                    $docid_len + $termsfilter_len + 1);
                                 $latest_term_postings = $this->
                                     getGenerationPostings($latest_partition);
                                 $target_posting =
@@ -664,25 +665,40 @@ class WordIterator extends IndexBundleIterator
                 array_values(array_shift($doc_info));
             $is_timestamp_score = ($original_score <= $time &&
                 $original_score > ($time >> 1));
+            /*
+               DOC_RANK is a document quality measure computed either
+               from the time the item was added (freshness) or from a
+               sum of signals: how early or late it was added to the index,
+               whether the url was a CLD or HOST, whether the page was a
+               wiki page, and the number of slashes in the url path
+             */
             if ($is_timestamp_score) {
-                $posting[self::SCORE] = 0.5 * log($time/
-                    (max(1, $time - $original_score)), 2);
-                $posting[self::DOC_RANK] = $posting[self::SCORE];
+                $posting[self::DOC_RANK] = $time /
+                    (max(1, $time - $original_score)) *
+                    $this->getMaxDocQualityScore();
             } else {
-                $posting[self::SCORE] =  ($is_ascending) ?
+                $cld_bonus = $this->ranking_factors["CLD_URL_BONUS"];
+                $host_bonus = $this->ranking_factors["HOST_URL_BONUS"];
+                $wiki_bonus = $this->ranking_factors["WIKI_BONUS"];
+                $num_slashes_bonus =
+                    $this->ranking_factors["NUM_SLASHES_BONUS"];
+                $max_pre_rank_and_bonuses = $cld_bonus + $host_bonus +
+                    $wiki_bonus + $wiki_bonus + 1;
+                $last_partition_pos =  ($is_ascending) ?
                     $num_doc_keys - $doc_map_index :
                     $doc_map_index;
                 $remaining_partitions =  ($is_ascending) ?
                     $number_of_partitions - $num_seen_partitions :
                     $num_seen_partitions - 1;
-                $posting[self::DOC_RANK] = log(
-                    $remaining_partitions * $this->avg_items_per_partition +
-                    $posting[self::SCORE], 10);
-                if (L\IndexDocumentBundle::isAHostDocId($doc_key)) {
-                    $posting[self::DOC_RANK] +=
-                        (L\IndexDocumentBundle::isACldDocId($doc_key)) ?
-                        $this->ranking_factors["CLD_URL_BONUS"] :
-                        $this->ranking_factors["HOST_URL_BONUS"];
+                $pre_rank_and_bonuses = ($remaining_partitions *
+                    $this->avg_items_per_partition)/
+                    (($number_of_partitions + 1) *
+                    ($this->avg_items_per_partition + 1)) +
+                    $last_partition_pos / $this->max_items_per_partition;
+                if (IndexDocumentBundle::isAHostDocId($doc_key)) {
+                    $pre_rank_and_bonuses +=
+                        (IndexDocumentBundle::isACldDocId($doc_key)) ?
+                        $cld_bonus : $host_bonus;
                 }
                 /**
                  * For backward compatibility: new bonuses should only be added
@@ -697,16 +713,17 @@ class WordIterator extends IndexBundleIterator
                  * This difference can be used to check whether $doc_key follows
                  * the old or new letter_code format.
                  */
-                $doc_id_format = ord($doc_key[8 << 1] ?? 0) & 96;
+                $doc_id_format = ord($doc_key[
+                    IndexDocumentBundle::DOCID_PART_LEN << 1] ?? 0) & 96;
                 if ($doc_id_format != 96) {
-                    if (L\IndexDocumentBundle::isAWikipediaPage($doc_key)) {
-                        $posting[self::DOC_RANK] +=
-                            $this->ranking_factors["WIKI_BONUS"];
+                    if (IndexDocumentBundle::isAWikipediaPage($doc_key)) {
+                        $pre_rank_and_bonuses += $wiki_bonus;
                     }
-                    $posting[self::DOC_RANK] +=
-                        $this->ranking_factors["NUM_SLASHES_BONUS"] /
-                        (L\IndexDocumentBundle::findNumSlashes($doc_key) + 1);
+                    $pre_rank_and_bonuses  += $num_slashes_bonus /
+                        (IndexDocumentBundle::findNumSlashes($doc_key) + 1);
                 }
+                $posting[self::DOC_RANK] = $this->getMaxDocQualityScore() *
+                    $pre_rank_and_bonuses / $max_pre_rank_and_bonuses;
             }
             list($preface_positions, $num_description_scores) =
                 array_values(array_shift($doc_info));
@@ -719,8 +736,8 @@ class WordIterator extends IndexBundleIterator
             $posting[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0,
                 $num_description_scores);
             if ($posting['FREQUENCY'] > 0) {
-                list($frequency, $preface_score) =
-                    $this->frequencyNormalizationPrefaceScoring(
+                $frequency = $this->frequencyNormalizationScoring(
+                    $occurrences_per_doc,
                     $posting[self::POSITION_LIST],
                     $posting[self::DOC_LEN],
                     $posting["HOST_KEYWORDS_END_POS"],
@@ -731,7 +748,7 @@ class WordIterator extends IndexBundleIterator
                 $posting[self::RELEVANCE] =
                     ((log(1 + $occurrences_per_doc, 2) + $frequency *
                     log(1 + 1/max(1, $occurrences_per_doc), 2)) /
-                    ($frequency + 1)) + $preface_score;
+                    ($frequency + 1));
             } else {
                  /*
                    this will typically be the relevance score for a meta word
@@ -753,28 +770,38 @@ class WordIterator extends IndexBundleIterator
         return $key_postings;
     }
     /**
-     * Check if the current term id exists in the term bloom filter
-     * associated with the doc_map entry.
-     *
-     * @param string $words_filter term bloom filter
-     * @return boolean exists or not
+     * This method calculates the maxScore value for the relevance calculation
+     * of the term to the query
+     * @return float maximum score for document relevance to a query
      */
-    public function checkTermExists($words_filter)
+    public function getMaxRelevanceScore()
     {
-        $hash_functions = IndexDocumentBundle::WORDSFILTER_HASHFN_COUNT;
-        $size = IndexDocumentBundle::WORDSFILTER_BITS_LEN;
-        for ($i = 0; $i < $hash_functions; $i++) {
-            $hash = crc32($this->word_key . $i) % $size;
-            $byte = (int)($hash / 8);
-            $bit = $hash % 8;
-            $ascii_char = ord($words_filter[$byte]);
-            if (($ascii_char & (1 << $bit)) == 0) {
-                return false;
-            }
-        }
-        return true;
+        $occurrences_per_doc = $this->num_occurrences /
+            max($this->total_num_docs, 1);
+        $max_score = 1 + log(1 + 1/max(1, $occurrences_per_doc), 2);
+        return $max_score;
+    }
+    /**
+     * This method calculates the maxScore value for the Doc Quality calculation
+     * for a document and a query
+     * @return float maximum score for document quality
+     */
+    public function getMaxDocQualityScore()
+    {
+        $max_score = 5;
+        return $max_score;
+    }
+    /**
+     * This method calculates the maxScore value for the overall score: the
+     * sum of the maximum doc quality and maximum relevance scores
+     * @return float maxScore
+     */
+    public function getMaxScore()
+    {
+        $max_score = $this->getMaxDocQualityScore() +
+                $this->getMaxRelevanceScore();
+        return $max_score;
     }
-
     /**
      * Normalizes the frequencies of a term within a document with respect to
      * the length of the document, the positions of the term with the document
@@ -782,6 +809,8 @@ class WordIterator extends IndexBundleIterator
      * Also computes the score of the posting for the host keywords,
      * title keywords, and path keywords.
      *
+     * @param float $occurrences_per_doc expected number of occurrences of
+     *  the term per document
      * @param array $positions positions of this iterators term in the document
      * @param int $num_words number of terms in the document
      * @param int $host_keywords_end_pos term offset into the document summary
@@ -795,8 +824,8 @@ class WordIterator extends IndexBundleIterator
      * @return float normalized frequency of the term, weighted by the
      *     description, host name, title, and path keyword scores
      */
-    public function frequencyNormalizationPrefaceScoring(
-        $positions, $num_words, $host_keywords_end_pos,
+    public function frequencyNormalizationScoring(
+        $occurrences_per_doc, $positions, $num_words, $host_keywords_end_pos,
         $title_end_pos, $path_keywords_end_pos, $descriptions_scores)
     {
         $num_words = max($num_words, 1);
@@ -807,33 +836,40 @@ class WordIterator extends IndexBundleIterator
          * current document. C\MAX_DESCRIPTION_LEN is the max number
          * of characters in a document. Assuming the average word is
          * around 5 chars + whitespace char + punctuation, and most documents
-         * are summuarized, to close to the max character length, we
+         * are summarized to close to the max character length, we
          * approximate l_avg as C\MAX_DESCRIPTION_LEN/7 in the below.
          */
+        $pseudo_doc_length = 7 * $num_words;
         $length_normalization = log(1 + C\MAX_DESCRIPTION_LEN/(7 * $num_words),
             2);
-        $first_index = 0;
-        $old_pos = 0;
         if (empty($descriptions_scores)) {
-            return count($positions);
+            return count($positions) * $length_normalization;
         }
+        $host_bonus = $this->ranking_factors["HOST_KEYWORD_BONUS"];
+        $path_bonus = $this->ranking_factors["PATH_KEYWORD_BONUS"];
+        $title_bonus = $this->ranking_factors["TITLE_BONUS"];
+        $len_term = strlen($this->word_key);
+        $max_doc_norm_score = $host_bonus + $path_bonus + $title_bonus + 1;
+        $first_index = 0;
+        $old_pos = 0;
+        /*
+           The description scores sum to 1 before the bonus scores are
+           prepended below, so with the bonuses the total is
+           $max_doc_norm_score. The foreach loop that follows measures
+           what fraction of this comes from $this->word_key occurrences,
+           so the result will be a number less than $max_doc_norm_score.
+         */
+        $descriptions_scores = array_merge(
+            [['POS' => - $path_keywords_end_pos - 1,
+             'SCORE' => $host_bonus],
+             ['POS' => $host_keywords_end_pos - $path_keywords_end_pos - 1,
+             'SCORE' => $title_bonus],
+             ['POS' => $title_end_pos - $path_keywords_end_pos - 1,
+              'SCORE' => $path_bonus],
+           ], $descriptions_scores);
         $num_scores = count($descriptions_scores);
         $weighted_frequency = 0;
-        $preface_score = 0;
         foreach ($positions as $position) {
-            if ($position < $host_keywords_end_pos) {
-                $preface_score += $this->ranking_factors["HOST_KEYWORD_BONUS"] /
-                    max($host_keywords_end_pos - 1, 1);
-                continue;
-            } else if ($position < $title_end_pos) {
-                $preface_score += $this->ranking_factors["TITLE_BONUS"] /
-                    max($title_end_pos - $host_keywords_end_pos, 1);
-                continue;
-            } else if ($position < $path_keywords_end_pos) {
-                $preface_score += $this->ranking_factors["PATH_KEYWORD_BONUS"] /
-                    max($path_keywords_end_pos - $title_end_pos, 1);
-                continue;
-            }
             $last_index = $num_scores - 1;
             /* description score offsets are with respect to the description
                only so we subtract from the term position the offset of the
@@ -849,10 +885,16 @@ class WordIterator extends IndexBundleIterator
                 }
             }
             $weight = $descriptions_scores[$first_index]['SCORE'];
-            $weighted_frequency += $weight;
+            $start_description_pos = $descriptions_scores[$first_index]['POS'];
+            $len_description = ($first_index == $num_scores - 1) ?
+                $pseudo_doc_length - $start_description_pos :
+                $descriptions_scores[$first_index + 1]['POS'] -
+                $start_description_pos;
+            $weighted_frequency += $weight * $len_term / $len_description;
         }
-        $frequency = $weighted_frequency * $length_normalization;
-        return [$frequency, $preface_score];
+        $frequency = ($weighted_frequency/$max_doc_norm_score) * $num_words *
+            $length_normalization;
+        return $frequency;
     }
     /**
      * Updates the seen_docs count during an advance() call
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 73b1b19fa..d55566c25 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -227,7 +227,6 @@ class PhraseModel extends ParallelModel
         $results = null;
         $answer_score_map = [];
         $word_structs = [];
-        $use_conjunctive = C\USE_CONJUNCTIVE_QUERY;
         /*
             this is a quick and dirty parsing and will usually work,
             exceptions would be # or | in quotes or if someone tried
@@ -343,13 +342,11 @@ class PhraseModel extends ParallelModel
             if ($cache_results) {
                 list($word_structs, $format_words) = $cache_results;
             } else {
-                if ($use_conjunctive) {
-                    $disjunct_phrases = explode("|", $phrase);
-                } else {
-                    $disjunct_phrases = $this->parseWordStructDisjunctiveQuery(
+                if (!C\USE_CONJUNCTIVE_QUERY) {
+                    $phrase = $this->rewriteAsDisjunctiveQuery(
                         $phrase, $guess_semantics);
-                    $phrase = implode(" | ", $disjunct_phrases);
                 }
+                $disjunct_phrases = explode("|", $phrase);
                 $can_use_query_map = $can_use_query_map &&
                     (count($disjunct_phrases) == 1) && !empty($filter) &&
                     $low == 0;
@@ -368,13 +365,12 @@ class PhraseModel extends ParallelModel
                         $map_result[self::DOC_RANK] = floatval(
                             $map_parts[1] ?? 0);
                         $map_result[self::RELEVANCE] = 0;
-                        $map_result[self::PROXIMITY] = 0;
                         $query_map_results[] = $map_result;
                         $query_map_urls[$map_parts[0]]  = $map_cnt;
                         $map_cnt++;
                     }
                 }
-                if ($use_conjunctive && $guess_semantics) {
+                if ($guess_semantics) {
                     $repeat_check = [];
                     $phrase = "";
                     $delim = " ";
@@ -403,7 +399,7 @@ class PhraseModel extends ParallelModel
                         }
                         list($word_struct, $found_format_words) =
                             $this->parseWordStructConjunctiveQuery(
-                                $disjunct, $use_conjunctive);
+                                $disjunct, C\USE_CONJUNCTIVE_QUERY);
                         $format_words = array_merge($format_words,
                             $found_format_words);
                         if ($word_struct != null) {
@@ -530,16 +526,14 @@ class PhraseModel extends ParallelModel
                     $qm_index = $query_map_urls[$page[self::URL]];
                     $qm_result = $query_map_results[$qm_index];
                     foreach ($page as $field => $value) {
-                        if (in_array($field, [self::DOC_RANK, self::RELEVANCE,
-                            self::PROXIMITY])) {
+                        if (in_array($field, [self::DOC_RANK, self::RELEVANCE])) {
                             $qm_result[$field] += $page[$field];
                         } else {
                             $qm_result[$field] = $page[$field];
                         }
                     }
                     $qm_result[self::SCORE] = $qm_result[self::DOC_RANK] +
-                        $qm_result[self::RELEVANCE] +
-                        $qm_result[self::PROXIMITY];
+                        $qm_result[self::RELEVANCE];
                     $query_map_results[$qm_index] = $qm_result;
                     unset($results['PAGES'][$p_index]);
                 }
@@ -580,15 +574,13 @@ class PhraseModel extends ParallelModel
      * Separates the cumulative search string into a series of
      * disjunctive phrases to be looked up in the current index.
      *
-     * @param string $search_phrase entered by user
-     * @param boolean $guess_semantics whether semantics should be
-     * guessed from the query string or not
-     * @return array of disjunct search phrases
+     * @param string $phrase entered by user
+     * @return string search phrase rewritten as disjunctive with | separators
      */
-    public function parseWordStructDisjunctiveQuery(&$search_phrase,
-        $guess_semantics)
+    public function rewriteAsDisjunctiveQuery($phrase)
     {
-        $phrase = preg_replace('/\s+/', ' ', $search_phrase);
+        $phrase = preg_replace('/\s+/', ' ', $phrase);
+        $locale_tag = L\guessLocaleFromString($phrase);
         $search_terms = [];
         /*
         Extracts all terms specified within quotes into a single
@@ -604,10 +596,12 @@ class PhraseModel extends ParallelModel
         $num_phrase_parts = count($split_phrase);
         $split_terms = [];
         $s = '';
-        // Extracts all terms separated by '&' into a single conjunctive query
+        /* Extracts all terms separated by '_and' into a single conjunctive
+           query
+         */
         for ($i = 0; $i < $num_phrase_parts; $i++) {
             while ($i < $num_phrase_parts - 1 &&
-                $split_phrase[$i + 1] == '&amp;') {
+                $split_phrase[$i + 1] == '_and') {
                 $s .= $split_phrase[$i] . " ";
                 $i += 2;
             }
@@ -628,7 +622,8 @@ class PhraseModel extends ParallelModel
         }
         foreach ($split_terms as $term) {
             if (!in_array($term, $special_words)) {
-                $search_terms[] = $term;
+                $search_terms = array_merge($search_terms,
+                    PhraseParser::segmentSegment($term, $locale_tag));
             }
         }
         $phrase = '';
@@ -638,8 +633,8 @@ class PhraseModel extends ParallelModel
                 $phrase .= $search_term . ' ';
             }
         }
-        $locale_tag = L\guessLocaleFromString($phrase);
-        $new_terms = PhraseParser::extractPhrases($phrase, $locale_tag);
+        PhraseParser::hyphenateEntities($phrase, $locale_tag);
+        $new_terms = explode(" ", $phrase);
         foreach ($new_terms as $new_term) {
             $new_term = trim($new_term);
             if (!in_array($new_term, $search_terms) && strlen($new_term) > 0) {
@@ -652,27 +647,10 @@ class PhraseModel extends ParallelModel
                 $search_terms[$i] .= " " . $add_word;
             }
         }
-        $repeat_check = [];
-        for ($i = 0; $i < $num_search_terms; $i++) {
-            $term = $search_terms[$i];
-            $check = trim($term);
-            if (isset($repeat_check[$check])) {
-                continue;
-            }
-            $repeat_check[$check] = true;
-            if ($guess_semantics) {
-                $query_part = $this->guessSemantics($term);
-                $search_terms[$i] = $query_part;
-            }
-        }
         if (count($search_terms) == 0 && count($special_words) > 0) {
             $search_terms = [implode(' ', $special_words)];
         }
-        /*
-        The resultant array holds multiple strings, each signifying
-        a disjunctive query
-        */
-        return $search_terms;
+        return implode(" | ", $search_terms);
     }
     /**
      * Parses from a string phrase representing a conjunctive query, a struct
@@ -683,17 +661,15 @@ class PhraseModel extends ParallelModel
      * @param string &$phrase string to extract struct from, if the phrase
      *  semantics is guessed or an if condition is processed the value of
      *  phrase will be altered. (Helps for feeding to network queries)
-     * @param boolean $use_conjunctive whether the search query is using
-     * conjunctive or disjunctive query logic
      * @return array struct representing the conjunctive query
      */
-    public function parseWordStructConjunctiveQuery(&$phrase, $use_conjunctive)
+    public function parseWordStructConjunctiveQuery(&$phrase)
     {
         $query = $phrase;
         $indent= "&nbsp;&nbsp;";
         $in2 = $indent . $indent;
         $in3 = $in2 . $indent;
-        $in4 = $in2. $in2;
+        $in4 = $in2 . $in2;
         $phrase = " " . $phrase;
         $phrase = $this->parseIfConditions($phrase);
         $phrase_string = $phrase;
@@ -745,8 +721,8 @@ class PhraseModel extends ParallelModel
                         $term_positions_within_quoted_query["*$num_words"] =
                             "*";
                     }
-                    $new_words = PhraseParser::extractPhrases(
-                        $sub_part, $locale_tag, $index_name, true);
+                    $new_words = array_filter(PhraseParser::extractPhrases(
+                        $sub_part, $locale_tag, $index_name, true));
                     $base_words = array_merge($base_words, $new_words);
                     foreach ($new_words as $new_word) {
                         $len = substr_count($new_word, " ") + 1;
@@ -757,14 +733,11 @@ class PhraseModel extends ParallelModel
                 }
                 $quote_positions[] = $term_positions_within_quoted_query;
             } else {
-                if ($use_conjunctive) {
-                    $new_words =
+                $new_words =
+                    array_filter(
                         PhraseParser::extractPhrases($phrase_part, $locale_tag,
-                            $index_name);
-                    $base_words = array_merge($base_words, $new_words);
-                } else {
-                    $base_words[] = $phrase_part;
-                }
+                        $index_name));
+                $base_words = array_merge($base_words, $new_words);
             }
             $num_words = count($base_words);
             $quote_state = ($quote_state) ? false : true;
@@ -1306,14 +1279,8 @@ class PhraseModel extends ParallelModel
         if (C\QUERY_STATISTICS) {
             $lookup_time = microtime(true);
         }
-        $use_proximity = false;
         $time = time();
         $test_query = trim(preg_replace("/\s+/u", " ", $original_query));
-        $approx_query_terms = substr_count($test_query, " ") -
-            substr_count($test_query, ":") + 1;
-        if ($approx_query_terms > 1) {
-            $use_proximity = true;
-        }
         if (empty($filter)) {
             $filter_time = 0;
         } else {
@@ -1439,15 +1406,8 @@ class PhraseModel extends ParallelModel
         // initialize scores
         $sort_start = microtime(true);
         $max_user_ranks = 0;
-        $ranking_factors["PROXIMITY_BONUS"] ??= C\PROXIMITY_BONUS;
         $ranking_factors["USER_RANK_BONUS"] ??= C\USER_RANK_BONUS;
         for ($i = 0; $i < $result_count; $i++) {
-            $pages[$i][self::PROXIMITY] ??= 0;
-            $pages[$i][self::PROXIMITY] *=
-                $ranking_factors["PROXIMITY_BONUS"];
-            if ($use_proximity) {
-                $pages[$i][self::SCORE] += $pages[$i][self::PROXIMITY];
-            }
             if (isset($pages[$i][self::USER_RANKS])) {
                 $j = count($pages[$i][self::USER_RANKS]);
                 if ($max_user_ranks < $j) {
@@ -1497,8 +1457,6 @@ class PhraseModel extends ParallelModel
                     ($p[self::DOC_RANK] ?? 0), 4, '.', ''));
                 $pages[$i][self::RELEVANCE] = floatval(number_format(
                     ($p[self::RELEVANCE] ?? 0), 4, '.', ''));
-                $pages[$i][self::PROXIMITY] = floatval(number_format(
-                    ($p[self::PROXIMITY] ?? 0), 4, '.', ''));
                 unset($pages[$i][self::DESCRIPTION_SCORES]);
                 $i++;
             }
@@ -1962,7 +1920,7 @@ class PhraseModel extends ParallelModel
                         $word_iterators[$i] =
                             new I\WordIterator($distinct_key_id,
                             $actual_index_name, true, $filter, $to_retrieve,
-                            $direction, $ranking_factors, true);
+                            $direction, $ranking_factors);
                         $min_group_override = true;
                     }
                     foreach ($word_keys as $index => $key) {
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 909bc9700..d8b56c4e8 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -64,7 +64,7 @@ class ProfileModel extends Model
         'MEDIA_MODE', 'NAME_SERVER', 'NUM_SLASHES_BONUS', 'PATH_KEYWORD_BONUS',
         'PRIVATE_DB_NAME', 'PRIVATE_DB_HOST', 'PRIVATE_DBMS',
         'PRIVATE_DB_PASSWORD', 'PRIVATE_DB_USER', 'PROXY_SERVERS',
-        'PROXIMITY_BONUS', 'RECOVERY_MODE', 'REGISTRATION_TYPE', 'RESULT_SCORE',
+        'RECOVERY_MODE', 'REGISTRATION_TYPE', 'RESULT_SCORE',
         'ROBOT_INSTANCE','RSS_ACCESS', 'SEARCH_ANALYTICS_MODE',
         'SEARCHBAR_PATH', 'SEND_MAIL_MEDIA_UPDATER', 'SERP_FAVICONS',
         'SESSION_NAME', 'SIDE_ADSCRIPT', 'SIDEBAR_COLOR', 'SIGNIN_LINK',
diff --git a/src/views/elements/PageoptionsElement.php b/src/views/elements/PageoptionsElement.php
index 65d19c34f..649828468 100644
--- a/src/views/elements/PageoptionsElement.php
+++ b/src/views/elements/PageoptionsElement.php
@@ -542,11 +542,6 @@ class PageOptionsElement extends Element
             <input type="text" id="path-keyword-bonus" class="very-narrow-field"
                 maxlength="<?= C\NUM_FIELD_LEN ?>" name="PATH_KEYWORD_BONUS"
                 value="<?= $data['PATH_KEYWORD_BONUS']  ?>" ></td></tr>
-        <tr><th><label for="proximity-bonus"><?=
-            tl('pageoptions_element_proximity_bonus')?></label></th><td>
-            <input type="text" id="proximity-bonus" class="very-narrow-field"
-                maxlength="<?= C\NUM_FIELD_LEN ?>" name="PROXIMITY_BONUS"
-                value="<?= $data['PROXIMITY_BONUS'] ?>" ></td></tr>
         <tr><th><label for="cld-url-bonus"><?=
             tl('pageoptions_element_cld_url_bonus')?></label></th><td>
             <input type="text" id="cld-url-bonus" class="very-narrow-field"
diff --git a/src/views/elements/SearchElement.php b/src/views/elements/SearchElement.php
index 6840917da..4d8879907 100644
--- a/src/views/elements/SearchElement.php
+++ b/src/views/elements/SearchElement.php
@@ -360,11 +360,6 @@ class SearchElement extends Element implements CrawlConstants
                             number_format($page[self::DOC_RANK], 2)) . "\n");
                         e(tl('search_element_relevancy',
                             number_format($page[self::RELEVANCE], 2) ) . "\n");
-                        if ($page[self::PROXIMITY] > 0) {
-                            e(tl('search_element_proximity',
-                                number_format($page[self::PROXIMITY], 2)
-                                ) . "\n");
-                        }
                         if (isset($page[self::USER_RANKS])) {
                             foreach ($page[self::USER_RANKS] as
                                 $label => $score) {
diff --git a/tests/BPlusTreeTest.php b/tests/BPlusTreeTest.php
index f121a7a61..c7992282a 100644
--- a/tests/BPlusTreeTest.php
+++ b/tests/BPlusTreeTest.php
@@ -129,7 +129,7 @@ use seekquarry\yioop\library\UnitTest;
             $bptree = $this->createTree($i);
             for ($j = 0; $j < ($i * 40); $j++) {
                 for($k = 0; $k < 5; $k++) {
-                    $bptree->put(["KEY" => str_pad("$j",4,"0", STR_PAD_LEFT),
+                    $bptree->put(["KEY" => str_pad("$j", 4, "0", STR_PAD_LEFT),
                         "VALUE" => "row{$j}_{$k}"],
                         PackedTableTools::APPEND_MODE);
                 }
diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php
index 51eb979ec..2406218a3 100644
--- a/tests/IndexDocumentBundleTest.php
+++ b/tests/IndexDocumentBundleTest.php
@@ -53,6 +53,10 @@ use seekquarry\yioop\library\UnitTest;
      * Prefix of folders for index document test
      */
     const TEST_DIR = __DIR__ . '/test_files/index_document_test';
+    /**
+     * Doc type byte docidFromInt() writes into test docids by default
+     */
+    const TEST_DOC_TYPE = "\x10"; //link type;
     /**
      * Holds the IndexDocumentBundle used for test purposes
      * @var IndexDocumentBundle
@@ -230,6 +234,8 @@ use seekquarry\yioop\library\UnitTest;
         $i = 0;
         foreach ($keys as $key) {
             $row = $doc_map_tools->find($doc_map, $key);
+            //get row after bloom filter of terms
+            $row = substr($row, IndexDocumentBundle::TERMSFILTER_LEN + 1);
             $entry = $doc_map_tools->unpack($row);
             $preface_length = str_word_count(UrlParser::getWordsInHostUrl(
                     $docs[$i][CC::SUMMARY][CC::URL]) . " " .
@@ -345,11 +351,14 @@ use seekquarry\yioop\library\UnitTest;
      * Computes a 24 byte docId by padding an int to the left with 0's
      *
      * @param int $i integer to make docId from
+     * @param string $type default type of docid
      * @return string docid made by padding
      */
-    protected function docidFromInt($i)
+    protected function docidFromInt($i, $type = self::TEST_DOC_TYPE)
     {
-            return str_pad("$i", 24, "0", STR_PAD_LEFT);
+            $pre_key = str_pad("$i", 24, "0", STR_PAD_LEFT);
+            $pre_key[IndexDocumentBundle::DOCID_PART_LEN << 1] = $type;
+            return $pre_key;
     }
     /**
      * docids are typically made from three 8byte strings. This function
diff --git a/tests/WordIteratorTest.php b/tests/WordIteratorTest.php
index 890817d16..efd1a4726 100644
--- a/tests/WordIteratorTest.php
+++ b/tests/WordIteratorTest.php
@@ -113,7 +113,7 @@ use seekquarry\yioop\library\index_bundle_iterators\WordIterator;
         $index_archive->updateDictionary();
         $index_archive->forceSave();
         $word_iterator = new WordIterator(L\canonicalTerm("be"),
-            self::TEST_DIR . "/". self::TEST_BUNDLE, true, null, 10);
+            self::TEST_DIR . "/" . self::TEST_BUNDLE, true, null, 10);
         $current_doc_with_word = $word_iterator->currentDocsWithWord();
         $current_doc_with_word = (is_array($current_doc_with_word)) ?
             $current_doc_with_word : [];

ViewGit