Modifying search logic to include disjunctive query processing, use of max heaps and maxScore in UnionIterator

Gargi Sheguri [2023-11-12 18:Nov:th]

Modifying search logic to include disjunctive query processing, use of max heaps and maxScore in UnionIterator

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
src/configs/Config.php
src/library/IndexManager.php
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/UnionIterator.php
src/library/index_bundle_iterators/WordIterator.php
src/models/PhraseModel.php

diff --git a/src/configs/Config.php b/src/configs/Config.php
index 9e8a4c430..557fc8cff 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -352,6 +352,8 @@ nsdefine('ONE_HOUR', 3600);
 nsdefine('ONE_MINUTE', 60);
 /** Number of seconds in a second */
 nsdefine('ONE_SECOND', 1);
+/** Whether to use conjunctive search queries or disjunctive */
+nsconddefine('USE_CONJUNCTIVE_QUERY', false);
 /** setting Profile.php to something else in LocalConfig.php allows one to have
  *  two different yioop instances share the same work_directory but maybe have
  *  different configuration settings. This might be useful if one was
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index ad83faca8..e2a920a92 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -323,17 +323,20 @@ class IndexManager implements CrawlConstants
      * @param string $term what to look up in the indexes dictionary
      *     no  mask is used for this look up
      * @param string $index_name index to look up term or phrase in
+     * @param boolean $discount_terms whether terms should be discounted
+     *      based on their generation or not
      * @return int number of documents
      */
-    public static function discountedNumDocsTerm($term, $index_name)
+    public static function discountedNumDocsTerm($term, $index_name,
+        $discount_terms = true)
     {
         static $num_docs_cache = [];
         if (isset($num_docs_cache[$index_name][$term])) {
             return $num_docs_cache[$index_name][$term];
         }
         $version = self::getVersion($index_name);
-        $term_id = ($version > 2) ? canonicalTerm($term) :
-            crawlHashWord($term, true);
+        $term_id = $discount_terms ? (($version > 2) ? canonicalTerm($term) :
+            crawlHashWord($term, true)) : $term;
         $word_info = self::getWordInfo($index_name, $term_id, -1, 0,
             C\NUM_DISTINCT_GENERATIONS);
         if ($version >= 3 && !empty($word_info)) {
@@ -351,7 +354,7 @@ class IndexManager implements CrawlConstants
                 $generation = $generation_info['PARTITION'];
                 $num_docs = $generation_info['NUM_DOCS'];
             }
-            $discount = max($generation + 1, $i++);
+            $discount = $discount_terms ? max($generation + 1, $i++) : 1;
             $total += $num_docs / $discount;
         }
         if (count($num_docs_cache) > 1000) {
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index cb7d34269..e8beae9d5 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -191,7 +191,7 @@ class IntersectIterator extends IndexBundleIterator
         if ($status == -1) {
             return -1;
         }
-        //next we finish computing BM25F
+        //next we finish computing divergence from randomness
         $docs = $this->index_bundle_iterators[0]->currentDocsWithWord();
         $weight = $this->weight;
         if (is_array($docs) && count($docs) == 1) {
diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php
index 80d51d14d..d556a42ef 100644
--- a/src/library/index_bundle_iterators/UnionIterator.php
+++ b/src/library/index_bundle_iterators/UnionIterator.php
@@ -30,6 +30,8 @@
  */
 namespace seekquarry\yioop\library\index_bundle_iterators;

+use seekquarry\yioop\library\IndexManager;
+use seekquarry\yioop\library\PhraseParser;
 /**
  * Used to iterate over the documents which occur in any of a set of
  * WordIterator results
@@ -65,13 +67,26 @@ class UnionIterator extends IndexBundleIterator
      * @var array
      */
     public $key_iterator_table;
+    /**
+     * The timestamp of the index associated with this iterator
+     * @var string
+     */
+    public $index_name;
+    /**
+     * The total count of indexed documents in the current index
+     * @var int
+     */
+    public $total_num_docs;
     /**
      * Creates a union iterator with the given parameters.
      *
      * @param object $index_bundle_iterators to use as a source of documents
      *     to iterate over
+     * @param string $index_name time_stamp of the index to use
+     * @param int $total_num_docs total number of documents in the current index
      */
-    public function __construct($index_bundle_iterators)
+    public function __construct($index_bundle_iterators,
+        $index_name, $total_num_docs)
     {
         $this->index_bundle_iterators = $index_bundle_iterators;
         /*
@@ -84,6 +99,8 @@ class UnionIterator extends IndexBundleIterator
         $this->key_iterator_table = [];
         $this->seen_docs = 0;
         $this->seen_docs_unfiltered = 0;
+        $this->index_name = $index_name;
+        $this->total_num_docs = $total_num_docs;
         for ($i = 0; $i < $this->num_iterators; $i++) {
             $this->num_docs += $this->index_bundle_iterators[$i]->num_docs;
             /*
@@ -140,30 +157,229 @@ class UnionIterator extends IndexBundleIterator
     {
         $pages = [];
         $docs = [];
-        $high_score = [];
-        $high_score = [];
         $found_docs = false;
+        $results_heap = [];
+        $k_least_score = ['LEAST_SCORE' => 0, 'INDEX' => 0];
+        $query_terms = $this->getQueryTerms();
         for ($i = 0; $i < $this->num_iterators; $i++) {
             $docs =  $this->index_bundle_iterators[$i]->currentDocsWithWord();
             if (is_array($docs)) {
-                $doc_keys = array_keys($docs);
-                foreach ($doc_keys as $key) {
-                    $docs[$key]["ITERATOR"] = $i;
-                    $this->key_iterator_table[$key] = $i;
+                /*
+                 Iterate over all the documents fetched and add a doc to the
+                 results' max heap only if the heap is not full / the
+                 relevance score of the doc is greater than the current kth-best
+                 score
+                 */
+                foreach ($docs as $doc_key => $doc) {
+                    $doc["ITERATOR"] = $i;
+                    $this->key_iterator_table[$doc_key] = $i;
+                    $score = $doc[self::RELEVANCE];
+                    $full_heap =
+                        (count($results_heap) == $this->results_per_block);
+                    if ($full_heap && $score <= $k_least_score['LEAST_SCORE']) {
+                        continue;
+                    } else {
+                        $next_page_index = $full_heap ?
+                            $k_least_score['INDEX'] :
+                            count($results_heap);
+                        $results_heap[$next_page_index]['SCORE'] = $score;
+                        $results_heap[$next_page_index]['DOC'] = $doc;
+                        $this->heapifyUp($results_heap, $next_page_index);
+                    }
+                    /*
+                     If the heap is full after inserting the new doc,
+                     recompute the minimum score in the heap (which will be
+                     replaced with the next doc that has to be inserted)
+                     */
+                    if ($full_heap) {
+                        $min_score = min($results_heap);
+                        $k_least_score = ['LEAST_SCORE' => $min_score,
+                            'INDEX' => array_search($min_score, $results_heap)];
+                    }
+                }
+                /*
+                 Drop query terms whose maxScores are lower than the current
+                 kth-best score, where k is the max number of results that
+                 can be returned
+                 */
+                if (count($results_heap) == $this->results_per_block) {
+                    $this->compareByMaxScore($query_terms,
+                        $k_least_score['LEAST_SCORE']);
                 }
-                $pages = array_merge($pages, $docs);
                 $found_docs = true;
             }
         }
         if ($found_docs == false) {
             $this->pages = $docs;
             return $docs;
+        } else {
+            // Get the top k result documents from the max heap
+            while (!empty($results_heap)) {
+                $pages[] = $this->extractMaxScoringDoc($results_heap)['DOC'];
+            }
         }
         $this->count_block_unfiltered = count($pages);
         $this->pages = $pages;
         $this->count_block = count($pages);
         return $pages;
     }
+    /**
+     * Compares each query term's maxScore with the current least score in
+     * the heap of result documents (i.e., the current kth-best score). When
+     * a term's maxScore is <= that least score, the word iterator associated
+     * with the term is removed and the term is dropped from $query_terms.
+     *
+     * For a term nested inside an IntersectIterator, the word iterator is
+     * removed from that IntersectIterator's own list of iterators. For a
+     * term that is a direct child of this union, the iterator is removed
+     * from this union's list, $this->num_iterators is refreshed, and the
+     * ITERATOR indices of the remaining terms are shifted to stay valid.
+     *
+     * NOTE(review): removing a term from an IntersectIterator relaxes that
+     * intersection; confirm this is the intended pruning behaviour.
+     * NOTE(review): $this->key_iterator_table entries recorded before a
+     * removal are not re-indexed here.
+     *
+     * @param array &$query_terms term => ['ITERATOR' => index of the child
+     *      iterator in this union, 'MAX_SCORE' => upper bound score]; terms
+     *      whose iterators are removed are unset from this array
+     * @param float $least_score current kth-best score
+     */
+    public function compareByMaxScore(&$query_terms, $least_score)
+    {
+        /* iterate over a snapshot of the keys, but always read the live
+           entry, because ITERATOR indices are rewritten after a removal
+         */
+        foreach (array_keys($query_terms) as $query_term) {
+            $term_info = $query_terms[$query_term];
+            if ($term_info['MAX_SCORE'] > $least_score) {
+                continue;
+            }
+            $iterator_index = $term_info['ITERATOR'];
+            if (!isset($this->index_bundle_iterators[$iterator_index])) {
+                continue;
+            }
+            $iterator = $this->index_bundle_iterators[$iterator_index];
+            if ($iterator instanceof IntersectIterator) {
+                $num_nested = count($iterator->index_bundle_iterators);
+                for ($j = 0; $j < $num_nested; $j++) {
+                    $nested = $iterator->index_bundle_iterators[$j];
+                    /* loose comparison on purpose: numeric term keys are
+                       turned into integers when used as array keys
+                     */
+                    if (isset($nested->word_key) &&
+                        $nested->word_key == $query_term) {
+                        // splice the nested array, not the iterator object
+                        array_splice($iterator->index_bundle_iterators,
+                            $j, 1);
+                        // presumably mirrors this class's counter; verify
+                        if (isset($iterator->num_iterators)) {
+                            $iterator->num_iterators =
+                                count($iterator->index_bundle_iterators);
+                        }
+                        unset($query_terms[$query_term]);
+                        break;
+                    }
+                }
+            } elseif (isset($iterator->word_key) &&
+                $iterator->word_key == $query_term) {
+                array_splice($this->index_bundle_iterators,
+                    $iterator_index, 1);
+                $this->num_iterators = count($this->index_bundle_iterators);
+                unset($query_terms[$query_term]);
+                // iterators after the removed one moved down by one slot
+                foreach ($query_terms as $other_term => $other_info) {
+                    if ($other_info['ITERATOR'] > $iterator_index) {
+                        $query_terms[$other_term]['ITERATOR']--;
+                    }
+                }
+            }
+        }
+    }
+    /**
+     * Removes and returns the root entry of the max heap of result
+     * documents, then restores the heap order among the remaining entries.
+     *
+     * @param array &$heap list (indices 0..n-1) of entries of the form
+     *      ['SCORE' => number, 'DOC' => document record]
+     * @return array|null the removed root entry (read its 'DOC' field to
+     *      get the document), or null if the heap was empty
+     */
+    public function extractMaxScoringDoc(&$heap)
+    {
+        /* without this guard an empty heap would trigger undefined index
+           warnings and be left holding a single null entry
+         */
+        if (empty($heap)) {
+            return null;
+        }
+        $top_doc = $heap[0];
+        $last_index = count($heap) - 1;
+        $last_doc = $heap[$last_index];
+        unset($heap[$last_index]);
+        if ($last_index > 0) {
+            // move the last leaf to the root and sift it down
+            $heap[0] = $last_doc;
+            $this->heapifyDown($heap, 0);
+        }
+        return $top_doc;
+    }
+    /**
+     * Restores max heap order by sifting the entry at $index down until
+     * neither of its children has a larger score (used after the root has
+     * been replaced during extraction).
+     *
+     * Entries are ordered by their 'SCORE' field only; the attached document
+     * record takes no part in the comparison.
+     *
+     * @param array &$heap list (indices 0..n-1) of entries of the form
+     *      ['SCORE' => number, 'DOC' => document record]
+     * @param int $index position at which to begin the sift down
+     */
+    public function heapifyDown(&$heap, $index)
+    {
+        $heap_size = count($heap);
+        $index = (int)$index;
+        while ($index < $heap_size) {
+            $left = 2 * $index + 1;
+            $right = $left + 1;
+            $largest = $index;
+            if ($left < $heap_size &&
+                $heap[$left]['SCORE'] > $heap[$largest]['SCORE']) {
+                $largest = $left;
+            }
+            if ($right < $heap_size &&
+                $heap[$right]['SCORE'] > $heap[$largest]['SCORE']) {
+                $largest = $right;
+            }
+            if ($largest == $index) {
+                // both children (if any) are no larger: heap order holds
+                break;
+            }
+            $temp_doc = $heap[$largest];
+            $heap[$largest] = $heap[$index];
+            $heap[$index] = $temp_doc;
+            $index = $largest;
+        }
+    }
+    /**
+     * Restores max heap order by sifting the entry at $index up until its
+     * parent has a score at least as large (used after a new entry has been
+     * written into the heap).
+     *
+     * Entries are ordered by their 'SCORE' field only; the attached document
+     * record takes no part in the comparison.
+     *
+     * @param array &$heap list (indices 0..n-1) of entries of the form
+     *      ['SCORE' => number, 'DOC' => document record]
+     * @param int $index position at which to begin the sift up
+     */
+    public function heapifyUp(&$heap, $index)
+    {
+        $index = (int)$index;
+        while ($index > 0) {
+            // integer division keeps the parent position an int, not float
+            $parent_index = intdiv($index - 1, 2);
+            if ($heap[$parent_index]['SCORE'] >= $heap[$index]['SCORE']) {
+                break;
+            }
+            $temp_doc = $heap[$parent_index];
+            $heap[$parent_index] = $heap[$index];
+            $heap[$index] = $temp_doc;
+            $index = $parent_index;
+        }
+    }
+
+    /**
+     * Collects the query terms of the word iterators beneath this union
+     * iterator. Direct children are inspected as is; for a child that is an
+     * IntersectIterator its own child iterators are inspected instead.
+     * Iterators without a word_key property are ignored, as are word keys
+     * that begin with a meta word from PhraseParser::$meta_words_list
+     * (after ':' in the meta word has been replaced by '3A').
+     *
+     * @return array word_key => ['ITERATOR' => index of the child iterator
+     *      of this union that the term came from, 'MAX_SCORE' => value
+     *      returned by getMaxScoreForTerm for the word key]
+     */
+    public function getQueryTerms()
+    {
+        $terms = [];
+        for ($pos = 0; $pos < $this->num_iterators; $pos++) {
+            $current = $this->index_bundle_iterators[$pos];
+            $candidates = ($current instanceof IntersectIterator) ?
+                $current->index_bundle_iterators : [$current];
+            foreach ($candidates as $candidate) {
+                if (!property_exists($candidate, 'word_key')) {
+                    continue;
+                }
+                $key = $candidate->word_key;
+                $is_meta = false;
+                foreach (PhraseParser::$meta_words_list as $meta) {
+                    $encoded_meta = str_replace(':', '3A', $meta);
+                    if (str_starts_with($key, $encoded_meta)) {
+                        $is_meta = true;
+                        break;
+                    }
+                }
+                if ($is_meta) {
+                    continue;
+                }
+                $bound = $this->getMaxScoreForTerm($key);
+                $terms[$key] = ['ITERATOR' => $pos, 'MAX_SCORE' => $bound];
+            }
+        }
+        return $terms;
+    }
+
+    /**
+     * Computes the maxScore value for the supplied term as
+     * 2.2 * log(total_num_docs / number of documents containing the term),
+     * where the document count comes from
+     * IndexManager::discountedNumDocsTerm with discounting switched off.
+     *
+     * @param string $term to find score of
+     * @return float maxScore, or 0.0 when the document count for the term
+     *      is zero
+     */
+    public function getMaxScoreForTerm($term)
+    {
+        $num_docs_with_term = IndexManager::discountedNumDocsTerm($term,
+            $this->index_name, false);
+        if ($num_docs_with_term == 0) {
+            return 0.0;
+        }
+        return 2.2 * log($this->total_num_docs / $num_docs_with_term);
+    }
     /**
      * Forwards the iterator one group of docs
      * @param array $gen_doc_offset a generation, doc_offset pair. If set,
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 4427bc299..9f75215fe 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -710,6 +710,7 @@ class WordIterator extends IndexBundleIterator
             }
             list($preface_positions, $num_description_scores) =
                 array_values(array_shift($doc_info));
+            $num_description_scores = intval($num_description_scores);
             $posting["PATH_KEYWORDS_END_POS"] = ($preface_positions & 255);
             $preface_positions = $preface_positions >> 8;
             $posting["TITLE_END_POS"] = ($preface_positions & 255);
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index e1a554eea..d56b3320d 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -227,6 +227,7 @@ class PhraseModel extends ParallelModel
         $results = null;
         $answer_score_map = [];
         $word_structs = [];
+        $use_conjunctive = C\USE_CONJUNCTIVE_QUERY;
         /*
             this is a quick and dirty parsing and will usually work,
             exceptions would be # or | in quotes or if someone tried
@@ -284,7 +285,6 @@ class PhraseModel extends ParallelModel
         foreach ($query_parts as $phrase => $pre_result_bounds) {
             $phrase_high = $pre_result_bounds[0][1];
             $result_bounds = [];
-            $start_flag = false;
             $num_bounds = 0;
             foreach ($pre_result_bounds as $bound) {
                 if ($bound[0] > $results_high) {
@@ -343,10 +343,17 @@ class PhraseModel extends ParallelModel
             if ($cache_results) {
                 list($word_structs, $format_words) = $cache_results;
             } else {
-                $disjunct_phrases = explode("|", $phrase);
+                if ($use_conjunctive) {
+                    $disjunct_phrases = explode("|", $phrase);
+                } else {
+                    $disjunct_phrases = $this->parseWordStructDisjunctiveQuery(
+                        $phrase, $guess_semantics);
+                    $phrase = implode(" | ", $disjunct_phrases);
+                }
                 $can_use_query_map = $can_use_query_map &&
                     (count($disjunct_phrases) == 1) && !empty($filter) &&
                     $low == 0;
+                $original_has_disjuncts = (strpos($phrase, "|") !== false);
                 $query_map_results = [];
                 $query_map_urls = [];
                 if ($can_use_query_map) {
@@ -367,8 +374,7 @@ class PhraseModel extends ParallelModel
                         $map_cnt++;
                     }
                 }
-                if ($guess_semantics) {
-                    $original_has_disjuncts = (strpos($phrase, "|") !== false);
+                if ($use_conjunctive && $guess_semantics) {
                     $repeat_check = [];
                     $phrase = "";
                     $delim = " ";
@@ -383,7 +389,7 @@ class PhraseModel extends ParallelModel
                         $delim = " | ";
                     }
                     $disjunct_phrases = explode("|", $phrase);
-                        }
+                }
                 if (C\QUERY_STATISTICS) {
                     $this->query_info['QUERY'] .=
                         "$in2<b>Guessed Semantics</b>:<br>$in2$phrase<br>";
@@ -396,7 +402,8 @@ class PhraseModel extends ParallelModel
                             $dis_cnt++;
                         }
                         list($word_struct, $format_words) =
-                            $this->parseWordStructConjunctiveQuery($disjunct);
+                            $this->parseWordStructConjunctiveQuery(
+                                $disjunct, $use_conjunctive);
                         if ($word_struct != null) {
                             $word_structs[] = $word_struct;
                         }
@@ -567,6 +574,95 @@ class PhraseModel extends ParallelModel
         }
         return $results;
     }
+    /**
+     * Separates the cumulative search string into a series of
+     * disjunctive phrases to be looked up in the current index.
+     *
+     * The phrases are gathered in this order: each &quot;...&quot; quoted
+     * run (kept whole), then the space separated tokens of the remainder,
+     * where tokens joined by a standalone &amp; token are merged into one
+     * phrase, then any phrases PhraseParser::extractPhrases derives from the
+     * single word tokens. Tokens beginning with an entry of
+     * PhraseParser::$meta_words_list are pulled out and appended to every
+     * phrase instead of forming phrases of their own. Empty tokens and
+     * repeated phrases are dropped. If the query consists only of meta
+     * words, they are returned together as a single phrase.
+     *
+     * @param string &$search_phrase entered by user (only read here)
+     * @param boolean $guess_semantics whether semantics should be
+     * guessed from the query string or not
+     * @return array of disjunct search phrases
+     */
+    public function parseWordStructDisjunctiveQuery(&$search_phrase,
+                                                    $guess_semantics)
+    {
+        $phrase = $search_phrase;
+        $search_terms = [];
+        /*
+        Extracts all terms specified within quotes into a single
+        conjunctive query
+        */
+        preg_match_all('/&quot;(.*?)&quot;/', $phrase, $matches);
+        foreach ($matches[0] as $match) {
+            $phrase = str_replace($match, '', $phrase);
+            $search_terms[] = $match;
+        }
+        $split_phrase = explode(" ", trim($phrase));
+        $num_parts = count($split_phrase);
+        // Extracts all terms separated by '&' into a single conjunctive query
+        for ($i = 0; $i < $num_parts; $i++) {
+            $s = '';
+            while ($i < $num_parts - 1 &&
+                $split_phrase[$i + 1] == '&amp;') {
+                $s .= $split_phrase[$i] . " ";
+                $i += 2;
+            }
+            // a trailing '&amp;' leaves $i past the last token
+            if ($i < $num_parts) {
+                $s .= $split_phrase[$i];
+            }
+            $s = trim($s);
+            // repeated spaces produce empty tokens; they are not terms
+            if ($s !== '') {
+                $search_terms[] = $s;
+            }
+        }
+        /*
+        Extracts any meta tags specified in the search phrase. A new list
+        is built rather than splicing in place so that the term following
+        a removed meta word is not skipped
+        */
+        $meta_words = [];
+        $plain_terms = [];
+        foreach ($search_terms as $search_term) {
+            $is_meta = false;
+            foreach (PhraseParser::$meta_words_list as $meta_word) {
+                if (str_starts_with($search_term, $meta_word)) {
+                    $is_meta = true;
+                    break;
+                }
+            }
+            if ($is_meta) {
+                $meta_words[] = $search_term;
+            } else {
+                $plain_terms[] = $search_term;
+            }
+        }
+        $search_terms = $plain_terms;
+        if (empty($search_terms)) {
+            if (empty($meta_words)) {
+                return [];
+            }
+            // a query made only of meta words is still a valid query
+            $search_terms[] = implode(" ", $meta_words);
+        } else {
+            // only single, unquoted words are offered for phrase extraction
+            $phrase = '';
+            foreach ($search_terms as $search_term) {
+                if (!str_contains($search_term, ' ') &&
+                    !str_contains($search_term, '&quot;')) {
+                    $phrase .= $search_term . ' ';
+                }
+            }
+            $locale_tag = L\guessLocaleFromString($search_terms[0]);
+            $new_terms = PhraseParser::extractPhrases($phrase, $locale_tag);
+            foreach ($new_terms as $new_term) {
+                $new_term = trim($new_term);
+                if (strlen($new_term) > 0 &&
+                    !in_array($new_term, $search_terms, true)) {
+                    $search_terms[] = $new_term;
+                }
+            }
+            if (!empty($meta_words)) {
+                $meta_suffix = " " . implode(" ", $meta_words);
+                foreach (array_keys($search_terms) as $i) {
+                    $search_terms[$i] .= $meta_suffix;
+                }
+            }
+        }
+        /*
+        The resultant array holds multiple strings, each signifying
+        a disjunctive query; repeated phrases are kept only once
+        */
+        $repeat_check = [];
+        $disjuncts = [];
+        foreach ($search_terms as $term) {
+            $check = trim($term);
+            if (isset($repeat_check[$check])) {
+                continue;
+            }
+            $repeat_check[$check] = true;
+            $disjuncts[] = ($guess_semantics) ?
+                $this->guessSemantics($term) : $term;
+        }
+        return $disjuncts;
+    }
     /**
      * Parses from a string phrase representing a conjunctive query, a struct
      * consisting of the words keys searched for, the allowed and disallowed
@@ -576,9 +672,11 @@ class PhraseModel extends ParallelModel
      * @param string &$phrase string to extract struct from, if the phrase
      *  semantics is guessed or an if condition is processed the value of
      *  phrase will be altered. (Helps for feeding to network queries)
+     * @param boolean $use_conjunctive whether the search query is using
+     * conjunctive or disjunctive query logic
      * @return array struct representing the conjunctive query
      */
-    public function parseWordStructConjunctiveQuery(&$phrase)
+    public function parseWordStructConjunctiveQuery(&$phrase, $use_conjunctive)
     {
         $query = $phrase;
         $indent= "&nbsp;&nbsp;";
@@ -648,10 +746,14 @@ class PhraseModel extends ParallelModel
                 }
                 $quote_positions[] = $term_positions_within_quoted_query;
             } else {
-                $new_words =
-                    PhraseParser::extractPhrases($phrase_part, $locale_tag,
-                    $index_name);
-                $base_words = array_merge($base_words, $new_words);
+                if ($use_conjunctive) {
+                    $new_words =
+                        PhraseParser::extractPhrases($phrase_part, $locale_tag,
+                            $index_name);
+                    $base_words = array_merge($base_words, $new_words);
+                } else {
+                    $base_words[] = $phrase_part;
+                }
             }
             $num_words = count($base_words);
             $quote_state = ($quote_state) ? false : true;
@@ -706,7 +808,7 @@ class PhraseModel extends ParallelModel
             $hashes = [];
             $word_keys = [];
             foreach ($words as $word) {
-                $word_keys[] = $make_term_id($word);
+                $word_keys[] = $make_term_id(trim($word));
             }
             if (count($word_keys) == 0) {
                 $word_keys = null;
@@ -1909,7 +2011,15 @@ class PhraseModel extends ParallelModel
         } elseif ($num_iterators == 1) {
             $union_iterator = $iterators[0];
         } else {
-            $union_iterator = new I\UnionIterator($iterators);
+            $actual_index_name = $index_name;
+            if (($index_name[0] == "-")) {
+                $actual_index_name = substr($index_name, 1);
+            }
+            $index = IndexManager::getIndex($actual_index_name);
+            $index_info = $index->getArchiveInfo($index->dir_name);
+            $N = $index_info['VISITED_URLS_COUNT'];
+            $union_iterator = new I\UnionIterator($iterators,
+                $actual_index_name, $N);
         }
         $raw = intval($raw);
         if ($raw > 0) {

ViewGit