viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/configs/Config.php b/src/configs/Config.php index 9e8a4c430..557fc8cff 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -352,6 +352,8 @@ nsdefine('ONE_HOUR', 3600); nsdefine('ONE_MINUTE', 60); /** Number of seconds in a second */ nsdefine('ONE_SECOND', 1); +/** Whether to use conjunctive search queries or disjunctive */ +nsconddefine('USE_CONJUNCTIVE_QUERY', false); /** setting Profile.php to something else in LocalConfig.php allows one to have * two different yioop instances share the same work_directory but maybe have * different configuration settings. This might be useful if one was diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index ad83faca8..e2a920a92 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -323,17 +323,20 @@ class IndexManager implements CrawlConstants * @param string $term what to look up in the indexes dictionary * no mask is used for this look up * @param string $index_name index to look up term or phrase in + * @param boolean $discount_terms whether terms should be discounted + * based on their generation or not * @return int number of documents */ - public static function discountedNumDocsTerm($term, $index_name) + public static function discountedNumDocsTerm($term, $index_name, + $discount_terms = true) { static $num_docs_cache = []; if (isset($num_docs_cache[$index_name][$term])) { return $num_docs_cache[$index_name][$term]; } $version = self::getVersion($index_name); - $term_id = ($version > 2) ? canonicalTerm($term) : - crawlHashWord($term, true); + $term_id = $discount_terms ? (($version > 2) ? canonicalTerm($term) : + crawlHashWord($term, true)) : $term; $word_info = self::getWordInfo($index_name, $term_id, -1, 0, C\NUM_DISTINCT_GENERATIONS); if ($version >= 3 && !empty($word_info)) { @@ -351,7 +354,7 @@ class IndexManager implements CrawlConstants $generation = $generation_info['PARTITION']; $num_docs = $generation_info['NUM_DOCS']; } - $discount = max($generation + 1, $i++); + $discount = $discount_terms ? max($generation + 1, $i++) : 1; $total += $num_docs / $discount; } if (count($num_docs_cache) > 1000) { diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index cb7d34269..e8beae9d5 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -191,7 +191,7 @@ class IntersectIterator extends IndexBundleIterator if ($status == -1) { return -1; } - //next we finish computing BM25F + //next we finish computing divergence from randomness $docs = $this->index_bundle_iterators[0]->currentDocsWithWord(); $weight = $this->weight; if (is_array($docs) && count($docs) == 1) { diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php index 80d51d14d..d556a42ef 100644 --- a/src/library/index_bundle_iterators/UnionIterator.php +++ b/src/library/index_bundle_iterators/UnionIterator.php @@ -30,6 +30,8 @@ */ namespace seekquarry\yioop\library\index_bundle_iterators; +use seekquarry\yioop\library\IndexManager; +use seekquarry\yioop\library\PhraseParser; /** * Used to iterate over the documents which occur in any of a set of * WordIterator results @@ -65,13 +67,26 @@ class UnionIterator extends IndexBundleIterator * @var array */ public $key_iterator_table; + /** + * The timestamp of the index associated with this iterator + * @var string + */ + public $index_name; + /** + * The total count of indexed documents in the current index + * @var int + */ + public $total_num_docs; /** * Creates a union iterator with the given parameters. * * @param object $index_bundle_iterators to use as a source of documents * to iterate over + * @param string $index_name time_stamp of the index to use + * @param int $total_num_docs total number of documents in the current index */ - public function __construct($index_bundle_iterators) + public function __construct($index_bundle_iterators, + $index_name, $total_num_docs) { $this->index_bundle_iterators = $index_bundle_iterators; /* @@ -84,6 +99,8 @@ class UnionIterator extends IndexBundleIterator $this->key_iterator_table = []; $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; + $this->index_name = $index_name; + $this->total_num_docs = $total_num_docs; for ($i = 0; $i < $this->num_iterators; $i++) { $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; /* @@ -140,30 +157,229 @@ class UnionIterator extends IndexBundleIterator { $pages = []; $docs = []; - $high_score = []; - $high_score = []; $found_docs = false; + $results_heap = []; + $k_least_score = ['LEAST_SCORE' => 0, 'INDEX' => 0]; + $query_terms = $this->getQueryTerms(); for ($i = 0; $i < $this->num_iterators; $i++) { $docs = $this->index_bundle_iterators[$i]->currentDocsWithWord(); if (is_array($docs)) { - $doc_keys = array_keys($docs); - foreach ($doc_keys as $key) { - $docs[$key]["ITERATOR"] = $i; - $this->key_iterator_table[$key] = $i; + /* + Iterate over all the documents fetched and add a doc to the + results' max heap only if the heap is not full / the + relevance score of the doc is greater than the current kth-best + score + */ + foreach ($docs as $doc_key => $doc) { + $doc["ITERATOR"] = $i; + $this->key_iterator_table[$doc_key] = $i; + $score = $doc[self::RELEVANCE]; + $full_heap = + (count($results_heap) == $this->results_per_block); + if ($full_heap && $score <= $k_least_score['LEAST_SCORE']) { + continue; + } else { + $next_page_index = $full_heap ? + $k_least_score['INDEX'] : + count($results_heap); + $results_heap[$next_page_index]['SCORE'] = $score; + $results_heap[$next_page_index]['DOC'] = $doc; + $this->heapifyUp($results_heap, $next_page_index); + } + /* + If the heap is full after inserting the new doc, + recompute the minimum score in the heap (which will be + replaced with the next doc that has to be inserted) + */ + if ($full_heap) { + $min_score = min($results_heap); + $k_least_score = ['LEAST_SCORE' => $min_score, + 'INDEX' => array_search($min_score, $results_heap)]; + } + } + /* + Drop query terms whose maxScores are lower than the current + kth-best score, where k is the max number of results that + can be returned + */ + if (count($results_heap) == $this->results_per_block) { + $this->compareByMaxScore($query_terms, + $k_least_score['LEAST_SCORE']); } - $pages = array_merge($pages, $docs); $found_docs = true; } } if ($found_docs == false) { $this->pages = $docs; return $docs; + } else { + // Get the top k result documents from the max heap + while (!empty($results_heap)) { + $pages[] = $this->extractMaxScoringDoc($results_heap)['DOC']; + } } $this->count_block_unfiltered = count($pages); $this->pages = $pages; $this->count_block = count($pages); return $pages; } + /** + * Compares each of the query terms' maxScores with the current + * least score in the max heap of result documents (i.e., the current + * kth-best score). If the term's maxScore is <= the current least score + * in the top k results, remove the word iterator associated with that + * term, as it will never make it to the top k documents. + * + * @param array $query_terms on this union iterator + * @param int $least_score current kth-best score + */ + public function compareByMaxScore(&$query_terms, $least_score) + { + foreach ($query_terms as $query_term => $term_info) { + if ($term_info['MAX_SCORE'] <= $least_score) { + $iterator_index = $term_info['ITERATOR']; + $iterator = $this->index_bundle_iterators[$iterator_index]; + if ($iterator instanceof IntersectIterator) { + $word_iterators = $iterator->index_bundle_iterators; + for ($j = 0; $j < count($word_iterators); $j++) { + if ($word_iterators[$j]->word_key == $query_term) { + array_splice($this-> + index_bundle_iterators[$iterator_index], $j, 1); + unset($query_terms[$query_term]); + break; + } + } + } else { + if ($iterator->word_key == $query_term) { + array_splice($this->index_bundle_iterators, + $iterator_index, 1); + unset($query_terms[$query_term]); + } + } + } + } + } + /** + * Gets the top-scoring document in the max heap of result documents. + * + * @param array $heap of result docs + * @return object top-scoring document + */ + public function extractMaxScoringDoc(&$heap) + { + $top_doc = $heap[0]; + $last_index = count($heap) - 1; + $heap[0] = $heap[$last_index]; + unset($heap[$last_index]); + $this->heapifyDown($heap, 0); + return $top_doc; + } + /** + * Reheaps the given heap using bubble down operations (after extracting + * the root document from the heap). + * + * @param array $heap of result docs + * @param int $index to begin heapifyDown operation + */ + public function heapifyDown(&$heap, $index) + { + $heap_size = count($heap); + while ($index < $heap_size) { + $left = $index * 2 + 1; + $right = $index * 2 + 2; + $top_doc = $index; + if ($left < $heap_size && $heap[$left] > $heap[$top_doc]) { + $top_doc = $left; + } + if ($right < $heap_size && $heap[$right] > $heap[$top_doc]) { + $top_doc = $right; + } + if ($top_doc != $index) { + $temp_doc = $heap[$top_doc]; + $heap[$top_doc] = $heap[$index]; + $heap[$index] = $temp_doc; + $index = $top_doc; + } else { + break; + } + } + } + /** + * Reheaps the given heap using bubble up operations (after inserting a new + * document into the heap). + * + * @param array $heap of result docs + * @param int $index to begin heapifyUp operation + */ + public function heapifyUp(&$heap, $index) + { + if ($index == 0) { + return; + } + while ($index > 0) { + $parent_index = floor(($index-1) / 2); + if ($heap[$parent_index] >= $heap[$index]) { + break; + } + $temp_doc = $heap[$parent_index]; + $heap[$parent_index] = $heap[$index]; + $heap[$index] = $temp_doc; + $index = $parent_index; + } + } + + /** + * This method fetches all the query terms associated with the nested + * word iterators on the current union iterator instance. + * + * @return array of query terms + */ + public function getQueryTerms() + { + $query_terms = []; + for ($i = 0; $i < $this->num_iterators; $i++) { + $iterator = $this->index_bundle_iterators[$i]; + if ($iterator instanceof IntersectIterator) { + $word_iterators = $iterator->index_bundle_iterators; + } else { + $word_iterators = [$iterator]; + } + foreach ($word_iterators as $word_iterator) { + if (property_exists($word_iterator, 'word_key')) { + $word_key = $word_iterator->word_key; + $check_meta = false; + foreach (PhraseParser::$meta_words_list as $meta) { + $meta_word = str_replace(':', '3A', $meta); + if (str_starts_with($word_key, $meta_word)) { + $check_meta = true; + break; + } + } + if (!$check_meta) { + $max_score = $this->getMaxScoreForTerm($word_key); + $query_terms[$word_key] = ['ITERATOR' => $i, + 'MAX_SCORE' => $max_score]; + } + } + } + } + return $query_terms; + } + + /** + * This method calculates the maxScore value for the term supplied. + * + * @param string $term to find score of + * @return float maxScore + */ + public function getMaxScoreForTerm($term) + { + $score = IndexManager::discountedNumDocsTerm($term, + $this->index_name, false); + $max_score = $score == 0 ? 0.0 : 2.2 * + log($this->total_num_docs/$score); + return $max_score; + } /** * Forwards the iterator one group of docs * @param array $gen_doc_offset a generation, doc_offset pair. If set, diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 4427bc299..9f75215fe 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -710,6 +710,7 @@ class WordIterator extends IndexBundleIterator } list($preface_positions, $num_description_scores) = array_values(array_shift($doc_info)); + $num_description_scores = intval($num_description_scores); $posting["PATH_KEYWORDS_END_POS"] = ($preface_positions & 255); $preface_positions = $preface_positions >> 8; $posting["TITLE_END_POS"] = ($preface_positions & 255); diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index e1a554eea..d56b3320d 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -227,6 +227,7 @@ class PhraseModel extends ParallelModel $results = null; $answer_score_map = []; $word_structs = []; + $use_conjunctive = C\USE_CONJUNCTIVE_QUERY; /* this is a quick and dirty parsing and will usually work, exceptions would be # or | in quotes or if someone tried @@ -284,7 +285,6 @@ class PhraseModel extends ParallelModel foreach ($query_parts as $phrase => $pre_result_bounds) { $phrase_high = $pre_result_bounds[0][1]; $result_bounds = []; - $start_flag = false; $num_bounds = 0; foreach ($pre_result_bounds as $bound) { if ($bound[0] > $results_high) { @@ -343,10 +343,17 @@ class PhraseModel extends ParallelModel if ($cache_results) { list($word_structs, $format_words) = $cache_results; } else { - $disjunct_phrases = explode("|", $phrase); + if ($use_conjunctive) { + $disjunct_phrases = explode("|", $phrase); + } else { + $disjunct_phrases = $this->parseWordStructDisjunctiveQuery( + $phrase, $guess_semantics); + $phrase = implode(" | ", $disjunct_phrases); + } $can_use_query_map = $can_use_query_map && (count($disjunct_phrases) == 1) && !empty($filter) && $low == 0; + $original_has_disjuncts = (strpos($phrase, "|") !== false); $query_map_results = []; $query_map_urls = []; if ($can_use_query_map) { @@ -367,8 +374,7 @@ class PhraseModel extends ParallelModel $map_cnt++; } } - if ($guess_semantics) { - $original_has_disjuncts = (strpos($phrase, "|") !== false); + if ($use_conjunctive && $guess_semantics) { $repeat_check = []; $phrase = ""; $delim = " "; @@ -383,7 +389,7 @@ class PhraseModel extends ParallelModel $delim = " | "; } $disjunct_phrases = explode("|", $phrase); - } + } if (C\QUERY_STATISTICS) { $this->query_info['QUERY'] .= "$in2<b>Guessed Semantics</b>:<br>$in2$phrase<br>"; @@ -396,7 +402,8 @@ class PhraseModel extends ParallelModel $dis_cnt++; } list($word_struct, $format_words) = - $this->parseWordStructConjunctiveQuery($disjunct); + $this->parseWordStructConjunctiveQuery( + $disjunct, $use_conjunctive); if ($word_struct != null) { $word_structs[] = $word_struct; } @@ -567,6 +574,95 @@ class PhraseModel extends ParallelModel } return $results; } + /** + * Separates the cumulative search string into a series of + * disjunctive phrases to be looked up in the current index. + * + * @param string $search_phrase entered by user + * @param boolean $guess_semantics whether semantics should be + * guessed from the query string or not + * @return array of disjunct search phrases + */ + public function parseWordStructDisjunctiveQuery(&$search_phrase, + $guess_semantics) + { + $phrase = $search_phrase; + $search_terms = []; + /* + Extracts all terms specified within quotes into a single + conjunctive query + */ + preg_match_all('/"(.*?)"/', $phrase, $matches); + foreach ($matches[0] as $match) { + $phrase = str_replace($match, '', $phrase); + $search_terms[] = $match; + } + $phrase = trim($phrase); + $split_phrase = explode(" ", $phrase); + $split_terms = []; + $s = ''; + // Extracts all terms separated by '&' into a single conjunctive query + for ($i = 0; $i < count($split_phrase); $i++) { + while ($i < count($split_phrase)-1 && + $split_phrase[$i+1] == '&') { + $s .= $split_phrase[$i] . " "; + $i += 2; + } + $s .= $split_phrase[$i]; + $split_terms[] = $s; + $s = ''; + } + $search_terms = array_merge($search_terms, $split_terms); + $meta_words = []; + // Extracts any meta tags specified in the search phrase + for ($i = 0; $i < count($search_terms); $i++) { + foreach (PhraseParser::$meta_words_list as $meta_word) { + if (strpos($search_terms[$i], $meta_word) === 0) { + $meta_words[] = $search_terms[$i]; + array_splice($search_terms, $i, 1); + break; + } + } + } + $phrase = ''; + foreach ($search_terms as $search_term) { + if (!str_contains($search_term, ' ') && + !str_contains($search_term, '"')) { + $phrase .= $search_term . ' '; + } + } + $locale_tag = L\guessLocaleFromString($search_terms[0]); + $new_terms = PhraseParser::extractPhrases($phrase, $locale_tag); + foreach ($new_terms as $new_term) { + $new_term = trim($new_term); + if (!in_array($new_term, $search_terms) && strlen($new_term) > 0) { + $search_terms[] = $new_term; + } + } + for ($i = 0; $i < count($search_terms); $i++) { + foreach ($meta_words as $meta_word) { + $search_terms[$i] .= " " . $meta_word; + } + } + $repeat_check = []; + for ($i = 0; $i < count($search_terms); $i++) { + $term = $search_terms[$i]; + $check = trim($term); + if (isset($repeat_check[$check])) { + continue; + } + $repeat_check[$check] = true; + if ($guess_semantics) { + $query_part = $this->guessSemantics($term); + $search_terms[$i] = $query_part; + } + } + /* + The resultant array holds multiple strings, each signifying + a disjunctive query + */ + return $search_terms; + } /** * Parses from a string phrase representing a conjunctive query, a struct * consisting of the words keys searched for, the allowed and disallowed @@ -576,9 +672,11 @@ class PhraseModel extends ParallelModel * @param string &$phrase string to extract struct from, if the phrase * semantics is guessed or an if condition is processed the value of * phrase will be altered. (Helps for feeding to network queries) + * @param boolean $use_conjunctive whether the search query is using + * conjunctive or disjunctive query logic * @return array struct representing the conjunctive query */ - public function parseWordStructConjunctiveQuery(&$phrase) + public function parseWordStructConjunctiveQuery(&$phrase, $use_conjunctive) { $query = $phrase; $indent= " "; @@ -648,10 +746,14 @@ class PhraseModel extends ParallelModel } $quote_positions[] = $term_positions_within_quoted_query; } else { - $new_words = - PhraseParser::extractPhrases($phrase_part, $locale_tag, - $index_name); - $base_words = array_merge($base_words, $new_words); + if ($use_conjunctive) { + $new_words = + PhraseParser::extractPhrases($phrase_part, $locale_tag, + $index_name); + $base_words = array_merge($base_words, $new_words); + } else { + $base_words[] = $phrase_part; + } } $num_words = count($base_words); $quote_state = ($quote_state) ? false : true; @@ -706,7 +808,7 @@ class PhraseModel extends ParallelModel $hashes = []; $word_keys = []; foreach ($words as $word) { - $word_keys[] = $make_term_id($word); + $word_keys[] = $make_term_id(trim($word)); } if (count($word_keys) == 0) { $word_keys = null; @@ -1909,7 +2011,15 @@ class PhraseModel extends ParallelModel } elseif ($num_iterators == 1) { $union_iterator = $iterators[0]; } else { - $union_iterator = new I\UnionIterator($iterators); + $actual_index_name = $index_name; + if (($index_name[0] == "-")) { + $actual_index_name = substr($index_name, 1); + } + $index = IndexManager::getIndex($actual_index_name); + $index_info = $index->getArchiveInfo($index->dir_name); + $N = $index_info['VISITED_URLS_COUNT']; + $union_iterator = new I\UnionIterator($iterators, + $actual_index_name, $N); } $raw = intval($raw); if ($raw > 0) {