diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index 79e58dbe7..5401c77ad 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -264,7 +264,7 @@ interface CrawlConstants const SCRAPER_INFO = 'eq'; const SEQUENCE_NUMBER = 'er'; const FETCHER_QUEUE_SERVER_RATIO = 'es'; - const NEXT_DOC = 'et'; + const GEN_OFFSET = 'et'; const ITERATOR = 'eu'; const MAX_SCORE = 'ev'; } diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php index c58a9d757..03b3825e6 100644 --- a/src/library/index_bundle_iterators/GroupIterator.php +++ b/src/library/index_bundle_iterators/GroupIterator.php @@ -208,18 +208,9 @@ class GroupIterator extends IndexBundleIterator $pages = -1; } } else if (!empty($new_pages)) { - if (count($new_pages) == 1) { - $pages = array_merge($pages, $new_pages); - } else { - $pages = $new_pages; - $done = true; - } + $pages += $new_pages; $count = count($pages); } - if (isset($this->index_bundle_iterator->hard_query)) { - $this->results_per_block = - $this->index_bundle_iterator->hard_query; - } if ($count < $this->results_per_block && !$done) { $this->index_bundle_iterator->advance(); } else { diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php index dfb27e68d..e1554322d 100644 --- a/src/library/index_bundle_iterators/IndexBundleIterator.php +++ b/src/library/index_bundle_iterators/IndexBundleIterator.php @@ -132,26 +132,6 @@ abstract class IndexBundleIterator implements CrawlConstants { return 5; } - /** - * This method calculates the max relevance value for query underlying - * the iterator to the document currently being iterated over - * by the query - * @return float maximum score for document relevance to a query - */ - public function getMaxRelevanceScore() - { - return 0.01; - } - /** - * This method calculates the maximum overall score value for any document - * returned by this iterator. It should be overriden in subclasses as - * makes sense - * @return float maximum score - */ - public function getMaxScore() - { - return $this->getMaxDocQualityScore() + $this->getMaxRelevanceScore(); - } /** * Returns a string representation of a plan by which the current iterator * finds its results @@ -347,13 +327,13 @@ abstract class IndexBundleIterator implements CrawlConstants $remaining_partitions = ($is_ascending) ? $number_of_partitions - $num_seen_partitions : $num_seen_partitions - 1; - $pre_rank_and_bonuses = ($remaining_partitions * + $rank_and_bonuses = ($remaining_partitions * $this->avg_items_per_partition)/ (($number_of_partitions + 1) * ($avg_items_per_partition + 1)) + $last_partition_pos / $max_items_per_partition; if (IndexDocumentBundle::isAHostDocId($doc_key)) { - $pre_rank_and_bonuses += + $rank_and_bonuses += (IndexDocumentBundle::isACldDocId($doc_key)) ? $cld_bonus : $host_bonus; } @@ -374,12 +354,11 @@ abstract class IndexBundleIterator implements CrawlConstants IndexDocumentBundle::DOCID_PART_LEN << 1] ?? 0) & 96; if ($doc_id_format != 96) { if (IndexDocumentBundle::isAWikipediaPage($doc_key)) { - $pre_rank_and_bonuses += $wiki_bonus; + $rank_and_bonuses += $wiki_bonus; } - $pre_rank_and_bonuses += $num_slashes_bonus / + $rank_and_bonuses += $num_slashes_bonus / (IndexDocumentBundle::findNumSlashes($doc_key) + 1); } - return $this->getMaxDocQualityScore() * - $pre_rank_and_bonuses / $max_pre_rank_and_bonuses; + return $rank_and_bonuses; } } diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index 613ca0011..8e232a0dc 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -313,90 +313,6 @@ class IntersectIterator extends IndexBundleIterator } } } - /** - * Given the position_lists of a collection of terms computes - * a score for how close those words were in the given document - * - * @param array &$word_position_lists a 2D array item - * number => position_list (locations in doc where item occurred) for - * that item. - * @param array &$word_len_lists length for each item of its position list - * @param bool $is_doc whether this is the position list of a document - * or a link - * @param int $doc_len the length of the document - * @return sum of inverse of all covers computed by plane sweep algorithm - */ - public function computeProximity(&$word_position_lists, &$word_len_lists, - $is_doc, $doc_len) - { - $num_iterators = $this->num_iterators; - if ($num_iterators < 1) { - return 0; - } - $covers = []; - $position_list = $word_position_lists; - $interval = []; - $num_words = count($position_list); - for ($i = 0; $i < $num_words; $i++) { - $min = (!empty($position_list[$i])) ? - array_shift($position_list[$i]) : null; - if (empty($min)) { - break; - } else { - array_push($interval, [$min, $i]); - for ($j = 0; $j < $num_words; $j++) { - if (isset($position_list[$j][0]) && - $min == $position_list[$j][0]) { - array_shift($position_list[$j]); - } - } - } - } - if (count($interval) != $num_words) { - return 0; - } - sort($interval); - $l = array_shift($interval); - $r = end($interval); - $stop = false; - if (count($position_list[$l[1]]) == 0) { - $stop = true; - } - while(!$stop) { - $p = array_shift($position_list[$l[1]]); - for ($i = 0;$i < $num_words; $i++){ - if (isset($position_list[$i][0]) && - $p == $position_list[$i][0]) { - array_shift($position_list[$i]); - } - } - $q = $interval[0][0]; - if ($p > $r[0]) { - array_push($covers, [$l[0], $r[0]]); - array_push($interval, [$p, $l[1]]); - } else { - if ($p < $q) { - array_unshift($interval, [$p, $l[1]]); - } else { - array_push($interval, [$p, $l[1]]); - sort($interval); - } - } - $l = array_shift($interval); - $r = end($interval); - if (count($position_list[$l[1]]) == 0) { - $stop = true; - } - } - array_push($covers, [$l[0],$r[0]]); - $score = 0; - foreach ($covers as $cover) { - $score += (1/($cover[1] - $cover[0] + 1)); - } - $score = ($num_words * $score)/max($doc_len, 1); - // this will ensure the score is less than 1 - return $score; - } /** * Finds the next generation and doc offset amongst all the iterators * that contains the word. It assumes that the (generation, doc offset) @@ -545,18 +461,4 @@ class IntersectIterator extends IndexBundleIterator an intersect iterator", E_USER_ERROR); } } - /** - * This method calculates the max relevance value for query underlying - * the iterator to the document currently being iterated over - * by the query - * @return float maximum score for document relevance to a query - */ - public function getMaxRelevanceScore() - { - $max_relevance = 0; - foreach ($this->index_bundle_iterators as $iterator) { - $max_relevance += $iterator->getMaxRelevanceScore(); - } - return $max_relevance; - } } diff --git a/src/library/index_bundle_iterators/NetworkIterator.php b/src/library/index_bundle_iterators/NetworkIterator.php index fd9382e06..8510fd0a9 100644 --- a/src/library/index_bundle_iterators/NetworkIterator.php +++ b/src/library/index_bundle_iterators/NetworkIterator.php @@ -86,12 +86,6 @@ class NetworkIterator extends IndexBundleIterator * @var int */ public $num_downloaded; - /** - * Used to keep track of the original desired number of results to be - * returned in one find docs call versus the number actually retrieved. - * @var int - */ - public $hard_query; /** * Flags used to keep track of whether a given machine has more search * result data. Array of booleans @@ -140,7 +134,6 @@ class NetworkIterator extends IndexBundleIterator $this->next_results_per_server = self::serverAdjustedResultsPerBlock($num_servers, $this->results_per_block); - $this->hard_query = false; $this->base_query = "q=" . urlencode($query). "&f=serial&network=false&raw=1&its=$timestamp&guess=false"; foreach (["cld_url_bonus" => C\CLD_URL_BONUS, @@ -181,7 +174,6 @@ class NetworkIterator extends IndexBundleIterator self::serverAdjustedResultsPerBlock($num_servers, $this->results_per_block); $count = count($this->queue_servers); - $this->hard_query = false; for ($i = 0; $i < $count; $i++) { $this->more_flags[$i] = true; } @@ -306,9 +298,6 @@ class NetworkIterator extends IndexBundleIterator } } $machine_times = substr( $machine_times, 0, -strlen("<br>")); - if (isset($pre_result["HARD_QUERY"])) { - $this->hard_query = $pre_result["HARD_QUERY"]; - } if ($num_with_results > 0) { $this->next_results_per_server = self::serverAdjustedResultsPerBlock($num_with_results, diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php index 82e4340cc..4ea3975fe 100644 --- a/src/library/index_bundle_iterators/UnionIterator.php +++ b/src/library/index_bundle_iterators/UnionIterator.php @@ -63,11 +63,6 @@ class UnionIterator extends IndexBundleIterator * @var int */ public $seen_docs_unfiltered; - /** - * stores a mapping between seen doc keys and which iterator they came from - * @var array - */ - public $key_iterator_table; /** * The timestamp of the index associated with this iterator * @var string @@ -78,21 +73,6 @@ class UnionIterator extends IndexBundleIterator * @var int */ public $total_num_docs; - /** - * Heap of query terms whose scores are considered while finding results - * @var array - */ - public $terms_heap; - /** - * Heap of query terms whose scores are not considered while finding results - * @var array - */ - public $low_scoring_terms; - /** - * Heap of result documents - * @var array - */ - public $results_heap; /** * Creates a union iterator with the given parameters. * @@ -104,33 +84,34 @@ class UnionIterator extends IndexBundleIterator public function __construct($index_bundle_iterators, $index_name, $total_num_docs) { - $this->index_bundle_iterators = $index_bundle_iterators; /* estimate number of results by sum of all iterator counts, then improve estimate as iterate */ - $this->num_iterators = count($index_bundle_iterators); + $num_iterators = count($index_bundle_iterators); + $this->num_iterators = $num_iterators; $this->num_docs = 0; /* result_per_block is at most the sum of results_per_block of things we are iterating. Value is already init'd in base class. */ - $this->results_per_block = C\MIN_RESULTS_TO_GROUP; - $this->key_iterator_table = []; + $this->results_per_block = intval(C\MIN_RESULTS_TO_GROUP); $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; $this->index_name = $index_name; $this->total_num_docs = $total_num_docs; - $this->low_scoring_terms = []; - for ($i = 0; $i < $this->results_per_block; $i++) { - $this->results_heap[$i][self::SCORE] = 0; - } - $this->initializeTermsHeap(); - for ($i = 0; $i < $this->num_iterators; $i++) { - $this->index_bundle_iterators[$i]->setResultsPerBlock(1); - $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; - $this->seen_docs += $this->index_bundle_iterators[$i]->seen_docs; + $num_smaller = array_fill(0, $num_iterators, 0); + for ($i = 0; $i < $num_iterators; $i++) { + $index_bundle_iterators[$i]->setResultsPerBlock(1); + $num_docs = $index_bundle_iterators[$i]->num_docs; + $this->num_docs += $num_docs; + for ($j = 0; $j < $i; $j++) { + if ($num_docs < $index_bundle_iterators[$j]->num_docs) { + $num_smaller[$j]++; + } + } + $this->seen_docs += $index_bundle_iterators[$i]->seen_docs; if (isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)){ $this->seen_docs_unfiltered += $this->index_bundle_iterators[$i]->seen_docs_unfiltered; @@ -138,11 +119,17 @@ class UnionIterator extends IndexBundleIterator $this->seen_docs_unfiltered += $this->seen_docs; } } - $doc_block = $this->currentDocsWithWord(); + asort($num_smaller); + $i = 0; + foreach ($num_smaller as $index => $count) { + $this->index_bundle_iterators[$i] = + $index_bundle_iterators[$index]; + $i++; + } } /** * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING - * depending on the direction in which this iterator ttraverse the + * depending on the direction in which this iterator traverse the * underlying index archive bundle. * * @return int direction traversing underlying archive bundle @@ -167,31 +154,6 @@ class UnionIterator extends IndexBundleIterator $this->seen_docs_unfiltered = 0; $doc_block = $this->currentDocsWithWord(); } - /** - * Calculates the total relevance score of the result document - * - * @param array $heap of terms - * @param int $relevance_score previously calculated relevance score - * @return array - */ - public function getDocScore($heap, $relevance_score = 0) - { - $d = $this->currentGenDocOffsetWithWord(); - $doc = []; - while ($d != -1 && !empty($heap) && $heap[0][self::NEXT_DOC] === $d) { - $iterator_idx = $heap[0][self::ITERATOR]; - $iterator = $this->index_bundle_iterators[$iterator_idx]; - $docs = $iterator->findDocsWithWord(); - if (is_array($docs) && count($docs) == 1) { - $keys = array_keys($docs); - $doc = $docs[$keys[0]]; - $relevance_score += $doc[self::RELEVANCE]; - } - array_splice($heap, 0, 1); - $this->heapifyDown($heap, true); - } - return [$doc, $relevance_score]; - } /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached @@ -202,185 +164,118 @@ class UnionIterator extends IndexBundleIterator { $pages = []; $found_docs = false; - list($doc, $relevance_score) = $this->getDocScore($this->terms_heap); - if (!empty($this->low_scoring_terms)) { - list($doc_copy, $additional_score) = - $this->getDocScore($this->low_scoring_terms, $relevance_score); - $relevance_score += $additional_score; - } - if (!empty($doc) && $relevance_score > - $this->results_heap[0][self::SCORE]) { - // Update the document's scores - $doc[self::RELEVANCE] = $relevance_score; - $doc[self::SCORE] = $relevance_score + $doc[self::DOC_RANK]; - $found_docs = true; - $this->results_heap[0] = [self::DOC_INFO => $doc, - self::SCORE => $doc[self::SCORE]]; - $this->heapifyDown($this->results_heap, false); - } - $found_top_results = $this->results_heap[0][self::SCORE] > 0; - if (!$found_docs || $found_top_results) { - $pages = ($this->results_heap[ - $this->results_per_block - 1][self::SCORE] == 0) ? - -1 : $this->getResultsHeap(); - } else { - $pages = [$doc]; - } - $this->pages = $pages; - if (is_array($pages)) { - $this->count_block_unfiltered = count($pages); - $this->count_block = count($pages); + $num_iterators = $this->num_iterators; + $iterators = $this->index_bundle_iterators; + $this->count_block_unfiltered = 0; + $direction = $this->getDirection(); + $max_accumulators = intval($this->results_per_block); + $to_accumulators = []; + for ($i = 0; $i < $num_iterators; $i++) { + $from_accumulators = $to_accumulators; + $max_in_pos = count($from_accumulators); + $to_accumulators = []; + $quota_left = $max_accumulators - $max_in_pos; + $iterator = $iterators[$i]; + if ($quota_left == 0) { + for ($j = 0; $j < $max_accumulators; $j++) { + $current_gen_doc_offset = $from_accumulators[$j][ + self::GEN_OFFSET]; + $iterator_offset = $iterator->currentGenDocOffsetWithWord(); + if ($this->genDocOffsetCmp($iterator_offset, + $current_gen_doc_offset, $direction) < 0) { + $iterator->advance($current_gen_doc_offset); + $this->count_block_unfiltered++; + } + if( ($iterator_offset = + $iterator->currentGenDocOffsetWithWord()) == -1) { + break; + } + $to_accumulators[$j] = $from_accumulators[$j]; + $cmp = $this->genDocOffsetCmp($iterator_offset, + $current_gen_doc_offset, $direction); + if ($cmp == 0) { + $docs = $iterator->findDocsWithWord(); + if (is_array($docs) && count($docs) == 1) { + $keys = array_keys($docs); + $doc = $docs[$keys[0]]; + $to_accumulators[$j][self::RELEVANCE] += + $doc[self::RELEVANCE]; + } + } + } + for ($k = $j; $k < $max_accumulators; $k++) { + $to_accumulators[$k] = $from_accumulators[$k]; + } + } else { + $in_pos = 0; + $out_pos = 0; + while ($out_pos < $max_accumulators) { + if(($iterator_offset = + $iterator->currentGenDocOffsetWithWord()) == -1) { + break; + } + if ($in_pos < $max_in_pos) { + $current_gen_doc_offset = $from_accumulators[$in_pos][ + self::GEN_OFFSET]; + $cmp = $this->genDocOffsetCmp($iterator_offset, + $current_gen_doc_offset, $direction); + } else { + $cmp = -1; + } + if ($cmp < 0) { + $remaining_in_accumulator = + $max_in_pos - $in_pos - 1; + if ($max_accumulators - $out_pos + > $remaining_in_accumulator) { + $docs = $iterator->findDocsWithWord(); + if (is_array($docs) && count($docs) == 1) { + $keys = array_keys($docs); + $doc = $docs[$keys[0]]; + $to_accumulators[$out_pos] = $doc; + $to_accumulators[$out_pos][self::GEN_OFFSET] = + $iterator_offset; + $out_pos++; + } + $iterator->advance(); + } else { + $to_accumulators[$out_pos++] = + $from_accumulators[$in_pos++]; + } + } else if($cmp == 0) { + $to_accumulators[$out_pos] = + $from_accumulators[$in_pos]; + $docs = $iterator->findDocsWithWord(); + if (is_array($docs) && count($docs) == 1) { + $keys = array_keys($docs); + $doc = $docs[$keys[0]]; + $to_accumulators[$out_pos][self::RELEVANCE] += + $doc[self::RELEVANCE]; + } + $out_pos++; + $in_pos++; + } else { + $to_accumulators[$out_pos++] = + $from_accumulators[$in_pos++]; + } + $this->count_block_unfiltered++; + } + } } - return $pages; - } - /** - * Gets the docs in the results min heap sorted in - * descending order by score - * - * @return mixed array of result docs if any, -1 otherwise - */ - public function getResultsHeap() - { $pages = []; - while (!empty($this->results_heap)) { - $doc = $this->extractMinScoringDoc($this->results_heap); - if ($doc[self::SCORE] > 0) { - array_unshift($pages, $doc[self::DOC_INFO]); + foreach ($to_accumulators as $accumulator) { + if (!empty($accumulator[self::KEY])) { + $accumulator[self::SCORE] = $accumulator[self::DOC_RANK] + + $accumulator[self::RELEVANCE]; + $pages[$accumulator[self::KEY]] = $accumulator; } } - // Re-initialize the results heap for the next set of docs - $initial_heap_item = [self::SCORE => 0, self::DOC_INFO => null]; - $this->results_heap = array_fill(0, $this->results_per_block, - $initial_heap_item); if (empty($pages)) { - $pages = -1; + return -1; } + $this->pages = $pages; + $this->count_block = count($pages); return $pages; } - /** - * Compare between elements for heapify operations - * - * @param array $i first element - * @param array $j second element - * @param boolean $is_terms_heap basis for comparison - * @return boolean result of comparison - */ - public function compareElements($i, $j, $is_terms_heap) - { - $is_ascending = $this->getDirection(); - if (!$is_terms_heap) { - return $i[self::SCORE] > $j[self::SCORE]; - } - $i_next_doc = $i[self::NEXT_DOC]; - $j_next_doc = $j[self::NEXT_DOC]; - if ($is_ascending) { - if ($i_next_doc == -1) { - return true; - } else if ($j[self::NEXT_DOC] == -1) { - return false; - } - return $i_next_doc[0] > $j_next_doc[0] || - ($i_next_doc[0] == $j_next_doc[0] && - $i_next_doc[1] > $j_next_doc[1]); - } else { - return $j_next_doc[0] > $i_next_doc[0] || - ($j_next_doc[0] == $i_next_doc[0] && - $j_next_doc[1] > $i_next_doc[1]); - } - } - /** - * Performs reheap using bubble-down operation - * - * @param array $heap to be reheaped - * @param boolean $is_terms_heap to check comparison condition - */ - public function heapifyDown(&$heap, $is_terms_heap) - { - $index = 0; - $heap_size = count($heap); - while ($index < $heap_size) { - $left = ($index << 1) + 1; - $right = ($index + 1) << 1; - $least_doc = $index; - if ($left < $heap_size && - $this->compareElements($heap[$least_doc], $heap[$left], - $is_terms_heap)) { - $least_doc = $left; - } - if ($right < $heap_size && - $this->compareElements($heap[$least_doc], $heap[$right], - $is_terms_heap)) { - $least_doc = $right; - } - if ($least_doc != $index) { - $temp_doc = $heap[$least_doc]; - $heap[$least_doc] = $heap[$index]; - $heap[$index] = $temp_doc; - $index = $least_doc; - } else { - break; - } - } - } - /** - * Performs reheap using bubble-up operation - * - * @param array $heap to be reheaped - * @param boolean $is_terms_heap to check comparison condition - */ - public function heapifyUp(&$heap, $is_terms_heap) - { - $index = count($heap) - 1; - while ($index > 0) { - $parent_index = ($index - 1) >> 1; - if ($this->compareElements($heap[$index], $heap[$parent_index], - $is_terms_heap)) { - break; - } - $temp_doc = $heap[$parent_index]; - $heap[$parent_index] = $heap[$index]; - $heap[$index] = $temp_doc; - $index = $parent_index; - } - } - /** - * Gets the lowest-scoring document in the min heap of result documents. - * - * @param array $heap of result docs - * @return object lowest-scoring document - */ - public function extractMinScoringDoc(&$heap) - { - $lowest_doc = $heap[0]; - $last_index = count($heap) - 1; - $heap[0] = $heap[$last_index]; - unset($heap[$last_index]); - $this->heapifyDown($heap, false); - return $lowest_doc; - } - /** - * This method creates a heap out of all the query terms - * associated with the nested word iterators on the current - * union iterator instance. - * - */ - public function initializeTermsHeap() - { - if (!empty($this->terms_heap)) { - return; - } - $this->terms_heap = []; - $num_iterators = $this->num_iterators; - for ($i = 0; $i < $num_iterators; $i++) { - $iterator = $this->index_bundle_iterators[$i]; - $this->terms_heap[] = [ - self::ITERATOR => $i, - self::MAX_SCORE => $iterator->getMaxScore(), - self::NEXT_DOC => $iterator->currentGenDocOffsetWithWord() - ]; - $this->heapifyUp($this->terms_heap, true); - } - } /** * Forwards the iterator one group of docs * @param array $gen_doc_offset a generation, doc_offset pair. If set, @@ -390,46 +285,17 @@ class UnionIterator extends IndexBundleIterator */ public function advance($gen_doc_offset = null) { - $this->advanceSeenDocs(); + $this->current_block_fresh = false; + $this->seen_docs += $this->count_block; $this->seen_docs_unfiltered += $this->count_block_unfiltered; - $total_num_docs = 0; - $d = $this->currentGenDocOffsetWithWord(); - $score_k = $this->results_heap[0][self::SCORE]; - while ($d != -1 && $this->terms_heap[0][self::NEXT_DOC] === $d) { - $iterator_idx = $this->terms_heap[0][self::ITERATOR]; - $iterator = $this->index_bundle_iterators[$iterator_idx]; - $total_num_docs += $iterator->num_docs; - $iterator->advance($gen_doc_offset); - $next_doc = $iterator->currentGenDocOffsetWithWord(); - $this->terms_heap[0][self::NEXT_DOC] = $next_doc; - if ($score_k > $this->terms_heap[0][self::MAX_SCORE]) { - $this->low_scoring_terms[] = $this->terms_heap[0]; - $this->heapifyUp($this->low_scoring_terms, true); - array_splice($this->terms_heap, 0, 1); - } - $this->heapifyDown($this->terms_heap, true); - } - $d = []; - $d[self::NEXT_DOC] = $this->currentGenDocOffsetWithWord(); - if (!empty($this->low_scoring_terms)) { - while ($d[self::NEXT_DOC] != -1 && - $this->compareElements($d, $this->low_scoring_terms[0], true)) { - $lowest_doc = $this->low_scoring_terms[0]; - $iterator_idx = $lowest_doc[self::ITERATOR]; - $iterator = $this->index_bundle_iterators[$iterator_idx]; + $this->num_docs = + floor(($this->seen_docs * $this->total_num_docs) / + $this->seen_docs_unfiltered); + if ($gen_doc_offset != null) { + foreach ($this->index_bundle_iterators as $iterator) { $iterator->advance($gen_doc_offset); - $next_doc = $iterator->currentGenDocOffsetWithWord(); - $this->low_scoring_terms[0][self::NEXT_DOC] = $next_doc; - $this->heapifyDown($this->low_scoring_terms, true); } } - if ($this->seen_docs_unfiltered > 0) { - $this->num_docs = - floor(($this->seen_docs * $total_num_docs) / - $this->seen_docs_unfiltered); - } else { - $this->num_docs = 0; - } } /** * This method is supposed to set @@ -456,22 +322,14 @@ class UnionIterator extends IndexBundleIterator * @return mixed the desired document offset and generation. */ public function currentGenDocOffsetWithWord() { - return !empty($this->terms_heap) ? - (key_exists(self::NEXT_DOC, $this->terms_heap[0]) ? - $this->terms_heap[0][self::NEXT_DOC] : -1) : -1; + $gen_doc_offset = -1; + $index_bundle_iterators = $this->index_bundle_iterators; + foreach ($index_bundle_iterators as $iterator) { + $gen_doc_offset = $iterator->currentGenDocOffsetWithWord(); + if ($gen_doc_offset != -1) { + break; + } + } + return $gen_doc_offset; } - /** - * This method calculates the max relevance value for query underlying - * the iterator to the document currently being iterated over - * by the query - * @return float maximum score for document relevance to a query - */ - public function getMaxRelevanceScore() - { - $max_relevance = 0; - foreach ($this->index_bundle_iterators as $iterator) { - $max_relevance += $iterator->getMaxRelevanceScore(); - } - return $max_score; - } } diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 7cbde614d..0477d85d1 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -687,7 +687,8 @@ class WordIterator extends IndexBundleIterator $posting[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0, $num_description_scores); if ($posting['FREQUENCY'] > 0) { - $frequency = $this->frequencyNormalizationScoring( + list($bonuses, $frequency) = + $this->frequencyScoring( $occurrences_per_doc, $posting[self::POSITION_LIST], $posting[self::DOC_LEN], @@ -696,7 +697,9 @@ class WordIterator extends IndexBundleIterator $posting["PATH_KEYWORDS_END_POS"], $posting[self::DESCRIPTION_SCORES]); // Divergence-from-randomness + preface score - $posting[self::RELEVANCE] = + $posting[self::RELEVANCE] = 2.5 * $bonuses * + log(1 + 1/max(1, $occurrences_per_doc), 2) / + ($bonuses + 1) + ((log(1 + $occurrences_per_doc, 2) + $frequency * log(1 + 1/max(1, $occurrences_per_doc), 2)) / ($frequency + 1)); @@ -721,37 +724,11 @@ class WordIterator extends IndexBundleIterator return $key_postings; } /** - * This method calculates the max relevance value for the relevance - * calculation of the term to the query - * @return float maximum score for document relevance to a query - */ - public function getMaxRelevanceScore() - { - $occurrences_per_doc = $this->num_occurrences / - max($this->total_num_docs, 1); - $max_score = 1 + log(1 + 1 / max(1, $occurrences_per_doc), 2); - return $max_score; - } - /** - * This method calculates the maximum overall score value for any document - * returned by this iterator. - * @return float maxScore - */ - public function getMaxScore() - { - if ($this->is_meta) { - return 0.01; - } - $max_score = $this->getMaxDocQualityScore() + - $this->getMaxRelevanceScore(); - return $max_score; - } - /** - * Normalizes the frequencies of a term within a document with respect to + * Computes weighted frequencies of a term within a document with respect to * the length of the document, the positions of the term with the document * and the overall importance score for a given position within the document * Also computes the score of the posting for the host keywords, - * title keywords, and path keywords. + * title keywords, and path keywords bonuses. * * @param float $occurrences_per_doc expected number of occurrence of term * per/doc. @@ -765,10 +742,9 @@ class WordIterator extends IndexBundleIterator * summary that demarks the end of the title portion of the summary * @param array $descriptions_scores boundaries and scores of different * regions with document - * @return array [normalized frequency, score for host name, title, - * and path keywords] + * @return array [score for host title path keywords bonuses, frequency] */ - public function frequencyNormalizationScoring( + public function frequencyScoring( $occurrences_per_doc, $positions, $num_words, $host_keywords_end_pos, $title_end_pos, $path_keywords_end_pos, $descriptions_scores) { @@ -793,7 +769,6 @@ class WordIterator extends IndexBundleIterator $path_bonus = $this->ranking_factors["PATH_KEYWORD_BONUS"]; $title_bonus = $this->ranking_factors["TITLE_BONUS"]; $len_term = strlen($this->word_key); - $max_doc_norm_score = $host_bonus + $path_bonus + $title_bonus + 1; $first_index = 0; $old_pos = 0; /* @@ -813,6 +788,7 @@ class WordIterator extends IndexBundleIterator ], $descriptions_scores); $num_scores = count($descriptions_scores); $weighted_frequency = 0; + $bonuses = 0; foreach ($positions as $position) { $last_index = $num_scores - 1; /* description score offsets are with respect to the description @@ -828,17 +804,21 @@ class WordIterator extends IndexBundleIterator $first_index = $mid_index; } } - $weight = $descriptions_scores[$first_index]['SCORE']; + $weight = $descriptions_scores[$first_index]['SCORE'];; $start_description_pos = $descriptions_scores[$first_index]['POS']; $len_description = ($first_index == $num_scores - 1) ? $pseudo_doc_length - $start_description_pos : $descriptions_scores[$first_index + 1]['POS'] - $start_description_pos; - $weighted_frequency += $weight * $len_term / $len_description; + $frequency_term = $weight * $len_term / $len_description; + if ($position <= 0) { + $bonuses += $frequency_term; + } else { + $weighted_frequency += $frequency_term; + } } - $frequency = ($weighted_frequency/$max_doc_norm_score) * $num_words * - $length_normalization; - return $frequency; + $frequency = $weighted_frequency * $length_normalization; + return [$bonuses, $frequency]; } /** * Updates the seen_docs count during an advance() call diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index b2b501087..e545c9ebf 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -622,8 +622,9 @@ class PhraseModel extends ParallelModel } foreach ($split_terms as $term) { if (!in_array($term, $special_words)) { - $search_terms = array_merge($search_terms, + $term = implode(" ", PhraseParser::segmentSegment($term, $locale_tag)); + $search_terms[] = $term; } } $phrase = '';