diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index 0353ea6ce..79e58dbe7 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -264,4 +264,7 @@ interface CrawlConstants const SCRAPER_INFO = 'eq'; const SEQUENCE_NUMBER = 'er'; const FETCHER_QUEUE_SERVER_RATIO = 'es'; + const NEXT_DOC = 'et'; + const ITERATOR = 'eu'; + const MAX_SCORE = 'ev'; } diff --git a/src/library/index_bundle_iterators/DisjointIterator.php b/src/library/index_bundle_iterators/DisjointIterator.php deleted file mode 100644 index 83bdebab6..000000000 --- a/src/library/index_bundle_iterators/DisjointIterator.php +++ /dev/null @@ -1,262 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009 - 2023 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. 
- * - * END LICENSE - * - * @author Chris Pollett chris@pollett.org - * @license https://www.gnu.org/licenses/ GPL3 - * @link https://www.seekquarry.com/ - * @copyright 2009 - 2023 - * @filesource - */ -namespace seekquarry\yioop\library\index_bundle_iterators; - -/** - * Used to iterate over the documents which occur in a set of disjoint iterators - * all belonging to the same index - * - * @author Chris Pollett - * @see IndexArchiveBundle - */ -class DisjointIterator extends IndexBundleIterator -{ - /** - * An array of iterators whose intersection we get documents from - * @var array - */ - public $index_bundle_iterators; - /** - * Number of elements in $this->index_bundle_iterators - * @var int - */ - public $num_iterators; - /** - * The number of iterated docs before the restriction test - * @var int - */ - public $seen_docs_unfiltered; - /** - * Index of the iterator amongst those we are disjoint unioning of - * least gen_doc_offset - * @var int - */ - public $least_offset_index; - /** - * Creates an disjoint union iterator with the given parameters. - * - * @param object $index_bundle_iterators to use as a source of documents - * to iterate over - */ - public function __construct($index_bundle_iterators) - { - $this->index_bundle_iterators = $index_bundle_iterators; - $this->num_iterators = count($index_bundle_iterators); - $this->num_docs = 0; - $this->results_per_block = 1; - /* - We take an initial guess of the num_docs we return as the sum - of the num_docs of the underlying iterators. 
We are also setting - up here that we return at most one posting at a time from each - iterator - */ - $this->seen_docs = 0; - $this->seen_docs_unfiltered = 0; - for ($i = 0; $i < $this->num_iterators; $i++) { - $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; - $this->index_bundle_iterators[$i]->setResultsPerBlock(1); - $this->seen_docs += $this->index_bundle_iterators[$i]->seen_docs; - if (isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)){ - $this->seen_docs_unfiltered += - $this->index_bundle_iterators[$i]->seen_docs_unfiltered; - } else { - $this->seen_docs_unfiltered += $this->seen_docs; - } - } - $this->leastGenDocOffsetsAmongstIterators(); - } - /** - * Returns CrawlConstants::ASCENDING or CrawlConstants::DESCENDING - * depending on the direction in which this iterator ttraverse the - * underlying index archive bundle. - * - * @return int direction traversing underlying archive bundle - */ - public function getDirection() - { - if (!empty($this->index_bundle_iterators[0])) { - return $this->index_bundle_iterators[0]->getDirection(); - } - return self::ASCENDING; - } - /** - * Returns the iterators to the first document block that it could iterate - * over - */ - public function reset() - { - for ($i = 0; $i < $this->num_iterators; $i++) { - $this->index_bundle_iterators[$i]->setResultsPerBlock(1); - $this->index_bundle_iterators[$i]->reset(); - } - - $this->seen_docs = 0; - $this->seen_docs_unfiltered = 0; - $this->leastGenDocOffsetsAmongstIterators(); - } - /** - * Hook function used by currentDocsWithWord to return the current block - * of docs if it is not cached - * - * @return mixed doc ids and rank if there are docs left, -1 otherwise - */ - public function findDocsWithWord() - { - $least_offset = $this->leastGenDocOffsetsAmongstIterators(); - if ($least_offset == -1) { - return -1; - } - //next we finish computing the score - $docs = $this->index_bundle_iterators[ - $this->least_offset_index]->currentDocsWithWord(); - 
$this->count_block = 0; - if (is_array($docs)) { - $this->count_block = count($docs); - } - $this->pages = $docs; - return $docs; - } - /** - * Gets the doc_offset and generation for the next document that - * would be return by this iterator - * - * @return mixed an array with the desired document offset - * and generation; -1 on fail - */ - public function currentGenDocOffsetWithWord() { - if ($this->num_iterators <= 0) { - return -1; - } - return $this->leastGenDocOffsetsAmongstIterators(); - } - /** - * Finds the next generation and doc offset amongst all the iterators - * that is of least value - */ - public function leastGenDocOffsetsAmongstIterators() - { - $least_gen_offset = -1; - $this->least_offset_index = 0; - $direction = $this->getDirection(); - for ($i = 0; $i < $this->num_iterators; $i++) { - $cur_gen_doc_offset = - $this->index_bundle_iterators[ - $i]->currentGenDocOffsetWithWord(); - if ($least_gen_offset == -1 && is_array($cur_gen_doc_offset)) { - $least_gen_offset = $cur_gen_doc_offset; - $this->least_offset_index = $i; - continue; - } else if ($cur_gen_doc_offset == -1) { - continue; - } - $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset, - $least_gen_offset, $direction); - if ($gen_doc_cmp < 0) { - $least_gen_offset = $cur_gen_doc_offset; - $this->least_offset_index = $i; - } - } - return $least_gen_offset; - } - /** - * Forwards the iterator one group of docs - * @param array $gen_doc_offset a generation, doc_offset pair. 
If set, - * the must be of greater than or equal generation, and if equal the - * next block must all have $doc_offsets larger than or equal to - * this value - */ - public function advance($gen_doc_offset = null) - { - $no_change = true; - //num_docs can change when advance() called so that's why we recompute - $total_num_docs = 0; - if ($gen_doc_offset !== null) { - $direction = $this->getDirection(); - for ($i = 0; $i < $this->num_iterators; $i++) { - $cur_gen_doc_offset = $this->index_bundle_iterators[ - $i]->currentGenDocOffsetWithWord(); - if ($this->genDocOffsetCmp($cur_gen_doc_offset, - $gen_doc_offset, $direction) < 0) { - if ($no_change) { - $this->current_block_fresh = false; - $this->seen_docs += 1; - $this->seen_docs_unfiltered = 0; - $no_change = false; - } - $this->seen_docs_unfiltered += - $this->index_bundle_iterators[$i]->seen_docs; - $total_num_docs += - $this->index_bundle_iterators[$i]->num_docs; - $this->index_bundle_iterators[$i]->advance($gen_doc_offset); - } - } - } else { - if (!$this->current_block_fresh) { - $this->leastGenDocOffsetsAmongstIterators(); - } - $this->current_block_fresh = false; - $this->seen_docs += 1; - $this->seen_docs_unfiltered = 0; - $least= $this->least_offset_index; - if (!isset($this->index_bundle_iterators[$least])) { - return; - } - $this->seen_docs_unfiltered += - $this->index_bundle_iterators[$least]->seen_docs; - $total_num_docs += $this->index_bundle_iterators[$least]->num_docs; - $this->index_bundle_iterators[$least]->advance(); - } - if ($this->seen_docs_unfiltered > 0) { - $this->num_docs = - floor(($this->seen_docs * $total_num_docs) / - $this->seen_docs_unfiltered); - } - } - /** - * This method is supposed to set - * the value of the result_per_block field. This field controls - * the maximum number of results that can be returned in one go by - * currentDocsWithWord(). 
This method cannot be consistently - * implemented for this iterator and expect it to behave nicely - * it this iterator is used together with union_iterator or - * intersect_iterator. So to prevent a user for doing this, calling this - * method results in a user defined error - * - * @param int $num the maximum number of results that can be returned by - * a block - */ - public function setResultsPerBlock($num) { - if ($num != 1) { - trigger_error("Cannot set the results per block of - a phrase iterator", E_USER_ERROR); - } - } -} diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php index 9b9d36b60..bb59a63a3 100755 --- a/src/library/index_bundle_iterators/DocIterator.php +++ b/src/library/index_bundle_iterators/DocIterator.php @@ -98,6 +98,12 @@ class DocIterator extends IndexBundleIterator * @var int */ public $current_offset; + /** + * How url, keywords, and title words should influence relevance + * and doc rank calculations + * @var array + */ + public $ranking_factors; /** * An array of shard docids_lens * @var array @@ -135,16 +141,17 @@ class DocIterator extends IndexBundleIterator * added. Note: this value is not saved permanently. 
So you * could in theory open two read only versions of the same bundle but * reading the results in different directions - * @param int $results_per_block the maximum number of results that can - * be returned by a findDocsWithWord call + * @param array $ranking_factors field says url being a host, cld, + * or having a lot of slashes should affect its doc rank calculations */ public function __construct($index_name, $filter = null, $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK, - $direction = self::ASCENDING) + $direction = self::ASCENDING, $ranking_factors = []) { $this->filter = $filter; $this->index_name = $index_name; $this->direction = $direction; + $this->ranking_factors = $ranking_factors; $this->index_version = IndexManager::getVersion($index_name); $index = IndexManager::getIndex($index_name, $direction); if (empty($index)) { @@ -285,9 +292,15 @@ class DocIterator extends IndexBundleIterator $doc_id = $doc_keys[$this->next_offset]; $doc_info = $doc_map_tools->unpack($doc_map[$doc_id]); $item = [self::GENERATION => $this->current_generation]; - list($item[self::DOC_LEN], $item[self::SCORE]) = + $item[self::DOC_RANK] = $this->computeDocRank($doc_id, + $this->next_offset, $this->current_generation, + $this->num_generations, $this->last_offset, + $this->last_offset, $this->last_offset, + $this->ranking_factors, $is_ascending); + list($item[self::DOC_LEN], ) = array_values(array_shift($doc_info)); - list($item['TITLE_LENGTH'], $num_description_scores) = + $item[self::SCORE] = $item[self::DOC_RANK]; + list(, $num_description_scores) = array_values(array_shift($doc_info)); $item[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0, $num_description_scores); diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php index 50c89a417..dfb27e68d 100644 --- a/src/library/index_bundle_iterators/IndexBundleIterator.php +++ b/src/library/index_bundle_iterators/IndexBundleIterator.php 
@@ -32,8 +32,12 @@ namespace seekquarry\yioop\library\index_bundle_iterators; use seekquarry\yioop\configs as C; use seekquarry\yioop\library as L; -use seekquarry\yioop\library\PhraseParser; use seekquarry\yioop\library\CrawlConstants; +use seekquarry\yioop\library\IndexDocumentBundle; +use seekquarry\yioop\library\PhraseParser; +use seekquarry\yioop\library\PartitionDocumentBundle; + + /** For toHexString and Yioop constants*/ require_once __DIR__."/../Utility.php"; @@ -99,7 +103,7 @@ abstract class IndexBundleIterator implements CrawlConstants * next block must all have $doc_offsets larger than or equal to * this value */ - abstract function advance($gen_doc_offset = null); + public abstract function advance($gen_doc_offset = null); /** * Gets the doc_offset and generation for the next document that * would be return by this iterator @@ -107,14 +111,47 @@ abstract class IndexBundleIterator implements CrawlConstants * @return mixed an array with the desired document offset * and generation; -1 on fail */ - abstract function currentGenDocOffsetWithWord(); + public abstract function currentGenDocOffsetWithWord(); /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached * * @return mixed doc ids and score if there are docs left, -1 otherwise */ - abstract function findDocsWithWord(); + public abstract function findDocsWithWord(); + /** + * This method calculates the max score value for the Doc Quality + * calculation (query independent) for an document returned by this + * iterator. Currently, for all documents we have coded a maximum + * DOC_RANK of 5 (based on code in @see computeDocRank ). This will + * likely need to be revisited in the future. 
+ * + * @return float maximum score for document quality + */ + public function getMaxDocQualityScore() + { + return 5; + } + /** + * This method calculates the max relevance value for query underlying + * the iterator to the document currently being iterated over + * by the query + * @return float maximum score for document relevance to a query + */ + public function getMaxRelevanceScore() + { + return 0.01; + } + /** + * This method calculates the maximum overall score value for any document + * returned by this iterator. It should be overridden in subclasses as + * makes sense + * @return float maximum score + */ + public function getMaxScore() + { + return $this->getMaxDocQualityScore() + $this->getMaxRelevanceScore(); + } /** * Returns a string representation of a plan by which the current iterator * finds its results @@ -281,4 +318,85 @@ abstract class IndexBundleIterator implements CrawlConstants { $this->results_per_block = $num; } + /** + * Computes the query-independent document quality score (DOC_RANK) of a + * document from its position in the index plus bonuses read off its doc key + * + * @param string $doc_key doc key of the document to score + * @param int $doc_map_index position of the document within its partition + * @param int $num_seen_partitions partition count used together with + * $number_of_partitions to determine how many partitions remain + * @param int $number_of_partitions number of partitions in the index + * @param int $num_doc_keys number of doc keys in the document's partition + * @param int $avg_items_per_partition average items per partition used to + * normalize the remaining-partitions term + * @param int $max_items_per_partition items per partition used to normalize + * the position-within-partition term + * @param array $ranking_factors optional CLD_URL_BONUS, HOST_URL_BONUS, + * WIKI_BONUS, NUM_SLASHES_BONUS weights; each defaults to 1 if unset + * @param bool $is_ascending true if positions are measured for an index + * traversed in ascending order + * @return float quality score scaled by getMaxDocQualityScore() + */ + public function computeDocRank($doc_key, $doc_map_index = 1, + $num_seen_partitions = 0, $number_of_partitions = 1, + $num_doc_keys = PartitionDocumentBundle::MAX_ITEMS_PER_FILE, + $avg_items_per_partition = PartitionDocumentBundle::MAX_ITEMS_PER_FILE, + $max_items_per_partition = PartitionDocumentBundle::MAX_ITEMS_PER_FILE, + $ranking_factors = [], $is_ascending = true) + { + /* + DOC_RANK computes a document quality measure + either based on time item was added (freshness) or a + sum of signals (how early or late it was added to index), + whether the url was a CLD or HOST, whether page was a wiki + page, and the number of slashes in the url path + */ + $cld_bonus = $ranking_factors["CLD_URL_BONUS"] ?? 1; + $host_bonus = $ranking_factors["HOST_URL_BONUS"] ?? 1; + $wiki_bonus = $ranking_factors["WIKI_BONUS"] ?? 1; + $num_slashes_bonus = $ranking_factors["NUM_SLASHES_BONUS"] ?? 
1; + $max_pre_rank_and_bonuses = $cld_bonus + $host_bonus + + $wiki_bonus + $wiki_bonus + 1; + $last_partition_pos = ($is_ascending) ? + $num_doc_keys - $doc_map_index : + $doc_map_index; + $remaining_partitions = ($is_ascending) ? + $number_of_partitions - $num_seen_partitions : + $num_seen_partitions - 1; + $pre_rank_and_bonuses = ($remaining_partitions * + $avg_items_per_partition)/ + (($number_of_partitions + 1) * + ($avg_items_per_partition + 1)) + + $last_partition_pos / $max_items_per_partition; + if (IndexDocumentBundle::isAHostDocId($doc_key)) { + $pre_rank_and_bonuses += + (IndexDocumentBundle::isACldDocId($doc_key)) ? + $cld_bonus : $host_bonus; + } + /** + * For backward compatibility: new bonuses should only be added + * for doc_ids following the new letter_code format. Since all + * old formats use letters (b, t, etc.) to denote the doc + * type, the ASCII values for these letters are all > 96 (i.e., + * bits 6 and 7 of the doc_id's 9th byte are both true). + * Since all new letter_code formats use bits 4, 5, 6, 7 to + * represent the doc type as int values mapped between 0-8, + * there is no value in a doc_id's 9th byte that can have both + * bits 6 and 7 set to true. + * This difference can be used to check whether $doc_key follows + * the old or new letter_code format. + */ + $doc_id_format = ord($doc_key[ + IndexDocumentBundle::DOCID_PART_LEN << 1] ?? 
0) & 96; + if ($doc_id_format != 96) { + if (IndexDocumentBundle::isAWikipediaPage($doc_key)) { + $pre_rank_and_bonuses += $wiki_bonus; + } + $pre_rank_and_bonuses += $num_slashes_bonus / + (IndexDocumentBundle::findNumSlashes($doc_key) + 1); + } + return $this->getMaxDocQualityScore() * + $pre_rank_and_bonuses / $max_pre_rank_and_bonuses; + } } diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index e553da3d5..2acfd225f 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -546,16 +546,17 @@ class IntersectIterator extends IndexBundleIterator } } /** - * Returns the sum of maxScores for nested WordIterators - * - * @return int maxScore + * This method calculates the max relevance value for query underlying + * the iterator to the document currently being iterated over + * by the query + * @return float maximum score for document relevance to a query */ - public function getMaxScore() + public function getMaxRelevanceScore() { - $maxScore = 0; + $max_relevance = 0; foreach ($this->index_bundle_iterators as $iterator) { - $maxScore += $iterator->getMaxScore(); + $max_relevance += $iterator->getMaxRelevanceScore(); } - return $maxScore; + return $max_relevance; } } diff --git a/src/library/index_bundle_iterators/NegationIterator.php b/src/library/index_bundle_iterators/NegationIterator.php index aef26eb6f..8d8c7c772 100644 --- a/src/library/index_bundle_iterators/NegationIterator.php +++ b/src/library/index_bundle_iterators/NegationIterator.php @@ -119,7 +119,7 @@ class NegationIterator extends IndexBundleIterator //we get intersect docs one at a time so should be only one $keys = array_keys($docs); $key = $keys[0]; - $docs[$key][self::RELEVANCE] = 1; + $docs[$key][self::RELEVANCE] = 0.01; $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] + $docs[$key][self::RELEVANCE]; } diff --git 
a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php index f48518c2c..f966d3f8c 100644 --- a/src/library/index_bundle_iterators/UnionIterator.php +++ b/src/library/index_bundle_iterators/UnionIterator.php @@ -92,30 +92,6 @@ class UnionIterator extends IndexBundleIterator * @var array */ public $results_heap; - /** - * Heap constant to track the next occurrence of the term on a consituent - * iterator - */ - const NEXT_DOC = 'NEXT_DOC'; - /** - * Heap constant to track the index of a consituent iterator on - * $index_bundle_iterators - */ - const ITERATOR = 'ITERATOR'; - /** - * Heap constant to track the MaxScore of the term on a consituent - * iterator - */ - const MAX_SCORE = 'MAX_SCORE'; - /** - * Heap constant to track the doc fetched by a consituent iterator - */ - const DOC = 'DOC'; - /** - * Heap constant to track the score of a doc fetched by a consituent - * iterator - */ - const DOC_SCORE = 'DOC_SCORE'; /** * Creates a union iterator with the given parameters. 
* @@ -142,7 +118,7 @@ class UnionIterator extends IndexBundleIterator $this->total_num_docs = $total_num_docs; $this->low_scoring_terms = []; for ($i = 0; $i < self::RESULTS_PER_BLOCK; $i++) { - $this->results_heap[$i][self::DOC_SCORE] = 0; + $this->results_heap[$i][self::SCORE] = 0; } $this->initializeTermsHeap($this->terms_heap); for ($i = 0; $i < $this->num_iterators; $i++) { @@ -234,20 +210,19 @@ class UnionIterator extends IndexBundleIterator } if (!empty($doc) && $relevance_score > - $this->results_heap[0][self::DOC_SCORE]) { + $this->results_heap[0][self::SCORE]) { // Update the document's scores $doc[self::RELEVANCE] = $relevance_score; - $score = $relevance_score + $doc[self::DOC_RANK]; - $doc[self::SCORE] = $score; + $doc[self::SCORE] = $relevance_score + $doc[self::DOC_RANK]; $found_docs = true; - $this->results_heap[0][self::DOC] = $doc; - $this->results_heap[0][self::DOC_SCORE] = $score; + $this->results_heap[0] = [self::DOC_INFO => $doc, + self::SCORE => $doc[self::SCORE]]; $this->heapifyDown($this->results_heap, false); } - $found_top_results = $this->results_heap[0][self::DOC_SCORE] > 0; + $found_top_results = $this->results_heap[0][self::SCORE] > 0; if (!$found_docs || $found_top_results) { - $pages = ($this-> - results_heap[self::RESULTS_PER_BLOCK-1][self::DOC_SCORE] == 0) ? + $pages = ($this->results_heap[ + self::RESULTS_PER_BLOCK - 1][self::SCORE] == 0) ? 
-1 : $this->getResultsHeap(); } else { $pages = [$doc]; @@ -270,15 +245,14 @@ class UnionIterator extends IndexBundleIterator $pages = []; while (!empty($this->results_heap)) { $doc = $this->extractMinScoringDoc($this->results_heap); - if ($doc[self::DOC_SCORE] > 0) { - array_unshift($pages, $doc[self::DOC]); + if ($doc[self::SCORE] > 0) { + array_unshift($pages, $doc[self::DOC_INFO]); } } // Re-initialize the results heap for the next set of docs - $this->results_heap = []; - for ($i = 0; $i < self::RESULTS_PER_BLOCK; $i++) { - $this->results_heap[$i][self::DOC_SCORE] = 0; - } + $initial_heap_item = [self::SCORE => 0, self::DOC_INFO => null]; + $this->results_heap = array_fill(0, self::RESULTS_PER_BLOCK, + $initial_heap_item); if (empty($pages)) { $pages = -1; } @@ -296,21 +270,23 @@ class UnionIterator extends IndexBundleIterator { $is_ascending = $this->getDirection(); if (!$is_terms_heap) { - return $i[self::DOC_SCORE] > $j[self::DOC_SCORE]; + return $i[self::SCORE] > $j[self::SCORE]; } + $i_next_doc = $i[self::NEXT_DOC]; + $j_next_doc = $j[self::NEXT_DOC]; if ($is_ascending) { - if ($i[self::NEXT_DOC] == -1) { + if ($i_next_doc == -1) { return true; } else if ($j[self::NEXT_DOC] == -1) { return false; } - return $i[self::NEXT_DOC][0] > $j[self::NEXT_DOC][0] || - ($i[self::NEXT_DOC][0] == $j[self::NEXT_DOC][0] && - $i[self::NEXT_DOC][1] > $j[self::NEXT_DOC][1]); + return $i_next_doc[0] > $j_next_doc[0] || + ($i_next_doc[0] == $j_next_doc[0] && + $i_next_doc[1] > $j_next_doc[1]); } else { - return $j[self::NEXT_DOC][0] > $i[self::NEXT_DOC][0] || - ($j[self::NEXT_DOC][0] == $i[self::NEXT_DOC][0] && - $j[self::NEXT_DOC][1] > $i[self::NEXT_DOC][1]); + return $j_next_doc[0] > $i_next_doc[0] || + ($j_next_doc[0] == $i_next_doc[0] && + $j_next_doc[1] > $i_next_doc[1]); } } /** @@ -324,8 +300,8 @@ class UnionIterator extends IndexBundleIterator $index = 0; $heap_size = count($heap); while ($index < $heap_size) { - $left = $index * 2 + 1; - $right = $index * 2 + 2; + 
$left = ($index << 1) + 1; + $right = ($index + 1) << 1; $least_doc = $index; if ($left < $heap_size && $this->compareElements($heap[$least_doc], $heap[$left], @@ -357,7 +333,7 @@ class UnionIterator extends IndexBundleIterator { $index = count($heap) - 1; while ($index > 0) { - $parent_index = floor(($index - 1) / 2); + $parent_index = ($index - 1) >> 1; if ($this->compareElements($heap[$index], $heap[$parent_index], $is_terms_heap)) { break; @@ -395,14 +371,13 @@ class UnionIterator extends IndexBundleIterator if (!empty($terms)) { return; } - for ($i = 0; $i < $this->num_iterators; $i++) { + $num_iterators = $this->num_iterators; + for ($i = 0; $i < $num_iterators; $i++) { $iterator = $this->index_bundle_iterators[$i]; - $max_score = $iterator->getMaxScore(); - $position = $iterator->currentGenDocOffsetWithWord(); $terms[] = [ self::ITERATOR => $i, - self::MAX_SCORE => $max_score, - self::NEXT_DOC => $position + self::MAX_SCORE => $iterator->getMaxScore(), + self::NEXT_DOC => $iterator->currentGenDocOffsetWithWord() ]; $this->heapifyUp($terms, true); } @@ -420,7 +395,7 @@ class UnionIterator extends IndexBundleIterator $this->seen_docs_unfiltered += $this->count_block_unfiltered; $total_num_docs = 0; $d = $this->currentGenDocOffsetWithWord(); - $score_k = $this->results_heap[0][self::DOC_SCORE]; + $score_k = $this->results_heap[0][self::SCORE]; while ($d != -1 && $this->terms_heap[0][self::NEXT_DOC] === $d) { $iterator_idx = $this->terms_heap[0][self::ITERATOR]; $iterator = $this->index_bundle_iterators[$iterator_idx]; @@ -486,4 +461,18 @@ class UnionIterator extends IndexBundleIterator (key_exists(self::NEXT_DOC, $this->terms_heap[0]) ? 
$this->terms_heap[0][self::NEXT_DOC] : -1) : -1; } + /** + * This method calculates the max relevance value for query underlying + * the iterator to the document currently being iterated over + * by the query + * @return float maximum score for document relevance to a query + */ + public function getMaxRelevanceScore() + { + $max_relevance = 0; + foreach ($this->index_bundle_iterators as $iterator) { + $max_relevance += $iterator->getMaxRelevanceScore(); + } + return $max_relevance; + } } diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 3592c8250..746934639 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -574,8 +574,8 @@ class WordIterator extends IndexBundleIterator IndexManager::lookupLatestVersionPage($url_hash, $this->index_name); if ($latest_version_info != null) { - $latest_partition = $latest_version_info[0]; - $latest_posting = $latest_version_info[1]; + list($latest_partition, $latest_posting) = + $latest_version_info; /** * Ensure that the discovered latest version * isn't the same as the current posting. @@ -584,7 +584,7 @@ class WordIterator extends IndexBundleIterator $latest_posting['DOC_MAP_INDEX'] != $doc_map_index) { $latest_base_folder = $index-> - getPartitionBaseFolder($latest_partition); + getPartitionBaseFolder($latest_partition); $latest_doc_map_filename = $latest_base_folder . "/" . 
IndexDocumentBundle::DOC_MAP_FILENAME; $latest_doc_map_index = @@ -670,53 +670,11 @@ class WordIterator extends IndexBundleIterator (max(1, $time - $original_score)) * $this->getMaxDocQualityScore(); } else { - $cld_bonus = $this->ranking_factors["CLD_URL_BONUS"]; - $host_bonus = $this->ranking_factors["HOST_URL_BONUS"]; - $wiki_bonus = $this->ranking_factors["WIKI_BONUS"]; - $num_slashes_bonus = - $this->ranking_factors["NUM_SLASHES_BONUS"]; - $max_pre_rank_and_bonuses = $cld_bonus + $host_bonus + - $wiki_bonus + $wiki_bonus + 1; - $last_partition_pos = ($is_ascending) ? - $num_doc_keys - $doc_map_index : - $doc_map_index; - $remaining_partitions = ($is_ascending) ? - $number_of_partitions - $num_seen_partitions : - $num_seen_partitions - 1; - $pre_rank_and_bonuses = ($remaining_partitions * - $this->avg_items_per_partition)/ - (($number_of_partitions + 1) * - ($this->avg_items_per_partition + 1)) + - $last_partition_pos / $this->max_items_per_partition; - if (IndexDocumentBundle::isAHostDocId($doc_key)) { - $pre_rank_and_bonuses += - (IndexDocumentBundle::isACldDocId($doc_key)) ? - $cld_bonus : $host_bonus; - } - /** - * For backward compatibility: new bonuses should only be added - * for doc_ids following the new letter_code format. Since all - * old formats use letters (b, t, etc.) to denote the doc - * type, the ASCII values for these letters are all > 96 (i.e., - * bits 6 and 7 of the doc_id's 9th byte are both true). - * Since all new letter_code formats use bits 4, 5, 6, 7 to - * represent the doc type as int values mapped between 0-8, - * there is no value in a doc_id's 9th byte that can have both - * bits 6 and 7 set to true. - * This difference can be used to check whether $doc_key follows - * the old or new letter_code format. - */ - $doc_id_format = ord($doc_key[ - IndexDocumentBundle::DOCID_PART_LEN << 1] ?? 
0) & 96; - if ($doc_id_format != 96) { - if (IndexDocumentBundle::isAWikipediaPage($doc_key)) { - $pre_rank_and_bonuses += $wiki_bonus; - } - $pre_rank_and_bonuses += $num_slashes_bonus / - (IndexDocumentBundle::findNumSlashes($doc_key) + 1); - } - $posting[self::DOC_RANK] = $this->getMaxDocQualityScore() * - $pre_rank_and_bonuses / $max_pre_rank_and_bonuses; + $posting[self::DOC_RANK] = $this->computeDocRank($doc_key, + $doc_map_index, $num_seen_partitions, $number_of_partitions, + $num_doc_keys, $this->avg_items_per_partition, + $this->max_items_per_partition, + $this->ranking_factors, $is_ascending); } list($preface_positions, $num_description_scores) = array_values(array_shift($doc_info)); @@ -763,30 +721,20 @@ class WordIterator extends IndexBundleIterator return $key_postings; } /** - * This method calculates the maxScore value for the relevance calculation - * of the term to the query + * This method calculates the max relevance value for the relevance + * calculation of the term to the query * @return float maximum score for document relevance to a query */ public function getMaxRelevanceScore() { $occurrences_per_doc = $this->num_occurrences / max($this->total_num_docs, 1); - $max_score = 1 + log(1 + 1/max(1, $occurrences_per_doc), 2); - return $max_score; - } - /** - * This method calculates the maxScore value for the Doc Quality calculation - * for a document and a query - * @return float maximum score for document quality - */ - public function getMaxDocQualityScore() - { - $max_score = 5; + $max_score = 1 + log(1 + 1 / max(1, $occurrences_per_doc), 2); return $max_score; } /** - * This method calculates the maxScore value for the Doc Quality calculation - * for a document and a query + * This method calculates the maximum overall score value for any document + * returned by this iterator. 
* @return float maxScore */ public function getMaxScore() diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index d55566c25..6eb1b2e16 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -1905,7 +1905,7 @@ class PhraseModel extends ParallelModel } $word_iterators[$i] = new I\DocIterator( $actual_index_name, $filter, $to_retrieve, - $direction); + $direction, $ranking_factors); $min_group_override = true; } else { $distinct_key = $distinct_word_keys[$i];