viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/configs/Config.php | |
src/library/IndexDocumentBundle.php | |
src/library/IndexManager.php | |
src/library/index_bundle_iterators/WordIterator.php | |
diff --git a/src/configs/Config.php b/src/configs/Config.php index 537b25f1b..bf1350042 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -951,11 +951,12 @@ nsconddefine('MAX_URL_LEN', 2048); nsdefine('PAGE_RANGE_REQUEST', 1000000); /** * When getting information from an index dictionary in word iterator - * how many distinct generations to read in in one go + * for a version < 3 index how many distinct generations to read in in one go */ nsconddefine('NUM_DISTINCT_GENERATIONS', 20); /** - * Used in computing the DOC_RANK when a going through index in descending + * Used in computing the DOC_RANK for version < 3 indexes when a going + * through index in descending * fashion. It represents an upper bound on the maximum number of * generations an IndexArchiveBundle should have */ diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index a40ece3e9..d77492ccf 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -1456,10 +1456,8 @@ class IndexDocumentBundle implements CrawlConstants * dictionary * @param int $threshold after the number of results exceeds this amount * stop looking for more dictionary entries. - * @param int $start_generation what generation in the index to start - * finding occurrence of phrase from - * @param int $num_distinct_generations from $start_generation how - * many generation to search forward to + * @param int $offset + * @param int $num_partitions * @param bool $with_remaining_total whether to total number of * postings found as well or not * @return array either [total, sequence of four tuples] diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index b6e80ef94..24a2d90cf 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -326,59 +326,6 @@ class IndexManager implements CrawlConstants } return ($with_remaining_total) ? 
[$total, $info] : $info; } - /** - * Returns the number of document that a given term or phrase appears in - * in the given index where we discount later generation -- those with - * lower document rank more - * - * @param string $term what to look up in the indexes dictionary - * no mask is used for this look up - * @param string $index_name index to look up term or phrase in - * @param boolean $discount_terms whether terms should be discounted - * based on their generation or not - * @return int number of documents - */ - public static function discountedNumDocsTerm($term, $index_name, - $discount_terms = true) - { - static $num_docs_cache = []; - if (isset($num_docs_cache[$index_name][$term])) { - return $num_docs_cache[$index_name][$term]; - } - $version = self::getVersion($index_name); - $term_id = $discount_terms ? (($version > 2) ? canonicalTerm($term) : - crawlHashWord($term, true)) : $term; - $word_info = self::getWordInfo($index_name, $term_id, -1, 0, - C\NUM_DISTINCT_GENERATIONS); - if ($version >= 3 && !empty($word_info)) { - $word_info = $word_info['ROWS']; - } - if (empty($word_info)) { - return 0.0; - } - $total = 0.0; - $i = 1; - foreach ($word_info as $generation_info) { - if ($version < 3) { - list($generation, , , $num_docs) = $generation_info; - } else { - $generation = $generation_info['PARTITION']; - $num_docs = $generation_info['NUM_DOCS']; - } - $discount = $discount_terms ? 
max($generation + 1, $i++) : 1; - $total += $num_docs / $discount; - } - if (count($num_docs_cache) > 1000) { - $num_docs_cache = []; - } - if (!empty($num_docs_cache[$index_name]) && - count($num_docs_cache[$index_name]) > 10000) { - $num_docs_cache[$index_name] = []; - } - $num_docs_cache[$index_name][$term] = $total; - return $total; - } - /** * Finds posting info related to the most recent version * of a URL in the given index diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 65bb718e6..8d89c1a9d 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -362,8 +362,8 @@ class WordIterator extends IndexBundleIterator $this->empty = ($this->num_generations == 0); } $this->term_info_computed = true; - $this->no_more_generations = ($this->num_generations < - C\NUM_DISTINCT_GENERATIONS); + $this->no_more_generations = $this->index_version >= 3 + || count($info) < C\NUM_DISTINCT_GENERATIONS; } /** * Hook function used by currentDocsWithWord to return the current block @@ -1065,12 +1065,7 @@ class WordIterator extends IndexBundleIterator $index_info = IndexManager::getWordInfo($this->index_name, $this->word_key, 0, $this->num_generations, C\NUM_DISTINCT_GENERATIONS, true); - if ($this->index_version < 3) { - list($estimated_remaining_total, $info) = $index_info; - } else { - $estimated_remaining_total = $index_info['TOTAL_COUNT']; - $info = $index_info["ROWS"]; - } + list($estimated_remaining_total, $info) = $index_info; if (count($info) > 0) { $this->num_docs = $this->seen_docs + $estimated_remaining_total; @@ -1078,8 +1073,8 @@ class WordIterator extends IndexBundleIterator $this->dictionary_info = array_merge( $this->dictionary_info, array_values($info)); $this->num_generations = count($this->dictionary_info); - $this->no_more_generations = - count($info) < C\NUM_DISTINCT_GENERATIONS; + $this->no_more_generations = 
$this->index_version >= 3 + || count($info) < C\NUM_DISTINCT_GENERATIONS; //will increment back to where were next loop if ($is_ascending) { $this->generation_pointer--;