viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 183b375a8..7e298c674 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -433,11 +433,13 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants if ($start_record < 0 || $record['PARTITION'] >= $start_record) { echo "RECORD: $i\n"; echo "PARTITION: {$record['PARTITION']}\n"; - echo "NUMBER OF DOCS: {$record['NUM_DOCS']}\n\n"; + echo "NUMBER OF DOCS: {$record['NUM_DOCS']}\n"; $postings_offset = (empty($record['POSTINGS_OFFSET'])) ? -1: $record['POSTINGS_OFFSET']; + echo "POSTINGS_OFFSET: $postings_offset\n"; $postings_len = (empty($record['POSTINGS_LEN']))? -1 : $record['POSTINGS_LEN']; + echo "POSTINGS_LEN: $postings_len\n\n"; $is_postings_array = isset($record['POSTINGS']) && is_array($record['POSTINGS']); if ($postings_offset == -1 && !$is_postings_array) { @@ -747,7 +749,7 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants return $name; } /** - * Outputs tot the terminal if the bloom filter $filter_path contains + * Outputs to the terminal if the bloom filter $filter_path contains * the string $item * @param string $filter_path name of bloom filter file to check if * contains item @@ -847,7 +849,6 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants $description = ($alternate_description) ? $alternate_description : "Description: " . $info['DESCRIPTION']; echo "$description\n"; - var_dump($info); if (!$only_crawl_params) { $num_partitions = $info['SAVE_PARTITION'] + 1; echo "Number of partitions: $num_partitions \n"; diff --git a/src/library/FeedDocumentBundle.php b/src/library/FeedDocumentBundle.php index 6b74faf45..a75030026 100644 --- a/src/library/FeedDocumentBundle.php +++ b/src/library/FeedDocumentBundle.php @@ -288,8 +288,8 @@ class FeedDocumentBundle extends IndexDocumentBundle $this->addScoresDocMap($doc_id, $num_words, intval($item[self::PUBDATE]), 0, $title_length, $title_length, [], []); - $this->addTermPostingLists(0, $num_words, - $word_list, $meta_ids, $this->doc_map_counter); + $this->addTermPostingLists(0, $word_list, $meta_ids, + $this->doc_map_counter); $this->doc_map_counter++; $this->updateTrendingTermCounts($term_counts, $phrase_string, $word_list, $media_category, $source_name, $lang, diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index ed65f7a05..60259c7cc 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -184,16 +184,14 @@ class IndexDocumentBundle implements CrawlConstants */ public $next_partition_to_add; /** - * IndexDictionary for all shards in the IndexArchiveBundle - * This contains entries of the form (word, num_shards with word, - * posting list info 0th shard containing the word, - * posting list info 1st shard containing the word, ...) - * @var object + * Reference to the LSMTree used to store term => array of partition + * posting list info + * @var LSMTree */ public $dictionary; /** * PartitionDocumentBundle for web page documents - * @var object + * @var PartitionDocumentBundle */ public $documents; /** @@ -965,8 +963,8 @@ class IndexDocumentBundle implements CrawlConstants $url_info[self::SCORE], $host_keywords_end_pos, $title_end_pos, $path_keywords_end_pos, $description_scores, $user_ranks, $terms_filter); - $this->addTermPostingLists(0, $num_words, - $word_lists, $meta_ids, $this->doc_map_counter); + $this->addTermPostingLists(0, $word_lists, $meta_ids, + $this->doc_map_counter); $this->doc_map_counter++; $interim_elapse = changeInMicrotime($interim_time); if ($interim_elapse > 5) { @@ -1150,8 +1148,6 @@ class IndexDocumentBundle implements CrawlConstants * @param int $position_offset number of header bytes that might be used * before including any position data in the file that positions will * eventually be stored. - * @param int $doc_length length of document in terms for the document - * for which we are adding posting data. * @param array $word_lists term => positions within current document of * that term for the document whose posting data we are adding * @param array $meta_ids meta terms associated with the document we are @@ -1160,9 +1156,10 @@ class IndexDocumentBundle implements CrawlConstants * we are adding. I.e., 5 would mean there were 5 earlier documents whose * postings we have already added. */ - public function addTermPostingLists($position_offset, $doc_length, - $word_lists, $meta_ids, $doc_map_index) + public function addTermPostingLists($position_offset, $word_lists, + $meta_ids, $doc_map_index) { + static $my_counter = 0; $postings_tools = $this->postings_tools; $last_entries_tools = $this->last_entries_tools; foreach ($meta_ids as $meta_id) { @@ -1171,9 +1168,8 @@ class IndexDocumentBundle implements CrawlConstants foreach ($word_lists as $word => $position_list) { $term_id = canonicalTerm($word); $meta_prefix = substr($word, 0, 5); - $site_meta = ($meta_prefix == "site:" || $meta_prefix == "info:"); - $occurrences = $site_meta ? $doc_length : count($position_list); - if (!$site_meta && $occurrences > 0) { + $occurrences = count($position_list); + if ($occurrences > 0) { $encoded_position_list = encodePositionList($position_list); $offset = $position_offset + strlen($this->positions); $len = strlen($encoded_position_list); @@ -1192,7 +1188,7 @@ class IndexDocumentBundle implements CrawlConstants array_values($last_entry_row[0]); } $diff_doc_map_index = $doc_map_index - $last_index; - $diff_offset = (!$site_meta && $occurrences > 0) ? + $diff_offset = ($occurrences > 0) ? $offset - $last_offset : 0; $entry = $postings_tools->pack([ "DOC_MAP_INDEX" => $diff_doc_map_index, @@ -1561,7 +1557,7 @@ class IndexDocumentBundle implements CrawlConstants $file_handles[$partition] = $fh; } if ($fh && fseek($fh, $offset) == 0 && $len > 0) { - $out = fread($fh, $len); + $out = decode255(fread($fh, $len) ?? ""); return $out; } return ""; @@ -1619,13 +1615,16 @@ class IndexDocumentBundle implements CrawlConstants $current_pos); if ($pre_item["FREQUENCY"] > C\MAX_DESCRIPTION_LEN) { crawlLog("Posting decode error! Frequency too large"); + crawlLog(".. Decode Format was: " . $unpack_map[$int_info]); crawlLog("..Number to decode items: " . $num_items); crawlLog("..Number decoded: " . $i); crawlLog("..Length posting string: " . strlen($postings_string)); crawlLog("..Current position: " . $current_pos); crawlLog("..Large Frequency Observed: ". - $pre_item["FREQUENCY"] . " ". C\MAX_DESCRIPTION_LEN); + $pre_item["FREQUENCY"] . + " more than max description length:". + C\MAX_DESCRIPTION_LEN); return [$items, $sum_frequencies]; // sanity check 3 } $item = $pre_item; diff --git a/src/library/LSMTree.php b/src/library/LSMTree.php index e9c93e163..32c69e747 100644 --- a/src/library/LSMTree.php +++ b/src/library/LSMTree.php @@ -129,7 +129,7 @@ class LSMTree $this->block_factor = $block_factor; } /** - * + * @return string */ public function getTierFolder($tier) { @@ -137,14 +137,17 @@ class LSMTree sprintf("%'.04d", $tier); } /** - * + * @return bool */ public function occupiedTier($tier) { return file_exists($this->getTierFolder($tier) . "/A"); } /** - * + * Returns the highest occupied tier of the LSMTree + * @param bool $recompute whether to return cache value if exists (false) or + * recompute it by examing the file system (true) + * @return int the maximum tier */ public function getMaxTier($recompute = false) { @@ -167,7 +170,8 @@ class LSMTree return self::$max_tier = $max_tier; } /** - * + * Within a tier select which of the two slots (A or B) to write + * entry data to */ public function selectPutSlot($letter) { diff --git a/src/library/PackedTableTools.php b/src/library/PackedTableTools.php index 48fe14ff9..950d1580c 100644 --- a/src/library/PackedTableTools.php +++ b/src/library/PackedTableTools.php @@ -528,6 +528,9 @@ class PackedTableTools $packed_data .= chr($magnitude + $positive); $cur_int_char = ($cur_int_char == -1) ? 0 : $cur_int_char; + $cur_int_add = 0; /*(0 << $shift) deliberately + set so not to use old value by accident + */ } else { if ($magnitude < 32768) { $packed_int = pack("n", $magnitude); @@ -690,10 +693,13 @@ class PackedTableTools $current_pos += $len; break; case "INT": - if ($ints_used >= $num_int_columns || - empty($int_info)) { + if ($ints_used >= $num_int_columns) { return null; } + if (empty($int_info)) { + $item[$field_name] = 0; + break; + } $int_code = (ord($int_info[$current_int_pos]) & (3 << $shift)) >> $shift; if (!isset($table_row[$current_pos])) { diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php index b4ca52210..2fb59e8af 100755 --- a/src/library/index_bundle_iterators/DocIterator.php +++ b/src/library/index_bundle_iterators/DocIterator.php @@ -76,13 +76,6 @@ class DocIterator extends IndexBundleIterator * @var int */ public $key_index; - /** - * The index version affects how the iterator cycles through documents - * There was a big change in index format between version 3 and prior - * formats - * @var int - */ - public $index_version; /** * The next byte offset of a doc in the IndexShard * @var int @@ -152,7 +145,6 @@ class DocIterator extends IndexBundleIterator $this->index_name = $index_name; $this->direction = $direction; $this->ranking_factors = $ranking_factors; - $this->index_version = IndexManager::getVersion($index_name); $index = IndexManager::getIndex($index_name, $direction); if (empty($index)) { $this->num_docs = 0; @@ -163,15 +155,8 @@ class DocIterator extends IndexBundleIterator return; } $info = $index->getArchiveInfo($index->dir_name); - if ($this->index_version < 3) { - $this->num_docs = $info['COUNT']; - $this->num_generations = - (isset($index->generation_info['ACTIVE'])) ? - $index->generation_info['ACTIVE'] + 1 : 0; - } else { - $this->num_docs = ($info['COUNT'] ?? 0) + ($info['ACTIVE'] ?? 0); - $this->num_generations = $info['SAVE_PARTITION'] + 1; - } + $this->num_docs = ($info['COUNT'] ?? 0) + ($info['ACTIVE'] ?? 0); + $this->num_generations = $info['SAVE_PARTITION'] + 1; $this->results_per_block = $results_per_block; $this->current_block_fresh = false; $this->reset(); @@ -207,36 +192,25 @@ class DocIterator extends IndexBundleIterator if($this->num_generations <= 0) { return; } - if ($this->index_version < 3 && isset($this->shard_lens[$generation])) { - $this->last_offset = $this->shard_lens[$generation]; - } else { - $index = IndexManager::getIndex($this->index_name); - if ($this->index_version < 3) { - $index->setCurrentShard($generation, true); - $shard = $index->getCurrentShard(); - $this->last_offset = $shard->docids_len; - $this->shard_lens[$generation] = $shard->docids_len; - } else { - if ($generation != $this->doc_map_generation) { - $base_folder = $index->getPartitionBaseFolder( - $this->current_generation); - $doc_map_filename = $base_folder . "/" . - IndexDocumentBundle::DOC_MAP_FILENAME; - $doc_map_tools = $index->doc_map_tools; - $this->doc_map = $doc_map_tools->load($doc_map_filename) - ?? []; - $doc_keys = array_keys($this->doc_map); - $key_index = []; - foreach ($this->doc_map as $key => $entry) { - if (!$index->isType($key, "link")) { - $key_index[] = $key; - } - } - $this->key_index = $key_index; - $this->last_offset = count($key_index) - 1; - $this->doc_map_generation = $generation; + $index = IndexManager::getIndex($this->index_name); + if ($generation != $this->doc_map_generation) { + $base_folder = $index->getPartitionBaseFolder( + $this->current_generation); + $doc_map_filename = $base_folder . "/" . + IndexDocumentBundle::DOC_MAP_FILENAME; + $doc_map_tools = $index->doc_map_tools; + $this->doc_map = $doc_map_tools->load($doc_map_filename) + ?? []; + $doc_keys = array_keys($this->doc_map); + $key_index = []; + foreach ($this->doc_map as $key => $entry) { + if (!$index->isType($key, "link")) { + $key_index[] = $key; } } + $this->key_index = $key_index; + $this->last_offset = count($key_index) - 1; + $this->doc_map_generation = $generation; } } /** @@ -262,20 +236,10 @@ class DocIterator extends IndexBundleIterator $pre_results = []; $this->next_offset = $this->current_offset; $index = IndexManager::getIndex($this->index_name); - if ($this->index_version < 3) { - $index->setCurrentShard($this->current_generation, true); - //the next call also updates next offset - $shard = $index->getCurrentShard(); - $num_docs_or_links = ($this->index_version < 3) ? - $shard->num_docs + $shard->num_link_docs : 0; - $doc_offset_key_len = IndexShard::DOC_KEY_LEN; - } $this->getGenerationInfo($this->current_generation); - if ($this->index_version >= 3) { - $doc_map_tools = $index->doc_map_tools; - $doc_keys = $this->key_index; - $doc_map = $this->doc_map; - } + $doc_map_tools = $index->doc_map_tools; + $doc_keys = $this->key_index; + $doc_map = $this->doc_map; $pre_results = []; $num_docs_so_far = 0; $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN; @@ -284,47 +248,32 @@ class DocIterator extends IndexBundleIterator || (!$is_ascending && $this->next_offset < 0)) { break; } - if ($this->index_version < 3) { - $posting = L\packPosting($this->next_offset >> 4, [1]); - list($doc_id, $num_keys, $item) = - $shard->makeItem($posting, $num_docs_or_links, - $this->direction); - } else { - $doc_id = $doc_keys[$this->next_offset]; - $map_entry = $doc_map[$doc_id]; - // skip term filter if present - $map_entry = ($map_entry >= ($termsfilter_len + 1) && - $map_entry[0] == 't') ? - substr($map_entry, $termsfilter_len + 1) : - $map_entry; - $doc_info = $doc_map_tools->unpack($map_entry); - $item = [self::GENERATION => $this->current_generation]; - $item[self::DOC_RANK] = $this->computeDocRank($doc_id, - $this->next_offset, $this->current_generation, - $this->num_generations, $this->last_offset, - $this->last_offset, $this->last_offset, - $this->ranking_factors, $is_ascending); - list($item[self::DOC_LEN], ) = - array_values(array_shift($doc_info)); - $item[self::SCORE] = $item[self::DOC_RANK]; - list(, $num_description_scores) = - array_values(array_shift($doc_info)); - $item[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0, - $num_description_scores); - $item[self::USER_RANKS] = array_slice($doc_info, - $num_description_scores); - $item[self::INDEX_VERSION] = $this->index_version; - $item[self::IS_DOC] = true; - } + $doc_id = $doc_keys[$this->next_offset]; + $map_entry = $doc_map[$doc_id]; + // skip term filter if present + $map_entry = ($map_entry >= ($termsfilter_len + 1) && + $map_entry[0] == 't') ? + substr($map_entry, $termsfilter_len + 1) : + $map_entry; + $doc_info = $doc_map_tools->unpack($map_entry); + $item = [self::GENERATION => $this->current_generation]; + $item[self::DOC_RANK] = $this->computeDocRank($doc_id, + $this->next_offset, $this->current_generation, + $this->num_generations, $this->last_offset, + $this->last_offset, $this->last_offset, + $this->ranking_factors, $is_ascending); + list($item[self::DOC_LEN], ) = + array_values(array_shift($doc_info)); + $item[self::SCORE] = $item[self::DOC_RANK]; + list(, $num_description_scores) = + array_values(array_shift($doc_info)); + $item[self::DESCRIPTION_SCORES] = array_slice($doc_info, 0, + $num_description_scores); + $item[self::USER_RANKS] = array_slice($doc_info, + $num_description_scores); + $item[self::IS_DOC] = true; if ($is_ascending) { - if ($this->index_version < 3) { - if ($num_keys % 2 == 0) { - $num_keys++; - } - $this->next_offset += ($num_keys + 1) * $doc_offset_key_len; - } else { - $this->next_offset++; - } + $this->next_offset++; } else { $this->next_offset = $this->getPreviousDocOffset( $this->next_offset); @@ -361,10 +310,7 @@ class DocIterator extends IndexBundleIterator */ public function getPreviousDocOffset($doc_offset) { - $doc_item_len = ($this->index_version < 3) ? - 4 * IndexShard::DOC_KEY_LEN : 1; - // this is not correct, only works if no additions doc keys - return $doc_offset - $doc_item_len; + return $doc_offset - 1; } /** * Updates the seen_docs count during an advance() call @@ -373,8 +319,7 @@ class DocIterator extends IndexBundleIterator { if ($this->current_block_fresh != true) { $is_ascending = ($this->direction == self::ASCENDING); - $doc_item_len = ($this->index_version < 3) ? - 4 * IndexShard::DOC_KEY_LEN : 1; + $doc_item_len = 1; $pre_num_docs = ($is_ascending) ? ($this->last_offset - $this->next_offset) / $doc_item_len : $this->next_offset/$doc_item_len; @@ -437,8 +382,7 @@ class DocIterator extends IndexBundleIterator $this->next_offset = $this->current_offset; } } - $this->seen_docs = $this->current_offset / - (($this->index_version < 3) ? 4 * IndexShard::DOC_KEY_LEN : 1); + $this->seen_docs = $this->current_offset; } } /** diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php index 06d32b0fe..83581e3d0 100644 --- a/src/library/index_bundle_iterators/GroupIterator.php +++ b/src/library/index_bundle_iterators/GroupIterator.php @@ -343,24 +343,11 @@ class GroupIterator extends IndexBundleIterator if (isset($doc_info[self::GENERATION])) { $machine_id = (isset($doc_info[self::MACHINE_ID])) ? $doc_info[self::MACHINE_ID] :$this->current_machine; - if (!empty($doc_info[self::INDEX_VERSION])) { - $out_pages[$hash_url][self::SUMMARY_OFFSET][] = - [$machine_id, $doc_info[self::KEY], - $doc_info[self::CRAWL_TIME], - $doc_info[self::GENERATION], - "PDB"]; - } else if (is_int($doc_info[self::SUMMARY_OFFSET])) { - $out_pages[$hash_url][self::SUMMARY_OFFSET][] = - [$machine_id, $doc_info[self::KEY], - $doc_info[self::CRAWL_TIME], - $doc_info[self::GENERATION], - $doc_info[self::SUMMARY_OFFSET]]; - } else if (is_array($doc_info[self::SUMMARY_OFFSET])) { - $out_pages[$hash_url][self::SUMMARY_OFFSET] = - array_merge( - $out_pages[$hash_url][self::SUMMARY_OFFSET], - $doc_info[self::SUMMARY_OFFSET]); - } + $out_pages[$hash_url][self::SUMMARY_OFFSET][] = + [$machine_id, $doc_info[self::KEY], + $doc_info[self::CRAWL_TIME], + $doc_info[self::GENERATION], + "PDB"]; } } if ($add_lookup) { diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 11d6bbd22..a1dcb9bd0 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -106,13 +106,6 @@ class WordIterator extends IndexBundleIterator * @var string */ public $index_name; - /** - * The index version affects how the iterator cycles through documents - * There was a big change in index format between version 3 and prior - * formats - * @var int - */ - public $index_version; /** * Whether word key corresponds to a meta word * @var string @@ -280,14 +273,9 @@ class WordIterator extends IndexBundleIterator $info = ($this->direction == self::ASCENDING) ? $this->dictionary_info[0] : $this->dictionary_info[ $this->num_generations - 1]; - if ($this->index_version < 3) { - list($this->current_generation, $this->start_offset, - $this->last_offset, ) = $info; - } else { - $this->current_generation = $info['PARTITION']; - $this->start_offset = 0; - $this->last_offset = $info['NUM_DOCS'] - 1; - } + $this->current_generation = $info['PARTITION']; + $this->start_offset = 0; + $this->last_offset = $info['NUM_DOCS'] - 1; } else { $this->start_offset = 0; $this->last_offset = -1; @@ -320,7 +308,6 @@ class WordIterator extends IndexBundleIterator if (!empty($this->term_info_computed)) { return; } - $this->index_version = IndexManager::getVersion($index_name); $word_info = IndexManager::getWordInfo($index_name, $word_key, -1, -1, C\NUM_DISTINCT_GENERATIONS, true); $this->total_num_docs = $word_info['TOTAL_NUM_DOCS'] ?? 0; @@ -414,16 +401,6 @@ class WordIterator extends IndexBundleIterator public function getPostingsSliceResults() { $this->next_offset = $this->current_offset; - if ($this->index_version < 3) { - $index = IndexManager::getIndex($this->index_name); - $index->setCurrentShard($this->current_generation, true); - //the next call also updates next offset - $shard = $index->getCurrentShard(true); - $pre_results = $shard->getPostingsSlice($this->start_offset, - $this->next_offset, $this->last_offset, - $this->results_per_block, $this->direction); - return $pre_results; - } if ($this->direction == self::ASCENDING) { if ($this->current_offset < $this->start_offset) { $this->current_offset = $this->start_offset; @@ -699,7 +676,6 @@ class WordIterator extends IndexBundleIterator $posting[self::RELEVANCE]; $posting[self::USER_RANKS] = array_slice($doc_info, $num_description_scores); - $posting[self::INDEX_VERSION] = $this->index_version; $key_postings[$doc_key] = $posting; } if (!empty($fh)) { @@ -809,41 +785,20 @@ class WordIterator extends IndexBundleIterator */ public function advanceSeenDocs() { - $version = $this->index_version; if ($this->current_block_fresh != true) { if ($this->direction == self::ASCENDING) { - $remaining_postings = ($version < 3) ? - IndexShard::numDocsOrLinks( - $this->next_offset, $this->last_offset) : - $this->last_offset - $this->next_offset; + $remaining_postings = $this->last_offset - $this->next_offset; $num_docs = min($this->results_per_block, $remaining_postings); $delta_sign = 1; } else { - if ($version < 3) { - $total_guess = IndexShard::numDocsOrLinks( - $this->start_offset, $this->next_offset); - $num_docs = $total_guess % $this->results_per_block; - if ($num_docs == 0) { - $num_docs = $this->results_per_block; - } else { - $num_docs = IndexShard::numDocsOrLinks( - $this->start_offset, $this->last_offset) % - $this->results_per_block; - if ($num_docs == 0) { - $num_docs = $this->results_per_block; - } - } - } else { - $remaining_postings = $this->next_offset - - $this->start_offset + 1; - $num_docs = min($this->results_per_block, - $remaining_postings); - } + $remaining_postings = $this->next_offset - + $this->start_offset + 1; + $num_docs = min($this->results_per_block, + $remaining_postings); $delta_sign = -1; } - $posting_len = ($version < 3) ? IndexShard::POSTING_LEN : 1; $this->next_offset = $this->current_offset; - $this->next_offset += $delta_sign * $posting_len * $num_docs; + $this->next_offset += $delta_sign * $num_docs; if ($num_docs <= 0) { return; } @@ -881,22 +836,9 @@ class WordIterator extends IndexBundleIterator $this->advanceGeneration($gen_doc_offset[0]); $this->next_offset = $this->current_offset; } - if ($this->index_version < 3) { - $index = IndexManager::getIndex($this->index_name); - $index->setCurrentShard($this->current_generation, true); - $shard = $index->getCurrentShard(); - } if ($this->current_generation == $gen_doc_offset[0]) { - if ($this->index_version < 3) { - $end_offset = ($is_ascending) ? $this->last_offset : - $this->start_offset; - $offset_pair = $shard->nextPostingOffsetDocOffset( - $this->next_offset, $end_offset, $gen_doc_offset[1], - $this->direction); - } else { - $offset_pair = $this->nextDocIndexOffsetPair( - $gen_doc_offset[1]); - } + $offset_pair = $this->nextDocIndexOffsetPair( + $gen_doc_offset[1]); if ($offset_pair === false) { $this->advanceGeneration(); $this->next_offset = $this->current_offset; @@ -906,13 +848,10 @@ class WordIterator extends IndexBundleIterator $this->next_offset = $this->current_offset; } } - $posting_len = ($this->index_version < 3) ? IndexShard::POSTING_LEN : 1; if ($is_ascending) { - $this->seen_docs = ($this->current_offset - $this->start_offset) / - $posting_len; + $this->seen_docs = ($this->current_offset - $this->start_offset); } else { - $this->seen_docs = ($this->last_offset - $this->current_offset) / - $posting_len; + $this->seen_docs = ($this->last_offset - $this->current_offset); } $this->current_block_fresh = false; } @@ -1029,17 +968,11 @@ class WordIterator extends IndexBundleIterator $this->generation_pointer < $this->num_generations : $this->generation_pointer >= 0; if ($gen_check) { - if ($this->index_version < 3) { - list($this->current_generation, $this->start_offset, - $this->last_offset, ) - = $this->dictionary_info[$this->generation_pointer]; - } else { - $partition_info = - $this->dictionary_info[$this->generation_pointer]; - $this->current_generation = $partition_info['PARTITION']; - $this->start_offset = 0; - $this->last_offset = ($partition_info['NUM_DOCS'] ?? 1) - 1; - } + $partition_info = + $this->dictionary_info[$this->generation_pointer]; + $this->current_generation = $partition_info['PARTITION']; + $this->start_offset = 0; + $this->last_offset = ($partition_info['NUM_DOCS'] ?? 1) - 1; $this->current_offset = ($is_ascending) ? $this->start_offset: $this->last_offset; } @@ -1061,8 +994,7 @@ class WordIterator extends IndexBundleIterator public function getGenerationPostings($generation) { static $test_time = 0; - if ($this->index_version < 3 || - empty($this->dictionary_info[$generation])) { + if (empty($this->dictionary_info[$generation])) { return []; } $generation_info = $this->dictionary_info[$generation]; @@ -1071,24 +1003,13 @@ class WordIterator extends IndexBundleIterator return $generation_info['POSTINGS']; //already loaded } $index = IndexManager::getIndex($this->index_name); - if ($this->index_version < "3.2") { - if (empty($generation_info['LAST_BLOB_LEN'])) { - $postings_entry = ""; - } else { - $postings_entry = $index->dictionary->getArchive( - $this->archive_file, $generation_info['POSTINGS'], - $generation_info['LAST_BLOB_LEN']); - unset($this->dictionary_info[$generation]['LAST_BLOB_LEN']); - } + if (empty($generation_info['POSTINGS_OFFSET']) || + empty($generation_info['POSTINGS_LEN'])) { + $postings_entry = ""; } else { - if (empty($generation_info['POSTINGS_OFFSET']) || - empty($generation_info['POSTINGS_LEN'])) { - $postings_entry = ""; - } else { - $postings_entry = $index->getPostingsString($generation, - $generation_info['POSTINGS_OFFSET'], - $generation_info['POSTINGS_LEN']); - } + $postings_entry = $index->getPostingsString($generation, + $generation_info['POSTINGS_OFFSET'], + $generation_info['POSTINGS_LEN']); } $postings = []; if (!empty($postings_entry)) { @@ -1118,15 +1039,8 @@ class WordIterator extends IndexBundleIterator $this->generation_pointer >= $this->num_generations) : ($this->current_offset < $this->start_offset|| $this->generation_pointer < -1); - if ($offset_check) { - return -1; - } - if ($this->index_version < 3) { - $index = IndexManager::getIndex($this->index_name); - $index->setCurrentShard($this->current_generation, true); - $this->current_doc_offset = $index->getCurrentShard( - )->docOffsetFromPostingOffset($this->current_offset); - } else if (empty($this->dictionary_info[$this->generation_pointer])){ + if ($offset_check || + empty($this->dictionary_info[$this->generation_pointer])) { return -1; } else { $partition_info = $this->dictionary_info[$this->generation_pointer]; diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php index 5066c23eb..4067f56c8 100755 --- a/src/models/ParallelModel.php +++ b/src/models/ParallelModel.php @@ -286,18 +286,13 @@ class ParallelModel extends Model } else { return false; } - if (IndexManager::getVersion($index_name) < 3) { - $summary = - $index_archive->getPage($summary_offset, $generation); - } else { - $summary = - $index_archive->getSummary($summary_offset, + $summary = + $index_archive->getSummary($summary_offset, + $generation); + if ($return_cached_page) { + $summary[self::PAGE] = + $index_archive->getCachePage($summary_offset, $generation); - if ($return_cached_page) { - $summary[self::PAGE] = - $index_archive->getCachePage($summary_offset, - $generation); - } } } else { $test_time = microtime(true); @@ -426,21 +421,12 @@ class ParallelModel extends Model $index_name = $this->index_name; } $index_archive = IndexManager::getIndex($index_name); - $index_version = IndexManager::getVersion($index_name); - $make_term_id = ($index_version < 3) ? C\NS_LIB . "crawlHashWord" : - C\NS_LIB . "canonicalTerm"; + $make_term_id = C\NS_LIB . "canonicalTerm"; if (!$index_archive) { return false; } $num_retrieved = 0; $summary_offset = null; - if ($index_version < 3 && - !isset($index_archive->generation_info['ACTIVE'])) { - return false; - } - if ($index_version < 3) { - $num_generations = $index_archive->generation_info['ACTIVE']; - } $add_info = (strncmp($url_or_key, "info:", 5) == 0) ? "" : "info:"; $hash_key = ($is_key) ? $make_term_id($url_or_key, true) : @@ -452,7 +438,7 @@ class ParallelModel extends Model if (!isset($info[0][4]) && empty($info['ROWS'][0])) { return false; } - $term_id = ($index_version < 3) ? $info[0][4] : $hash_key; + $term_id = $hash_key; if (!empty($info['ROWS'][0])) { $generation = $info['ROWS'][0]['PARTITION']; } @@ -462,8 +448,7 @@ class ParallelModel extends Model if (!$doc_info) { return false; } - $summary_offset = ($index_version < 3) ? - $doc_info[self::SUMMARY_OFFSET] : $doc_info[self::KEY]; + $summary_offset = $doc_info[self::KEY]; $generation = $doc_info[self::GENERATION]; } else { return false; diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index afdce9287..d7c00cb2c 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -749,7 +749,6 @@ class PhraseModel extends ParallelModel $quote_state = ($quote_state) ? false : true; } //stemmed, if have stemmer - $index_version = IndexManager::getVersion($index_name); $add_metas = $found_metas; $words = array_merge($base_words, $add_metas); if (count($words) == 0 && count($disallow_phrases) > 0) { @@ -783,8 +782,7 @@ class PhraseModel extends ParallelModel $this->query_info['QUERY'] .= ")<br>"; } } - $make_term_id = ($index_version < 3) ? C\NS_LIB . "crawlHashWord" : - C\NS_LIB . "canonicalTerm"; + $make_term_id = C\NS_LIB . "canonicalTerm"; if (isset($words) && count($words) == 1 && count($disallow_phrases) < 1 && !strpos($words[0], " ")) { $phrase_string = $words[0]; @@ -825,9 +823,6 @@ class PhraseModel extends ParallelModel "<br>"; } $disallow_keys[] = $make_term_id($disallow_stem[0]); - if ($index_version == 0) { - $disallow_keys[] = L\crawlHash($word); - } } if ($word_keys !== null) { $word_struct = ["KEYS" => $word_keys, @@ -1852,9 +1847,7 @@ class PhraseModel extends ParallelModel $index_name = $word_structs[0]["INDEX_NAME"]; } //we assume all indexes in use of the same version - $index_version = IndexManager::getVersion($index_name); - $make_term_id = ($index_version < 3) ? C\NS_LIB . "crawlHashWord" : - C\NS_LIB . "canonicalTerm"; + $make_term_id = C\NS_LIB . "canonicalTerm"; $doc_iterate_hashes = [substr($make_term_id("site:any"), 0, 9), substr(L\crawlHash("site:any"), 0, 9), substr($make_term_id("site:doc"), 0, 9), @@ -1924,8 +1917,7 @@ class PhraseModel extends ParallelModel $min_group_override = true; } else { $distinct_key = $distinct_word_keys[$i]; - $distinct_key_id = ($index_version < 3) ? - L\unbase64Hash($distinct_key) : $distinct_key; + $distinct_key_id = $distinct_key; $direction = self::ASCENDING; $actual_index_name = $index_name; if (($index_name[0] == "-")) {