viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 7e298c674..4181aa58c 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -139,9 +139,7 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants $this->checkFilter($argv[2], $argv[3]); break; case "count": - if (!isset($argv[3])) { - $argv[3] = false; - } + $argv[3] ??= false; $this->outputCountBundle($path, $argv[3]); break; case "doc-lookup": @@ -171,51 +169,41 @@ class ArcTool extends DictionaryUpdater implements CrawlConstants $this->outputArchiveList(); break; case "make-filter": - if (!isset($argv[4])) { - $argv[4] = -1; - } + $argv[4] ??= -1; $this->makeFilter($argv[2], $argv[3], $argv[4]); break; case "migrate": - if (!isset($argv[3])) { - $argv[3] = 1; - } + $argv[3] ??= 1; $this->migrateIndexArchive($path, $argv[3]); break; case "partition": + $argv[3] ??= 0; $this->outputPartitionInfo($path, $argv[3]); break; case "fix-partition": - if (!isset($argv[3])) { - $argv[3] = 0; - } - if (!isset($argv[4])) { - $argv[4] = -1; - } + $argv[3] ??= 0; + $argv[4] ??= -1; $this->fixPartitionIndexes($path, $argv[3], $argv[4]); break; case "rebuild": - if (!isset($argv[3])) { - $argv[3] = 0; - } - if (!isset($argv[4])) { - $argv[4] = 1; - } - $this->rebuildIndexBundle($path, $argv[3], true, $argv[4]); + $argv[3] ??= 0; + $argv[4] ??= 1; + $argv[5] ??= -1; + $this->rebuildIndexBundle($path, $argv[3], true, $argv[4], + $argv[5]); break; case "remerge": - if (!isset($argv[3])) { - $argv[3] = 0; - } - $this->rebuildIndexBundle($path, $argv[3], false); + $argv[3] ??= 0; + $argv[4] ??= 1; + $argv[5] ??= -1; + $this->rebuildIndexBundle($path, $argv[3], false, $argv[4], + $argv[5]); break; case "show": if (!isset($argv[3])) { $this->usageMessageAndExit(); } - if (!isset($argv[4])) { - $argv[4] = 1; - } + $argv[4] ??= 1; $this->outputShowPages($path, $argv[3], $argv[4]); break; default: @@ -1394,9 +1382,12 @@ EOD; * partition inverted indexes or to try to use 
existing ones if present * @param int $number_of_processes number of CPU processes to use * when trying to recompute partition inverted indexes + * @param mixed $last_generation which web archive generation to end + * rebuild at. Default value of -1 goes till last partition */ public function rebuildIndexBundle($archive_path, $start_generation = 0, - $force_recompute = true, $number_of_processes = 1) + $force_recompute = true, $number_of_processes = 1, + $last_generation = -1) { $rebuilding = ($force_recompute) ? "Rebuilding" : "Remerging"; $rebuild = ($force_recompute) ? "rebuild" : "remerge"; @@ -1477,12 +1468,14 @@ EOD; $recent_log_times[$i] = time(); } $rebuild_dones = []; - while ($next_partition < $save_partition) { + $end_partition = ($last_generation > 0) ? min($save_partition, + $last_generation) : $save_partition; + while ($next_partition < $end_partition) { if ($old_next_partition != $next_partition) { $old_next_partition = $next_partition; - $num_forks = min($save_partition - $next_partition, + $num_forks = min($end_partition - $next_partition, $number_of_processes); - echo "Num forks:$num_forks, ". + echo "Num forks: $num_forks, ". 
"num processes $number_of_processes\n"; for ($i = 0; $i < $num_forks; $i++) { $process_partition = $next_partition + $i; @@ -1547,7 +1540,8 @@ EOD; all partition, then the call below is used to remerge them into the global dictionary */ - $this->rebuildIndexBundle($archive_path, $start_generation, false); + $this->rebuildIndexBundle($archive_path, $start_generation, false, + last_generation: $last_generation); } echo "\nIndex $rebuild complete!\n"; } @@ -1633,12 +1627,12 @@ php ArcTool.php doc-lookup bundle_name partition doc_map_index returns the document stored in partition at doc_map_index (here doc_map_index is the value that would be stored in a posting) */ -php ArcTool.php dict bundle_name word [details] -php ArcTool.php dict double_index_name which_bundle word [details] -php ArcTool.php dict bundle_name word start_record num_records [details] -php ArcTool.php dict double_index_name which_bundle word start_record num_records [details] +php ArcTool.php dict bundle_info word [details] +php ArcTool.php dict bundle_info word start_record num_records [details] /* returns index dictionary records for word stored in index archive bundle - or double index bundle. In the later case you should provide which bundle + or double index bundle. bundle_info is either the name of the bundle, + or for double index bundle, the name of the bundle whitespace which + sub-bundle. In the later case you should provide which bundle you want dictionary info for. This command also supports start and number of record parameters. 
If the word details is added to the end of the command then additional information about each @@ -1699,30 +1693,27 @@ php ArcTool.php partition double_index_name which_bundle partition_number or double index bundle (in which case need to say either 0 or 1 bundle) */ -php ArcTool.php rebuild bundle_name -php ArcTool.php rebuild double_index_name which_bundle -php ArcTool.php rebuild bundle_name continue -php ArcTool.php rebuild bundle_name continue number_of_processes -php ArcTool.php rebuild double_index_name which_bundle continue -php ArcTool.php rebuild bundle_name partition_num -php ArcTool.php rebuild bundle_name partition_num number_of_processes -php ArcTool.php rebuild double_index_name which_bundle partition_num +php ArcTool.php rebuild bundle_info +php ArcTool.php rebuild bundle_info partition_num +php ArcTool.php rebuild bundle_info partition_num number_of_processes +php ArcTool.php rebuild bundle_info partition_num number_of_processes end_partition /* re-extracts words from summaries files in bundle_name a partition at a time, builds an inverted index for that partition and adds to the global - dictionary. If this process crashes the keyword continue can be used - to continue from where it left off. If a partition number is supplied - process continue from that partition number. The default number of - processes that are used for index rebuilding is 1, however, - rebuilding is faster if you specify a number_of_processes proportional - to the number of CPU cores of your machine. + dictionary. bundle_info is either the name of the bundle, or for + double index bundle, the name of the bundle whitespace which sub-bundle. + If this process crashes the keyword continue can be used + to continue from where it left off. If a partition_num is supplied + process continue from that partition number. The keyword "continue" can + be used to continue from the last processed partition number. 
+ The default number of processes that are used for index rebuilding is 1, + however, rebuilding is faster if you specify a number_of_processes + proportional to the number of CPU cores of your machine. */ -php ArcTool.php remerge bundle_name -php ArcTool.php remerge double_index_name which_bundle -php ArcTool.php remerge bundle_name continue -php ArcTool.php remerge double_index_name which_bundle continue -php ArcTool.php remerge bundle_name partition_num -php ArcTool.php remerge double_index_name which_bundle partition_num +php ArcTool.php remerge bundle_info +php ArcTool.php remerge bundle_info partition_num +php ArcTool.php remerge bundle_info partition_num number_of_processes +php ArcTool.php remerge bundle_info partition_num number_of_processes end_partition /* this operates like the previously described rebuild command except if the inverted index files for a partition already exist in that partition they are not recomputed (if they don't exist, they diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index 60259c7cc..bc91360ba 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -329,7 +329,7 @@ class IndexDocumentBundle implements CrawlConstants "POSITIONS_OFFSET" => "INT", "POSITIONS_LEN" => "INT"], $record_compressor); $unpack_codes = [0 => "C", 1 => "n", 2=> "N", 3 => "J"]; - $len_codes = [0 => 1, 1 => 2, 2=> 4, 3 => 8]; + $len_codes = [0 => 1, 1 => 2, 2 => 4, 3 => 8]; for ($i = 0; $i < 4; $i++) { for ($j = 0; $j < 4; $j++) { for ($k = 0; $k < 4; $k++) { @@ -789,8 +789,9 @@ class IndexDocumentBundle implements CrawlConstants if ($just_stats) { $term_stats = []; foreach ($this->postings as $term => $postings) { - list($posting_records,) = $this->unpackPostings($postings); - $term_stats[$term] = count($posting_records); + $stat_pos = 0; + $num_records = vByteDecode($postings, $stat_pos); + $term_stats[$term] = $num_records; } $statistics = [ "NUM_DOCS" => $this->doc_map_counter, 
@@ -1159,7 +1160,6 @@ class IndexDocumentBundle implements CrawlConstants public function addTermPostingLists($position_offset, $word_lists, $meta_ids, $doc_map_index) { - static $my_counter = 0; $postings_tools = $this->postings_tools; $last_entries_tools = $this->last_entries_tools; foreach ($meta_ids as $meta_id) { @@ -1167,7 +1167,6 @@ class IndexDocumentBundle implements CrawlConstants } foreach ($word_lists as $word => $position_list) { $term_id = canonicalTerm($word); - $meta_prefix = substr($word, 0, 5); $occurrences = count($position_list); if ($occurrences > 0) { $encoded_position_list = encodePositionList($position_list); @@ -1190,10 +1189,14 @@ class IndexDocumentBundle implements CrawlConstants $diff_doc_map_index = $doc_map_index - $last_index; $diff_offset = ($occurrences > 0) ? $offset - $last_offset : 0; + //note:pack adds vByteEncode of num rows packed to front $entry = $postings_tools->pack([ "DOC_MAP_INDEX" => $diff_doc_map_index, "FREQUENCY" => $occurrences, "POSITIONS_OFFSET" => $diff_offset, "POSITIONS_LEN" => $len]); + /* multiple entries can be associated with the same term_id. + term_id => vbyte_encoded_num_entries entry1 \xFF entry2 ... + */ $postings_tools->add($this->postings, $term_id, $entry, PackedTableTools::ADD_MEM_TABLE, PackedTableTools::APPEND_MODE); $add_entry = $last_entries_tools->pack( @@ -1591,7 +1594,7 @@ class IndexDocumentBundle implements CrawlConstants $len_posting_strings = strlen($postings_string); for ($i = 0; $i < $num_items; $i++) { if (!isset($postings_string[$current_pos])) { - crawlLog("Posting decode error"); + crawlLog("Posting decode error - Start beyond posting"); crawlLog("..Number to decode items: " . $num_items); crawlLog("..Number decoded: " . $i); crawlLog("..Length posting string: " . 
@@ -1603,8 +1606,12 @@ class IndexDocumentBundle implements CrawlConstants $current_pos++; $len_unpack_info = $unpack_len_map[$int_info]; if ($current_pos + $len_unpack_info > $len_posting_strings) { - crawlLog("Posting decode error"); + crawlLog("Posting decode error -". + " Decode length longer than string"); + crawlLog(".. Decode Format Length was: " . $len_unpack_info); crawlLog("..Number to decode items: " . $num_items); + crawlLog("..Length needed to decode: " . + ($len_unpack_info * $num_items)); crawlLog("..Number decoded: " . $i); crawlLog("..Length posting string: " . strlen($postings_string)); @@ -1733,7 +1740,7 @@ class IndexDocumentBundle implements CrawlConstants */ public static function setArchiveInfo($dir_name, $update_info) { - $archive_info_path = $dir_name. "/" . self::ARCHIVE_INFO_FILE; + $archive_info_path = $dir_name . "/" . self::ARCHIVE_INFO_FILE; if (file_exists($archive_info_path)) { $info = self::getArchiveInfo($dir_name); } diff --git a/src/library/LSMTree.php b/src/library/LSMTree.php index 32c69e747..3ead91e5c 100644 --- a/src/library/LSMTree.php +++ b/src/library/LSMTree.php @@ -341,7 +341,8 @@ class LSMTree { $add_rows = $this->getTier($i, $key); if (is_array($add_rows)) { - $rows += $add_rows; + // use array_merge rather than + or get wrong results here + $rows = array_merge($rows, $add_rows); } if ($limit > 0 && count($rows) > $max_rows) { break; diff --git a/src/library/PackedTableTools.php b/src/library/PackedTableTools.php index 950d1580c..9244de6f8 100644 --- a/src/library/PackedTableTools.php +++ b/src/library/PackedTableTools.php @@ -237,7 +237,7 @@ class PackedTableTools case self::ADD_FILE_PATH: $separator = (fsize($table) > 0) ? "\xFF" : ""; $out = $separator . encode255($encode_key . $table_row); - return (file_put_contents($table, $out , FILE_APPEND) > 0); + return (file_put_contents($table, $out, FILE_APPEND) > 0); case self::ADD_FILE_HANDLE: $separator = (ftell($table) > 0) ? "\xFF" : ""; $out = $separator . 
encode255($encode_key . $table_row); diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php index 83581e3d0..61c400d31 100644 --- a/src/library/index_bundle_iterators/GroupIterator.php +++ b/src/library/index_bundle_iterators/GroupIterator.php @@ -208,7 +208,7 @@ class GroupIterator extends IndexBundleIterator $pages = -1; } } else if (!empty($new_pages)) { - $pages += $new_pages; + $pages = array_merge($pages, $new_pages); $count = count($pages); } if ($count < $this->results_per_block && !$done) { diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index a1dcb9bd0..f13d259ac 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -1007,7 +1007,8 @@ class WordIterator extends IndexBundleIterator empty($generation_info['POSTINGS_LEN'])) { $postings_entry = ""; } else { - $postings_entry = $index->getPostingsString($generation, + $postings_entry = $index->getPostingsString( + $generation_info['PARTITION'], $generation_info['POSTINGS_OFFSET'], $generation_info['POSTINGS_LEN']); } diff --git a/tests/PackedTableToolsTest.php b/tests/PackedTableToolsTest.php index b21dc6e1f..45a8175df 100644 --- a/tests/PackedTableToolsTest.php +++ b/tests/PackedTableToolsTest.php @@ -144,11 +144,15 @@ use seekquarry\yioop\library\UnitTest; ["PRIMARY KEY" => "ID", "A" => "INT", "B" => "TEXT", "C" => "INT"], C\NS_COMPRESSORS . 
"GzipCompressor"); $table = []; - $hash_key = md5("1", true); - $table_factory->add($table, $hash_key, - $table_factory->pack(["A" => 5, "B" => "Hello World", "C"=> 256])); - $hash_key = md5("2", true); - $table_factory->add($table, $hash_key, + $hash_key1 = md5("1", true); + $table_factory->add($table, $hash_key1, + $table_factory->pack(["A" => 5, "B" => "Hello World", "C"=> 256]), + mode: L\PackedTableTools::APPEND_MODE); + $table_factory->add($table, $hash_key1, + $table_factory->pack(["A" => 6, "B" => "Hello World2", "C"=> 257]), + mode: L\PackedTableTools::APPEND_MODE); + $hash_key2 = md5("2", true); + $table_factory->add($table, $hash_key2, $table_factory->pack(["A" => 20000, "B" => "laladida", "C"=> 5600])); $table_factory->save(self::TEST_DIR . "/save.txt", $table); @@ -159,5 +163,10 @@ use seekquarry\yioop\library\UnitTest; $loaded_table2 = $table_factory2->load(self::TEST_DIR . "/save2.txt"); $this->assertEqual($table, $loaded_table2, "Add two rows, save, load compressed table gives same result."); + $entry1 = $loaded_table[$hash_key1]; + $start = 0; + $num_entries = L\vByteDecode($entry1, $start); + $this->assertEqual($num_entries, 2, + "Row one has correct number of sub-entries."); } }