viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/executables/ArcTool.php | |
src/library/IndexArchiveBundle.php | |
src/library/IndexDictionary.php | |
src/library/IndexShard.php | |
src/locale/en_US/resources/Tokenizer.php | |
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index f498bddde..d4c18e2a5 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -590,7 +590,7 @@ class ArcTool implements CrawlConstants $shard = new IndexShard($shard_name, $i, C\NUM_DOCS_PER_GENERATION, true); if ($dictionary->addShardDictionary($shard)) { - $shard->saveWithoutDictionary(); + $shard->saveWithoutDictionary(true); file_put_contents($shard_count_file, $i + 1); } else { echo "Problem adding shard $i"; diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index eba02a888..de31ad370 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -252,8 +252,9 @@ class IndexArchiveBundle implements CrawlConstants public function addAdvanceGeneration($callback = null) { $this->addCurrentShardDictionary($callback); - echo "Resaving active shard without prefix and dictionary etc\n"; - $this->getActiveShard()->saveWithoutDictionary(); + crawlLog("Resaving active shard without prefix and dictionary."); + $this->getActiveShard()->saveWithoutDictionary(true); + crawlLog("..Done resaving active shard."); //Set up new shard $this->generation_info['ACTIVE']++; $this->generation_info['CURRENT'] = @@ -263,7 +264,7 @@ class IndexArchiveBundle implements CrawlConstants $this->current_shard = new IndexShard( $current_index_shard_file, $this->generation_info['ACTIVE'], $this->num_docs_per_generation); - file_put_contents($this->dir_name."/generation.txt", + file_put_contents($this->dir_name . 
"/generation.txt", serialize($this->generation_info)); } /** diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php index 2dd219f0e..2760d11fa 100644 --- a/src/library/IndexDictionary.php +++ b/src/library/IndexDictionary.php @@ -1064,7 +1064,17 @@ class IndexDictionary implements CrawlConstants } /** * Adds auxiliary records for a given word id if after merging info for - * a given word id can't be stored in a single record + * a given word id can't be stored in a single record. + * A typical dictionary entry consists of a 20 byte word id, followed + * by the 4 bytes ints generation, offset, and length of the posting lists + * in that generation. If the high bit of the prefix characters in the + * word id are flipped, it indicates the presence of auxiliary records for + * that word id. In which case bytes 1, and 2 of the generation, code the + * number of auxiliary records there will be for this word id. + * An auxiliary record is 32 bytes long beginning with a bit of the current + * high prefix letter, followed by a 15 bit code of which aux record in + * the sequence of aux records for this word id it is, followed by three + * 10 byte 2byte generation, 4 byte offset, 4 byte len records. * * @param string $id word id to add aux records for * @param int $file_num which prefix file to read from (always reads diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 4dcedaa7b..a5956eba9 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -1299,10 +1299,16 @@ class IndexShard extends PersistentStructure implements * This method re-saves a saved shard without the prefixes and dictionary. 
* It would typically be called after this information has been stored * in an IndexDictionary obbject so that the data is not redundantly stored + * @param bool $with_logging whether log messages should be written + * as the shard save progresses */ - public function saveWithoutDictionary() + public function saveWithoutDictionary($with_logging = false) { $this->getShardHeader(); + if($with_logging) { + crawlLog("Opening without dictionary version of shard to write..."); + } + $fh = fopen($this->filename . "-tmp", "wb"); $header = pack("N*", 0, 0, $this->word_docs_len, $this->docids_len, @@ -1312,13 +1318,31 @@ class IndexShard extends PersistentStructure implements $this->num_link_docs, $this->len_all_docs, $this->len_all_link_docs); - $word_docs = $this->getWordDocsSubstring(); - $doc_infos = $this->getDocInfoSubstring(); - $fh = fopen($this->filename, "wb"); fwrite($fh, $header); + if($with_logging) { + crawlLog("..without dictionary version of shard header written"); + } + $header = ""; + $word_docs = $this->getWordDocsSubstring(); fwrite($fh, $word_docs); + if($with_logging) { + crawlLog("..without dictionary version of shard word docs written"); + } + $word_docs = ""; + $doc_infos = $this->getDocInfoSubstring(); fwrite($fh, $doc_infos); + if($with_logging) { + crawlLog("..without dictionary version of shard doc infos written"); + } fclose($fh); + $doc_infos = ""; + if (file_exists($this->filename . "-tmp")) { + unlink($this->filename); + rename($this->filename . "-tmp", $this->filename); + } + if($with_logging) { + crawlLog("done replacing version of shard."); + } } /** * Computes the prefix string index for the current words array. 
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index c1a18ede7..2b714bbb1 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -279,7 +279,6 @@ class Tokenizer */ public static function tagTokenizePartOfSpeech($text) { - static $dictionary = []; static $dictionary = []; $lexicon_file = C\LOCALE_DIR . "/en_US/resources/lexicon.txt.gz"; if (empty($dictionary)) {