viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Try to reduce memory footprint of saving without dictionary, a=chris

Chris Pollett [2018-06-18 22:Jun:th]
Try to reduce memory footprint of saving without dictionary, a=chris
Filename
src/executables/ArcTool.php
src/library/IndexArchiveBundle.php
src/library/IndexDictionary.php
src/library/IndexShard.php
src/locale/en_US/resources/Tokenizer.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index f498bddde..d4c18e2a5 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -590,7 +590,7 @@ class ArcTool implements CrawlConstants
                     $shard = new IndexShard($shard_name, $i,
                         C\NUM_DOCS_PER_GENERATION, true);
                     if ($dictionary->addShardDictionary($shard)) {
-                        $shard->saveWithoutDictionary();
+                        $shard->saveWithoutDictionary(true);
                         file_put_contents($shard_count_file, $i + 1);
                     } else {
                         echo "Problem adding shard $i";
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index eba02a888..de31ad370 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -252,8 +252,9 @@ class IndexArchiveBundle implements CrawlConstants
     public function addAdvanceGeneration($callback = null)
     {
         $this->addCurrentShardDictionary($callback);
-        echo "Resaving active shard without prefix and dictionary etc\n";
-        $this->getActiveShard()->saveWithoutDictionary();
+        crawlLog("Resaving active shard without prefix and dictionary.");
+        $this->getActiveShard()->saveWithoutDictionary(true);
+        crawlLog("..Done resaving active shard.");
         //Set up new shard
         $this->generation_info['ACTIVE']++;
         $this->generation_info['CURRENT'] =
@@ -263,7 +264,7 @@ class IndexArchiveBundle implements CrawlConstants
         $this->current_shard = new IndexShard(
             $current_index_shard_file, $this->generation_info['ACTIVE'],
                 $this->num_docs_per_generation);
-        file_put_contents($this->dir_name."/generation.txt",
+        file_put_contents($this->dir_name . "/generation.txt",
             serialize($this->generation_info));
     }
     /**
diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php
index 2dd219f0e..2760d11fa 100644
--- a/src/library/IndexDictionary.php
+++ b/src/library/IndexDictionary.php
@@ -1064,7 +1064,17 @@ class IndexDictionary implements CrawlConstants
     }
     /**
      * Adds auxiliary records for a given word id if after merging info for
-     * a given word id can't be stored in a single record
+     * a given word id can't be stored in a single record.
+     * A typical dictionary entry consists of a 20 byte word id, followed
+     * by the 4 byte ints generation, offset, and length of the posting lists
+     * in that generation. If the high bit of the prefix characters in the
+     * word id is flipped, it indicates the presence of auxiliary records for
+     * that word id, in which case bytes 1 and 2 of the generation code the
+     * number of auxiliary records there will be for this word id.
+     * An auxiliary record is 32 bytes long beginning with a bit of the current
+     * high prefix letter, followed by a 15 bit code of which aux record in
+     * the sequence of aux records for this word id it is, followed by three
+     * 10 byte records (2 byte generation, 4 byte offset, 4 byte len).
      *
      * @param string $id word id to add aux records for
      * @param int $file_num which prefix file to read from (always reads
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index 4dcedaa7b..a5956eba9 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -1299,10 +1299,16 @@ class IndexShard extends PersistentStructure implements
      * This method re-saves a saved shard without the prefixes and dictionary.
      * It would typically be called after this information has been stored
      * in an IndexDictionary obbject so that the data is not redundantly stored
+     * @param bool $with_logging whether log messages should be written
+     *     as the shard save progresses
      */
-    public function saveWithoutDictionary()
+    public function saveWithoutDictionary($with_logging = false)
     {
         $this->getShardHeader();
+        if($with_logging) {
+            crawlLog("Opening without dictionary version of shard to write...");
+        }
+        $fh = fopen($this->filename . "-tmp", "wb");
         $header =  pack("N*", 0, 0,
             $this->word_docs_len,
             $this->docids_len,
@@ -1312,13 +1318,31 @@ class IndexShard extends PersistentStructure implements
             $this->num_link_docs,
             $this->len_all_docs,
             $this->len_all_link_docs);
-        $word_docs = $this->getWordDocsSubstring();
-        $doc_infos = $this->getDocInfoSubstring();
-        $fh = fopen($this->filename, "wb");
         fwrite($fh, $header);
+        if($with_logging) {
+            crawlLog("..without dictionary version of shard header written");
+        }
+        $header = "";
+        $word_docs = $this->getWordDocsSubstring();
         fwrite($fh, $word_docs);
+        if($with_logging) {
+            crawlLog("..without dictionary version of shard word docs written");
+        }
+        $word_docs = "";
+        $doc_infos = $this->getDocInfoSubstring();
         fwrite($fh, $doc_infos);
+        if($with_logging) {
+            crawlLog("..without dictionary version of shard doc infos written");
+        }
         fclose($fh);
+        $doc_infos = "";
+        if (file_exists($this->filename . "-tmp")) {
+            unlink($this->filename);
+            rename($this->filename . "-tmp", $this->filename);
+        }
+        if($with_logging) {
+            crawlLog("done replacing version of shard.");
+        }
     }
     /**
      * Computes the prefix string index for the current words array.
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index c1a18ede7..2b714bbb1 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -279,7 +279,6 @@ class Tokenizer
      */
    public static function tagTokenizePartOfSpeech($text)
     {
-        static $dictionary = [];
         static $dictionary = [];
         $lexicon_file = C\LOCALE_DIR . "/en_US/resources/lexicon.txt.gz";
         if (empty($dictionary)) {
ViewGit