viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Fix bug in Indexer restart code, add code to see if can find mem leak in Indexer, a=chris

Chris Pollett [2019-06-17 04:Jun:th]
Fix bug in Indexer restart code, add code to see if can find mem leak in Indexer, a=chris
Filename
src/executables/QueueServer.php
src/library/IndexArchiveBundle.php
src/library/IndexDictionary.php
src/library/IndexShard.php
src/library/Utility.php
src/library/WebArchive.php
src/library/WebArchiveBundle.php
src/locale/zh_CN/resources/segment_word_grams.ftr
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index f5f8e20e4..8c83094b8 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -502,7 +502,7 @@ class QueueServer implements CrawlConstants, Join
         L\crawlLog("Checking if both processes still running ...");
         $lines_to_check = C\LOG_LINES_TO_RESTART;
             //about 20-30 minutes of log data
-        $lines = L\tail(C\LOG_DIR."/" . $this->process_name . ".log",
+        $lines = L\tail(C\LOG_DIR . "/" . $this->process_name . ".log",
             $lines_to_check);
         L\crawlLog("...Got " . $this->process_name . ".log lines");
         if (count($lines) < $lines_to_check) {
@@ -522,10 +522,14 @@ class QueueServer implements CrawlConstants, Join
                 L\logLineTimestamp($process_lines[$num_lines - 1]);
             L\crawlLog("...Timestamp of last processed line: ".
                 $timestamp);
-            if ($timestamp > 1408934838) { //rules out false for timestamp
+            if (is_numeric($timestamp)) {
+                /*
+                   Note if 0 then we have seen LOG_LINES_TO_RESTART lines
+                   with no message from other process
+                 */
                 $last_process_timestamp = $timestamp;
                 L\crawlLog("...seems to be a valid timestep, so using.");
-            } else {
+            } else { //hopefully doesn't occur, maybe log file garbled
                 L\crawlLog("...invalid timestep, so using current time.");
             }
         }
@@ -542,6 +546,11 @@ class QueueServer implements CrawlConstants, Join
         foreach ($process_lines as $line) {
             $out_msg .= "!!!!$line\n";
         }
+        if (empty($process_lines)) {
+            $out_msg .= "!!!!No messages seen this log file!\n";
+            $out_msg .= "!!!!$process must have died before: " .
+                L\logLineTimestamp($lines[0]);
+        }
         $error_log = C\CRASH_LOG_NAME;
         if (!file_exists($error_log) || filesize($error_log)>
             C\MAX_LOG_FILE_SIZE) {
@@ -689,7 +698,9 @@ class QueueServer implements CrawlConstants, Join
         }
         switch ($this->crawl_type) {
             case self::WEB_CRAWL:
-                if ($this->isOnlyIndexer()) { return; }
+                if ($this->isOnlyIndexer()) {
+                    return;
+                }
                 $this->processRobotUrls();
                 if (C\USE_ETAG_EXPIRES) {
                     $this->processEtagExpires();
@@ -1222,7 +1233,7 @@ class QueueServer implements CrawlConstants, Join
             $this->index_dirty = false;
             // chmod so apache can also write to these directories
             $this->db->setWorldPermissionsRecursive(
-                C\CRAWL_DIR.'/cache/'. $base_name . $this->crawl_time);
+                C\CRAWL_DIR.'/cache/' . $base_name . $this->crawl_time);
         }
     }
     /**
@@ -1611,9 +1622,9 @@ class QueueServer implements CrawlConstants, Join
     public function processDataFile($base_dir, $callback_method,
         $blocking = false)
     {
-        $dirs = glob($base_dir.'/*', GLOB_ONLYDIR);
+        $dirs = glob($base_dir . '/*', GLOB_ONLYDIR);
         foreach ($dirs as $dir) {
-            $files = glob($dir.'/*.txt');
+            $files = glob($dir . '/*.txt');
             if (isset($old_dir)) {
                 L\crawlLog("Deleting $old_dir\n");
                 $this->db->unlinkRecursive($old_dir);
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index 6ef729cd2..680d8edd0 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -217,7 +217,8 @@ class IndexArchiveBundle implements CrawlConstants
         $blocking = false)
     {
         $current_num_docs = $this->getActiveShard()->num_docs;
-        crawlLog("Current index shard has ".$current_num_docs." documents.");
+        crawlLog("Current index shard has " . $current_num_docs .
+            " documents.");
         $memory_limit = metricToInt(ini_get("memory_limit"));
         crawlLog("Memory Indexer limit is ".$memory_limit.". Usage is ".
             memory_get_usage());
@@ -231,6 +232,10 @@ class IndexArchiveBundle implements CrawlConstants
             // Save current shard dictionary to main dictionary
             $this->forceSave();
             $this->addAdvanceGeneration($callback);
+            $num_freed = gc_collect_cycles();
+            crawlLog("Indexer force running garbage collector after generation".
+                 "advance. This freed " . $num_freed . " bytes.");
+
             crawlLog("Switch Index Shard time:".
                 changeInMicrotime($switch_time));
         }
@@ -257,7 +262,7 @@ class IndexArchiveBundle implements CrawlConstants
         $this->generation_info['CURRENT'] =
             $this->generation_info['ACTIVE'];
         $current_index_shard_file = $this->dir_name.
-            "/posting_doc_shards/index". $this->generation_info['ACTIVE'];
+            "/posting_doc_shards/index" . $this->generation_info['ACTIVE'];
         $this->current_shard = new IndexShard(
             $current_index_shard_file, $this->generation_info['ACTIVE'],
                 $this->num_docs_per_generation);
@@ -436,11 +441,11 @@ class IndexArchiveBundle implements CrawlConstants
      */
     public static function getArchiveInfo($dir_name)
     {
-        if (file_exists($dir_name."/arc_description.txt")) {
+        if (file_exists($dir_name . "/arc_description.txt")) {
             $crawl = [];
             $info = [];
             $crawl['DESCRIPTION'] = substr(
-                file_get_contents($dir_name."/arc_description.txt"), 0, 256);
+                file_get_contents($dir_name . "/arc_description.txt"), 0, 256);
             $crawl['ARCFILE'] = true;
             $info['VISITED_URLS_COUNT'] = 0;
             $info['COUNT'] = 0;
diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php
index 146aa08ab..f362ee882 100644
--- a/src/library/IndexDictionary.php
+++ b/src/library/IndexDictionary.php
@@ -293,7 +293,7 @@ class IndexDictionary implements CrawlConstants
             $tier++;
             if ($tier > $this->max_tier) {
                 $this->max_tier = $tier;
-                file_put_contents($this->dir_name."/max_tier.txt",
+                file_put_contents($this->dir_name . "/max_tier.txt",
                     serialize($this->max_tier));
             }
         }
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index 9b20967a6..0df6a9492 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -677,7 +677,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         $total_posting_len = 0;
         $num_postings_so_far = 0;
         do {
-            if ($next > $end) {break;}
+            if ($next > $end) {
+                break;
+            }
             $posting_start = $next;
             $posting = $this->getPostingAtOffset(
                 $next, $posting_start, $posting_end);
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 2e5f18156..1065829ca 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -332,7 +332,9 @@ function addDocIndexPostings(&$postings, $add_offset)
             !($tmp = unpack("N*", $post_string))) {continue; }
         $posting_list = call_user_func_array("array_merge",
             array_map(C\NS_LIB . "unpackListModified9", $tmp));
-        if (!is_array($posting_list)) { continue; }
+        if (!is_array($posting_list)) {
+            continue;
+        }
         $doc_index = array_shift($posting_list);
         if (($doc_index & (2 << 26)) > 0) {
             $post0 = ($doc_index & ((2 << 9) - 1));
diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php
index 0d0ce8f10..b1cfdcd20 100755
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@@ -181,7 +181,7 @@ class WebArchive
         $len = strlen($info_string) + $compressed_int_len;
         $offset = ftell($fh);
         ftruncate($fh, $offset);
-        $out = $info_string.$this->compressor->compressInt($len);
+        $out = $info_string . $this->compressor->compressInt($len);
         fwrite($fh, $out, $len);
         if ($open_flag) {
             fclose($fh);
diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php
index a70e5a7ff..4233d9200 100755
--- a/src/library/WebArchiveBundle.php
+++ b/src/library/WebArchiveBundle.php
@@ -191,8 +191,8 @@ class WebArchiveBundle
         return $this->write_partition;
     }
     /**
-     * Advances the index of the write partition by one and creates the
-     * corresponding web archive.
+     * Sets the write partition to the provided value and, if this is not
+     * a read-only archive, stores this value persistently to archive info
      *
      * @param int $i the number of the current write partition
      */
@@ -200,6 +200,10 @@ class WebArchiveBundle
     {
         $this->write_partition = $i;
         if (!$this->read_only_archive) {
+            /* clear the partition array just to avoid memory leak in
+                crawling setting
+             */
+            $this->partition = [];
             $info = $this->getArchiveInfo($this->dir_name);
             $info['WRITE_PARTITION'] = $this->write_partition;
             $this->setArchiveInfo($this->dir_name, $info);
@@ -246,7 +250,7 @@ class WebArchiveBundle
             $create_flag = false;
             $compressor = C\NS_LIB . "compressors\\" . $this->compressor;
             $compressor_obj = new $compressor();
-            $archive_name = $this->dir_name."/web_archive_" . $index
+            $archive_name = $this->dir_name . "/web_archive_" . $index
                 . $compressor_obj->fileExtension();
             if (!file_exists($archive_name)) {
                 $create_flag = true;
diff --git a/src/locale/zh_CN/resources/segment_word_grams.ftr b/src/locale/zh_CN/resources/segment_word_grams.ftr
old mode 100755
new mode 100644
index c39e16021..0a49a222a
Binary files a/src/locale/zh_CN/resources/segment_word_grams.ftr and b/src/locale/zh_CN/resources/segment_word_grams.ftr differ
ViewGit