viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Fixes bug in getting timestamps from log line, modifies indexer memory constraint checks, a=chris

Chris Pollett [2019-06-19 22:Jun:th]
Fixes bug in getting timestamps from log line, modifies indexer memory constraint checks, a=chris
Filename
src/executables/QueueServer.php
src/library/IndexArchiveBundle.php
src/library/Utility.php
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 7f2d726d8..cf717c194 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1290,9 +1290,23 @@ class QueueServer implements CrawlConstants, Join
             $this->updateDisallowedQuotaSites();
         }
         $this->initializeWebQueue();
-        if(empty($info[self::REPEAT_TYPE]) || $info[self::REPEAT_TYPE] < 0) {
+        $this->initializeIndexBundle($info);
+        $info[self::STATUS] = self::CONTINUE_STATE;
+        return $info;
+    }
+    /**
+     * Function used to set up an indexer's IndexArchiveBundle or
+     * DoubleIndexBundle according to the current crawl parameters or
+     * the values stored in an existing bundle.
+     *
+     * @param array $info if initializing a new crawl this should contain
+     *      the crawl parameters
+     */
+    public function initializeIndexBundle($info = [])
+    {
+        if(empty($this->repeat_type) || $this->repeat_type < 0) {
             $class_name = C\NS_LIB . "IndexArchiveBundle";
-            $dir = C\CRAWL_DIR.'/cache/' . self::index_data_base_name .
+            $dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name .
                 $this->crawl_time;
         } else {
             $class_name = C\NS_LIB . "DoubleIndexBundle";
@@ -1320,8 +1334,7 @@ class QueueServer implements CrawlConstants, Join
                    (might take a while if merging dictionary)
                  */
                 $this->writeCrawlStatus($sites);
-            } else if (!empty($info[self::REPEAT_TYPE]) &&
-                $info[self::REPEAT_TYPE] >= 0) {
+            } else if (!empty($this->repeat_type) && $this->repeat_type >= 0) {
                 $this->index_archive = new $class_name($dir, false,
                     serialize($info), C\NUM_DOCS_PER_GENERATION,
                     $info[self::REPEAT_TYPE]);
@@ -1341,8 +1354,6 @@ class QueueServer implements CrawlConstants, Join
         //Get modified time of initial setting of crawl params
         $this->archive_modified_time =
             $class_name::getParamModifiedTime($dir);
-        $info[self::STATUS] = self::CONTINUE_STATE;
-        return $info;
     }
     /**
      * This is called whenever the crawl options are modified to parse
@@ -1842,19 +1853,19 @@ class QueueServer implements CrawlConstants, Join
                 unset($seen_sites);
             }
             L\crawlLog("C. Indexer init local shard, store ".
-                "Summaries memory usage:". memory_get_usage() .
+                "Summaries memory usage: ". memory_get_usage() .
                 " time: " . L\changeInMicrotime($start_time));
             $start_time = microtime(true);
             // added summary offset info to inverted index data
             $index_shard->changeDocumentOffsets($summary_offsets);
-            L\crawlLog("D. Indexer Update shard offsets. Memory usage:".
+            L\crawlLog("D. Indexer Update shard offsets. Memory usage: ".
                 memory_get_usage() . " time: " .
                 L\changeInMicrotime($start_time));
             $start_time = microtime(true);
             $this->index_archive->addIndexData($index_shard);
             $this->index_dirty = true;
         }
-        L\crawlLog("E. Indexer Add index shard. Memory usage:".
+        L\crawlLog("E. Indexer Add index shard. Memory usage: ".
             memory_get_usage() . " time: " .
             L\changeInMicrotime($start_time));
         L\crawlLog("Indexer Done Index Processing File: $file. Total time: ".
@@ -1867,6 +1878,38 @@ class QueueServer implements CrawlConstants, Join
             //Haven't tracked down yet, but can try to delete twice giving warn
             unlink($file);
         }
+        $this->constrainIndexerMemoryUsage();
+    }
+    /**
+     * Tries to prevent Indexer from crashing due to excessive memory use.
+     * If Indexer is using more than .7 of its allowed memory, saves the
+     * index bundle to disk, frees it, then reloads it; exits if that does
+     * not help. Skipped when memory_limit is not positive (-1 = no limit).
+     */
+    public function constrainIndexerMemoryUsage()
+    {
+        $memory_limit = L\metricToInt(ini_get("memory_limit"));
+        $threshold = 0.7 * $memory_limit;
+        $current_usage = memory_get_usage();
+        if ($memory_limit > 0 && $threshold < $current_usage) {
+            L\crawlLog("Indexer memory usage threshold exceeded!!!");
+            L\crawlLog("...Threshold is: " . $threshold);
+            L\crawlLog("...Current usage is: " . $current_usage);
+            L\crawlLog("...Trying to free memory by resetting index bundle.");
+            $this->index_archive->forceSave();
+            $this->index_archive = null;
+            $num_freed = L\garbageCollect();
+            L\crawlLog("...Indexer force running garbage collector " .
+                "after reset. This freed " . $num_freed . " bytes.");
+            $this->initializeIndexBundle();
+            $current_usage = memory_get_usage();
+            L\crawlLog("Done index bundle reset, current memory usage is: ".
+                $current_usage);
+            if ($threshold < $current_usage) {
+                L\crawlLog("!!!Usage still exceeds threshold, exiting");
+                exit();
+            }
+        }
+    }
     /**
      * Checks how old the oldest robot data is and dumps if older then a
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index 28fd86628..a294522f4 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -223,8 +223,8 @@ class IndexArchiveBundle implements CrawlConstants
         $before_usage = memory_get_usage();
         crawlLog("Indexer Memory  limit is " . $memory_limit . ". Usage is ".
             $before_usage);
-        if ($current_num_docs + $add_num_docs > $this->num_docs_per_generation
-            || (0.65 * $memory_limit) < $before_usage ) {
+        if ($current_num_docs + $add_num_docs >
+            $this->num_docs_per_generation) {
             if ($blocking == true) {
                 return -1;
             }
@@ -233,16 +233,11 @@ class IndexArchiveBundle implements CrawlConstants
             // Save current shard dictionary to main dictionary
             $this->forceSave();
             $this->addAdvanceGeneration($callback);
-            $num_freed= garbageCollect();
+            $num_freed = garbageCollect();
             crawlLog("Indexer force running garbage collector after generation".
                  " advance. This freed " . $num_freed . " bytes.");
             $after_usage = memory_get_usage();
             crawlLog("Indexer after switch memory usage: $after_usage");
-            if ((0.65 * $memory_limit) < $after_usage) {
-                crawlLog("Index Shard Switching did not free sufficiently ".
-                    "memory, exiting");
-                exit();
-            }
             crawlLog("Switch Index Shard time:".
                 changeInMicrotime($switch_time));
         }
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 809df4bc2..56e95a2e5 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -2233,7 +2233,7 @@ function lineFilter($lines, $filters)
  */
 function logLineTimestamp($line)
 {
-    preg_match("/^\s*\[(.*)\]/", $line, $matches);
+    preg_match("/^\s*\[\d+\s+(.*)\]/", $line, $matches);
     if (isset($matches[1])) {
         return @strtotime($matches[1]);
     }
ViewGit