viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

More attempts to track down Indexer memory leak, a-chris

Chris Pollett [2019-06-18]
More attempts to track down Indexer memory leak, a-chris
Filename
src/executables/Fetcher.php
src/executables/QueueServer.php
src/library/BloomFilterBundle.php
src/library/IndexArchiveBundle.php
src/library/IndexDictionary.php
src/library/UrlParser.php
src/library/Utility.php
src/library/WebArchive.php
src/library/WebArchiveBundle.php
src/library/WebQueueBundle.php
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 0ddf7436a..9750c4475 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -687,7 +687,7 @@ class Fetcher implements CrawlConstants
                 }
                 $this->to_crawl_again = [];
                 $this->found_sites = [];
-                gc_collect_cycles();
+                L\garbageCollect();
                 $this->web_archive = new WebArchiveBundle($tmp_base_name,
                     false);
                 $this->crawl_time = $info[self::CRAWL_TIME];
@@ -2588,7 +2588,7 @@ class Fetcher implements CrawlConstants
                 $current_server];
             L\crawlLog(".....add inverted index string.");
             unset($this->found_sites[self::INVERTED_INDEX][$current_server]);
-            gc_collect_cycles();
+            L\garbageCollect();
             $data = L\webencode($out_string);
             L\crawlLog(".....web encode result.");
                 // don't compress index data
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index de76315be..7f2d726d8 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -683,7 +683,7 @@ class QueueServer implements CrawlConstants, Join
              */
             $this->deleteOrphanedBundles();
             $this->processIndexData($blocking);
-            if (time() - $this->last_index_save_time > C\FORCE_SAVE_TIME){
+            if (time() - $this->last_index_save_time > C\FORCE_SAVE_TIME) {
                 L\crawlLog("Periodic Index Save... \n");
                 $start_time = microtime(true);
                 $this->indexSave();
@@ -1281,7 +1281,10 @@ class QueueServer implements CrawlConstants, Join
             }
         }
         /* We now do further processing or disallowed sites to see if any
-           of them are really quota sites
+           of them are really quota sites. We want both indexer and scheduler
+           to be aware of the changes as the indexer is responsible for
+           storing the values persistently into the IndexArchiveBundle
+           in case we need to resume a crawl.
          */
         if ($update_disallow == true) {
             $this->updateDisallowedQuotaSites();
@@ -1409,7 +1412,7 @@ class QueueServer implements CrawlConstants, Join
         } else {
             $this->index_archive = null;
         }
-        gc_collect_cycles(); // garbage collect old crawls
+        L\garbageCollect(); // garbage collect old crawls
     }
     /**
      * Delete all the urls from the web queue does not affect filters
@@ -1491,7 +1494,10 @@ class QueueServer implements CrawlConstants, Join
             }
         }
         /* We now do further processing or disallowed sites to see if any
-           of them are really quota sites
+           of them are really quota sites. We want both indexer and scheduler
+           to be aware of the changes as the indexer is responsible for
+           storing the values persistently into the IndexArchiveBundle
+           in case we need to resume a crawl.
          */
         if ($update_disallow == true) {
             $this->updateDisallowedQuotaSites();
@@ -1794,7 +1800,7 @@ class QueueServer implements CrawlConstants, Join
                     L\crawlHash($link_url_parts[1], true)
                     . L\crawlHash($seen_sites[$i][self::URL], true)
                     . $reftype . substr(L\crawlHash(
-                      UrlParser::getHost($link_url_parts[5])."/", true), 1);
+                      UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
                 $seen_sites[$i][self::IS_DOC] = false;
             } else {
                 $seen_sites[$i][self::IS_DOC] = true;
@@ -1822,7 +1828,7 @@ class QueueServer implements CrawlConstants, Join
                     $generation, self::SUMMARY_OFFSET, $seen_sites,
                     $visited_urls_count);
                 foreach ($seen_sites as $site) {
-                    if ($site[self::IS_DOC]){ // so not link
+                    if ($site[self::IS_DOC]) { // so not link
                         $site_url = str_replace('|', "%7C", $site[self::URL]);
                         $host = UrlParser::getHost($site_url);
                         $hash = L\crawlHash($site_url, true).
diff --git a/src/library/BloomFilterBundle.php b/src/library/BloomFilterBundle.php
index 05b885947..1cdd0e989 100644
--- a/src/library/BloomFilterBundle.php
+++ b/src/library/BloomFilterBundle.php
@@ -30,6 +30,10 @@
  */
 namespace seekquarry\yioop\library;

+/**
+ * Used for garbageCollect
+ */
+require_once __DIR__ . '/Utility.php';
 /**
  *
  * A BloomFilterBundle is a directory of BloomFilterFile.
@@ -97,7 +101,7 @@ class BloomFilterBundle
         } else {
             $last_filter = $this->num_filters - 1;
             $this->current_filter =
-                BloomFilterFile::load($dir_name."/filter_$last_filter.ftr");
+                BloomFilterFile::load($dir_name . "/filter_$last_filter.ftr");
         }
     }
     /**
@@ -113,7 +117,7 @@ class BloomFilterBundle
         if ($this->current_filter_count >= $this->filter_size) {
             $this->current_filter->save();
             $this->current_filter = null;
-            gc_collect_cycles();
+            garbageCollect();
             $last_filter = $this->num_filters;
             $this->current_filter =
                 new BloomFilterFile($this->dir_name."/filter_$last_filter.ftr",
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index 680d8edd0..28fd86628 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -33,9 +33,9 @@ namespace seekquarry\yioop\library;
 use seekquarry\yioop\configs as C;

 /**
- * Used for crawlLog and crawlHash
+ * Used for crawlLog, crawlHash, and garbageCollect
  */
-require_once __DIR__.'/Utility.php';
+require_once __DIR__ . '/Utility.php';
 /**
  * Encapsulates a set of web page summaries and an inverted word-index of terms
  * from these summaries which allow one to search for summaries containing a
@@ -220,10 +220,11 @@ class IndexArchiveBundle implements CrawlConstants
         crawlLog("Current index shard has " . $current_num_docs .
             " documents.");
         $memory_limit = metricToInt(ini_get("memory_limit"));
-        crawlLog("Memory Indexer limit is ".$memory_limit.". Usage is ".
-            memory_get_usage());
+        $before_usage = memory_get_usage();
+        crawlLog("Indexer Memory  limit is " . $memory_limit . ". Usage is ".
+            $before_usage);
         if ($current_num_docs + $add_num_docs > $this->num_docs_per_generation
-            || (0.55 * $memory_limit) < memory_get_usage() ) {
+            || (0.65 * $memory_limit) < $before_usage ) {
             if ($blocking == true) {
                 return -1;
             }
@@ -232,10 +233,16 @@ class IndexArchiveBundle implements CrawlConstants
             // Save current shard dictionary to main dictionary
             $this->forceSave();
             $this->addAdvanceGeneration($callback);
-            $num_freed = gc_collect_cycles();
+            $num_freed= garbageCollect();
             crawlLog("Indexer force running garbage collector after generation".
-                 "advance. This freed " . $num_freed . " bytes.");
-
+                 " advance. This freed " . $num_freed . " bytes.");
+            $after_usage = memory_get_usage();
+            crawlLog("Indexer after switch memory usage: $after_usage");
+            if ((0.65 * $memory_limit) < $after_usage) {
+                crawlLog("Index Shard Switching did not free sufficiently ".
+                    "memory, exiting");
+                exit();
+            }
             crawlLog("Switch Index Shard time:".
                 changeInMicrotime($switch_time));
         }
diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php
index f362ee882..5f7bd3c48 100644
--- a/src/library/IndexDictionary.php
+++ b/src/library/IndexDictionary.php
@@ -33,7 +33,7 @@ namespace seekquarry\yioop\library;
 use seekquarry\yioop\configs as C;

 /** For Yioop global defines */
-require_once __DIR__."/../configs/Config.php";
+require_once __DIR__ . "/../configs/Config.php";
 /**
  * Data structure used to store for entries of the form:
  * word id, index shard generation, posting list offset, and length of
@@ -167,7 +167,7 @@ class IndexDictionary implements CrawlConstants
             $this->max_tier = 0;
         } else {
             $this->max_tier = unserialize(
-                file_get_contents($this->dir_name."/max_tier.txt"));
+                file_get_contents($this->dir_name . "/max_tier.txt"));
             $this->calculateActiveTiers();
         }
         $this->parent_archive_bundle = $parent_archive_bundle;
@@ -182,7 +182,7 @@ class IndexDictionary implements CrawlConstants
     public function calculateActiveTiers()
     {
         $this->read_tier = $this->max_tier;
-        $tiers = glob($this->dir_name."/0/*A.dic");
+        $tiers = glob($this->dir_name . "/0/*A.dic");
         natsort($tiers);
         $this->active_tiers = [];
         foreach ($tiers as $tier) {
@@ -328,8 +328,8 @@ class IndexDictionary implements CrawlConstants
      */
     public function mergeTierFiles($prefix, $tier, $out_slot)
     {
-        $file_a = $this->dir_name."/$prefix/$tier"."A.dic";
-        $file_b = $this->dir_name."/$prefix/$tier"."B.dic";
+        $file_a = $this->dir_name . "/$prefix/$tier"."A.dic";
+        $file_b = $this->dir_name . "/$prefix/$tier"."B.dic";
         $size_a = filesize($file_a);
         $size_b = filesize($file_b);
         $prefix_header_size = self::PREFIX_HEADER_SIZE;
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index e00dbf893..b2a8eb17a 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -772,7 +772,6 @@ class UrlParser
         static $uncomputed = true;
         $host = UrlParser::getHost($url, false);
         if ($uncomputed) {
-            $localhosts = ["localhost", "127.0.0.1", "::1"];
             if (isset($_SERVER["SERVER_NAME"])) {
                 $localhosts[] = $_SERVER["SERVER_NAME"];
                 $localhosts[] = gethostbyname($_SERVER["SERVER_NAME"]);
@@ -780,7 +779,7 @@ class UrlParser
             if (isset($_SERVER["SERVER_ADDR"])) {
                 $localhosts[] = $_SERVER["SERVER_ADDR"];
             }
-            $uncomputed = true;
+            $uncomputed = false;
         }
         foreach ($localhosts as $localhost) {
             if (stristr($host, $localhost)) {
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 541ed1138..809df4bc2 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -2372,3 +2372,18 @@ function bchexdec($hex)
     }
     return $dec;
 }
+/**
+ * Runs various system garbage collection functions and returns
+ * number of bytes freed.
+ *
+ * @return int number of bytes freed
+ */
+function garbageCollect()
+{
+    $bytes_collected = 0;
+    if (function_exists("gc_mem_caches")) {
+        $bytes_collected += gc_mem_caches();
+    }
+    $bytes_collected += gc_collect_cycles();
+    return $bytes_collected;
+}
diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php
index b1cfdcd20..4fecdded2 100755
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@@ -162,7 +162,9 @@ class WebArchive
      */
     public function writeInfoBlock($fh = null, &$data = null)
     {
-        if ($this->is_string) return;
+        if ($this->is_string) {
+            return;
+        }
         $compressed_int_len = $this->compressor->compressedIntLen();
         $open_flag = false;
         if ($fh == null) {
diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php
index 4233d9200..6f0ef6960 100755
--- a/src/library/WebArchiveBundle.php
+++ b/src/library/WebArchiveBundle.php
@@ -156,7 +156,7 @@ class WebArchiveBundle
                 $info['WRITE_PARTITION'] = $this->write_partition;
             }
             file_put_contents(
-                $this->dir_name."/description.txt", serialize($info),
+                $this->dir_name . "/description.txt", serialize($info),
                 LOCK_EX);
         }
     }
@@ -310,7 +310,7 @@ class WebArchiveBundle
             $this->count = $info[$field];
         }
         if (!$this->read_only_archive) {
-            file_put_contents($this->dir_name."/description.txt",
+            file_put_contents($this->dir_name . "/description.txt",
                 serialize($info), LOCK_EX);
         }
     }
@@ -346,9 +346,9 @@ class WebArchiveBundle
      */
     public static function setArchiveInfo($dir_name, $info)
     {
-        if (file_exists($dir_name."/description.txt") && ((isset($this) &&
+        if (file_exists($dir_name . "/description.txt") && ((isset($this) &&
             !$this->read_only_archive) || !isset($this))) {
-            file_put_contents($dir_name."/description.txt", serialize($info),
+            file_put_contents($dir_name . "/description.txt", serialize($info),
                 LOCK_EX);
         }
     }
@@ -359,9 +359,9 @@ class WebArchiveBundle
      */
     public static function getParamModifiedTime($dir_name)
     {
-        if (file_exists($dir_name."/description.txt")) {
+        if (file_exists($dir_name . "/description.txt")) {
             clearstatcache();
-            return filemtime($dir_name."/description.txt");
+            return filemtime($dir_name . "/description.txt");
         }
         return false;
     }
diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php
index a402740da..09b05ff98 100755
--- a/src/library/WebQueueBundle.php
+++ b/src/library/WebQueueBundle.php
@@ -36,7 +36,7 @@ use seekquarry\yioop\library\compressors\NonCompressor;
 /**
  * Used for the crawlHash function
  */
-require_once __DIR__.'/Utility.php';
+require_once __DIR__ . '/Utility.php';
 /**
  * Encapsulates the data structures needed to have a queue of to crawl urls
  *
@@ -825,7 +825,7 @@ class WebQueueBundle implements Notifier
             }
         }
         $this->to_crawl_table = null;
-        gc_collect_cycles();
+        garbageCollect();
         if (file_exists($this->dir_name."/hash_table.dat")) {
             unlink($this->dir_name."/hash_table.dat");
             if (file_exists($this->dir_name."/tmp_table.dat")) {
@@ -867,7 +867,7 @@ class WebQueueBundle implements Notifier
             $this->insertHashTable($hash_url, $data, $probe);
         }
         $this->to_crawl_archive = null;
-        gc_collect_cycles();
+        garbageCollect();
         $tmp_archive->filename = $url_archive_name;
         $this->to_crawl_archive =  $tmp_archive;
     }
@@ -895,7 +895,7 @@ class WebQueueBundle implements Notifier
         $this->crawl_delay_filter = null;
         $this->robot_archive = null;
         $this->robot_table = null;
-        gc_collect_cycles();
+        garbageCollect();
         $this->got_robottxt_filter =
             new BloomFilterFile(
                 $this->dir_name."/got_robottxt.ftr", $this->filter_size);
@@ -952,7 +952,7 @@ class WebQueueBundle implements Notifier
             unlink($this->dir_name . "/dns_table.dat");
         }
         $this->dns_table = null;
-        gc_collect_cycles();
+        garbageCollect();
         $this->dns_table = new HashTable($this->dir_name . "/dns_table.dat",
             $num_values, self::HASH_KEY_SIZE, self::IP_SIZE);
         if ($this->robot_table) {
ViewGit