viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

fix a bug in results editor if switch from filtering to editing a result, pull out more constants connected to memory profile to try to help out of memory crashes, a=chris

Chris Pollett [2020-06-27 22:Jun:th]
fix a bug in results editor if switch from filtering to editing a result, pull out more constants connected to memory profile to try to help out of memory crashes, a=chris
Filename
src/configs/Config.php
src/configs/TokenTool.php
src/executables/ArcTool.php
src/executables/ClassifierTrainer.php
src/executables/Fetcher.php
src/executables/MediaUpdater.php
src/executables/Mirror.php
src/executables/QueueServer.php
src/index.php
src/library/FetchUrl.php
src/library/IndexArchiveBundle.php
src/library/PhraseParser.php
src/models/SearchverticalsModel.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 922ae144d..6b287f0f3 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -667,22 +667,21 @@ nsconddefine('VERSION_0_TIMESTAMP', 1369754208);
 nsconddefine('VERSION_1_TIMESTAMP', 1528045371);
 /** What version format to use for default indexing **/
 nsconddefine('DEFAULT_CRAWL_FORMAT', 2);
-/** Max memory a QueueServer can use */
-nsconddefine('QUEUE_SERVER_MEMORY_LIMIT', "3000M");
-/** Max memory a Fetcher can use */
-nsconddefine('FETCHER_MEMORY_LIMIT', "1500M");
-defineMemoryProfile();
+/** 1 Gibibyte (GiB) */
+nsdefine('ONE_GIB', 1073741824);
 /**
  * Code to determine how much memory current machine has
  */
 function defineMemoryProfile()
 {
-    //assume have at least 4GB on a Mac(could use vm_stat)
-    $memory = 4000000000;
+    //assume have at least 4GiB
+    $memory = 4 * ONE_GIB;
     if (strstr(PHP_OS, "WIN")) {
         if (function_exists("exec")) {
             exec('wmic memorychip get capacity', $memory_array);
-            $memory = array_sum($memory_array);
+            if ($memory_array) {
+                $memory = array_sum($memory_array);
+            }
         }
     } else if (stristr(PHP_OS, "LINUX")) {
         set_error_handler(null);
@@ -692,35 +691,40 @@ function defineMemoryProfile()
             $data = preg_split("/\s+/", $mem_data);
             $memory = 1024 * intval($data[1]);
         }
+    } else if (stristr(PHP_OS, "DARWIN")) {
+        exec('sysctl hw.memsize', $memory_array);
+        if (!empty($memory_array)) {
+            preg_match("/\d+/", $memory_array[0], $mem_matches);
+            $memory = $mem_matches[0];
+        }
     }
-    /**
-     * Factor to multiply sizes of Yioop data structures with in low ram memory
-     * setting (2GB)
-     */
-    nsdefine('MEMORY_LOW', 1);
-    /**
-     * Factor to multiply sizes of Yioop data structures with if have more than
-     * (2GB)
-     */
-    nsdefine('MEMORY_STANDARD', 4);
-    if ($memory < 2200000000) {
-        /**
-         * Based on system memory, either the low or high memory factor
-         */
-        nsdefine('MEMORY_PROFILE', MEMORY_LOW);
-    } else {
-        /**
-         * @ignore
-         */
-        nsdefine('MEMORY_PROFILE', MEMORY_STANDARD);
-    }
-    /**
-     * Delay in microseconds between processing pages to try to avoid
-     * CPU overheating. On some systems, you can set this to 0.
-     */
-    nsconddefine('FETCHER_PROCESS_DELAY', 10000);
+    $memory_factor = ceil($memory / (2 * ONE_GIB));
+    nsdefine('MEMORY_PROFILE', min(4, $memory_factor));
+    nsdefine('SYSTEM_RAM', $memory);
 }
-
+//Check system memory then set up limits for processes based on this
+defineMemoryProfile();
+/** Max memory index.php can use */
+nsconddefine('INDEX_FILE_MEMORY_LIMIT', ceil(MEMORY_PROFILE/4) . "000M");
+/** Max memory a QueueServer can use */
+nsconddefine('QUEUE_SERVER_MEMORY_LIMIT', MEMORY_PROFILE . "000M");
+/** Max memory a Fetcher can use */
+nsconddefine('FETCHER_MEMORY_LIMIT', ceil(MEMORY_PROFILE/2) . "000M");
+/** Max memory a MediaUpdater can use */
+nsconddefine('MEDIA_UPDATER_MEMORY_LIMIT', ceil(MEMORY_PROFILE/2) . "000M");
+/** Max memory a Mirror can use */
+nsconddefine('MIRROR_MEMORY_LIMIT', ceil(MEMORY_PROFILE/4) ."000M");
+/** Max memory a ClassifierTrainer can use */
+nsconddefine('CLASSIFIER_TRAINER_LIMIT', ceil(MEMORY_PROFILE/4) ."000M");
+/** Max memory an ArcTool can use */
+nsconddefine('ARC_TOOL_MEMORY_LIMIT', (2 * MEMORY_PROFILE) . "000M");
+/** Max memory a TokenTool can use */
+nsconddefine('TOKEN_TOOL_MEMORY_LIMIT', ceil(MEMORY_PROFILE/2) . "000M");
+/** Used to control fraction of memory filled of current process
+ *  (usually Fetcher or QueueServer) before action (such as switch shard)
+ *  on current class (usually IndexArchiveBundle) is taken.
+ */
+nsconddefine('MEMORY_FILL_FACTOR', 0.65);
 /**
  * bloom filters are used to keep track of which urls are visited,
  * this parameter determines up to how many
@@ -823,6 +827,11 @@ nsconddefine('PROCESS_TIMEOUT', 15 * ONE_MINUTE);
  *  crawl is likely stalled
  */
 nsconddefine("CRAWL_TIMEOUT", 2 * PROCESS_TIMEOUT);
+/**
+ * Delay in microseconds between processing pages to try to avoid
+ * CPU overheating. On some systems, you can set this to 0.
+ */
+nsconddefine('FETCHER_PROCESS_DELAY', 10000);
 /**
  * Number of error page 400 or greater seen from a host before crawl-delay
  * host and dump remainder from current schedule
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index 7a6af80b7..a9266b22b 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -54,10 +54,9 @@ if (php_sapi_name() != 'cli' ||
     defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
     echo "BAD REQUEST"; exit();
 }
-ini_set("memory_limit","1500M");
 /** Load in global configuration settings and crawlHash function */
 require_once __DIR__ . "/../library/Utility.php";
-
+ini_set("memory_limit", C\TOKEN_TOOL_MEMORY_LIMIT);
 /*
    The phrase "More at Wikipedia..." with a link concludes the knowledge
    wiki entries we generate from wikipedia data.
@@ -707,6 +706,8 @@ function smartOpen($file_name)
  */
 function translateLocale($locale_tag)
 {
+    global $public_pages;
+    global $help_pages;
     if (!C\nsdefined('TRANSLATE_API_KEY')) {
         echo "You need to get a Yandex translate API key to use this command";
         return;
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index fe5f38c31..34ac42e5d 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -48,7 +48,6 @@ if (php_sapi_name() != 'cli' ||
     defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
     echo "BAD REQUEST"; exit();
 }
-ini_set("memory_limit","2500M");
 /** This tool does not need logging*/
 $_SERVER["LOG_TO_FILES"] = false;
 /** USE_CACHE false rules out file cache as well*/
@@ -60,6 +59,9 @@ if (!C\PROFILE) {
         "its web interface on localhost.\n";
     exit();
 }
+ini_set("memory_limit", C\ARC_TOOL_MEMORY_LIMIT);   /*reading in a whole
+    shard might take a fair bit of memory
+*/
 /*
  * We'll set up multi-byte string handling to use UTF-8
  */
@@ -337,9 +339,6 @@ class ArcTool implements CrawlConstants
      */
     public function outputShardInfo($archive_path, $generation)
     {
-        ini_set("memory_limit","8000M"); /*reading in a whole shard might take
-                a bit more memory
-            */
         if (preg_match("/\-\d$/", $archive_path)) {
             $bundle_num = substr($archive_path, -1);
             $archive_path = substr($archive_path, 0, -2);
diff --git a/src/executables/ClassifierTrainer.php b/src/executables/ClassifierTrainer.php
index 1e0a59af4..547f82ef5 100755
--- a/src/executables/ClassifierTrainer.php
+++ b/src/executables/ClassifierTrainer.php
@@ -52,16 +52,16 @@ if (!C\PROFILE) {
         "its web interface on localhost.\n";
     exit();
 }
+/*
+   If possible, set the memory limit high enough to fit all of the features and
+   training documents into memory.
+ */
+ini_set("memory_limit", C\CLASSIFIER_TRAINER_LIMIT);
 /*
     We'll set up multi-byte string handling to use UTF-8
  */
 mb_internal_encoding("UTF-8");
 mb_regex_encoding("UTF-8");
-/*
-   If possible, set the memory limit high enough to fit all of the features and
-   training documents into memory.
- */
-ini_set("memory_limit", "500M");
 /**
  * This class is used to finalize a classifier via the web interface.
  *
@@ -76,7 +76,7 @@ ini_set("memory_limit", "500M");
  * second command-line argument. The following command would be used to run
  * this script directly from the command-line:
  *
- *    $ php bin/ClassifierTrainer.php terminal LABEL
+ *    $ php ClassifierTrainer.php terminal LABEL
  *
  * @author Shawn Tice
  */
@@ -93,14 +93,19 @@ class ClassifierTrainer
     {
         global $argv;
         CrawlDaemon::init($argv, "ClassifierTrainer");
-        $label = $argv[2];
-        L\crawlLog("Initializing classifier trainer log..",
-            $label.'-ClassifierTrainer', true);
-        $classifier = Classifier::getClassifier($label);
-        $classifier->prepareToFinalize();
-        $classifier->finalize();
-        Classifier::setClassifier($classifier);
-        L\crawlLog("Training complete.\n");
+        $label = $argv[2] ?? "";
+        $classifier = null;
+        if (!empty($label)) {
+            L\crawlLog("Initializing classifier trainer log..",
+                $label . '-ClassifierTrainer', true);
+            $classifier = Classifier::getClassifier($label);
+        }
+        if (!empty($classifier)) {
+            $classifier->prepareToFinalize();
+            $classifier->finalize();
+            Classifier::setClassifier($classifier);
+            L\crawlLog("Training complete.\n");
+        }
         CrawlDaemon::stop('ClassifierTrainer', $label);
     }
 }
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 34acbe75d..22485c591 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1381,7 +1381,7 @@ class Fetcher implements CrawlConstants
     public function exceedMemoryThreshold()
     {
         return memory_get_usage() > (L\metricToInt(
-            ini_get("memory_limit")) * 0.7);
+            ini_get("memory_limit")) * C\MEMORY_FILL_FACTOR);
     }
     /**
      * At least once, and while memory is low picks at server at random and send
diff --git a/src/executables/MediaUpdater.php b/src/executables/MediaUpdater.php
index 3b3eb5344..d09cd8990 100644
--- a/src/executables/MediaUpdater.php
+++ b/src/executables/MediaUpdater.php
@@ -43,7 +43,6 @@ if (php_sapi_name() != 'cli' ||
     defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
     echo "BAD REQUEST"; exit();
 }
-ini_set("memory_limit", "1300M");
 /** We do want logging, but crawl model and others will try to turn off
  * if we don't set this
  */
@@ -55,6 +54,7 @@ if (!C\PROFILE) {
         "its web interface on localhost.\n";
     exit();
 }
+ini_set("memory_limit", C\MEDIA_UPDATER_MEMORY_LIMIT);
 /*
  * We'll set up multi-byte string handling to use UTF-8
  */
diff --git a/src/executables/Mirror.php b/src/executables/Mirror.php
index a3db91b97..8c4cd34e7 100644
--- a/src/executables/Mirror.php
+++ b/src/executables/Mirror.php
@@ -40,8 +40,6 @@ if (php_sapi_name() != 'cli' ||
     defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
     echo "BAD REQUEST"; exit();
 }
-ini_set("memory_limit","850M"); //so have enough memory to crawl big pages
-
 /** CRAWLING means don't try to use cache
  * @ignore
  */
@@ -53,6 +51,7 @@ if (!C\PROFILE) {
         "its web interface on localhost.\n";
     exit();
 }
+ini_set("memory_limit", C\MIRROR_MEMORY_LIMIT);
 /*
  * We'll set up multi-byte string handling to use UTF-8
  */
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 3b2541a87..c09bf406e 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1986,18 +1986,19 @@ class QueueServer implements CrawlConstants, Join
     }
     /**
      * Tries to prevent Indexer from crashing do to excessive memory use.
-     * If Indexer is using more that .7 of its allowed memory, tries to
-     * free memory by saving index bunlde to disk, freeing memory, then
-     * reloading.
+     * If Indexer is using more than C\MEMORY_FILL_FACTOR of its allowed memory,
+     * tries to free memory by saving index bundle to disk, freeing memory,
+     * then reloading.
      */
     public function constrainIndexerMemoryUsage()
     {
         $memory_limit = L\metricToInt(ini_get("memory_limit"));
         $current_usage = memory_get_usage();
-        if ((0.7 * $memory_limit) < $current_usage ||
+        if ((C\MEMORY_FILL_FACTOR * $memory_limit) < $current_usage ||
             in_array($this->debug, ['EXCEED_MEMORY', 'EXCEED_MEMORY_HARD'])) {
             L\crawlLog("Indexer memory usage threshold exceeded!!!");
-            L\crawlLog("...Indexer Threshold is: " . (0.7 * $memory_limit));
+            L\crawlLog("...Indexer Threshold is: " . (C\MEMORY_FILL_FACTOR *
+                $memory_limit));
             L\crawlLog("...Indexer Current usage is: " . $current_usage);
             L\crawlLog("...Indexer trying to free memory by resetting " .
                 "index bundle.");
@@ -2014,7 +2015,7 @@ class QueueServer implements CrawlConstants, Join
             if ($this->debug == 'EXCEED_MEMORY') {
                 $this->debug = "";
             }
-            if ((0.7 * $memory_limit) < $current_usage ||
+            if ((C\MEMORY_FILL_FACTOR * $memory_limit) < $current_usage ||
                 $this->debug == 'EXCEED_MEMORY_HARD') {
                 $message_file = C\CRAWL_DIR . "/schedules/" .
                     $this->process_name . "Messages.txt";
diff --git a/src/index.php b/src/index.php
index e74896d97..3a797e438 100644
--- a/src/index.php
+++ b/src/index.php
@@ -78,7 +78,7 @@ function bootstrap($web_site = null, $start_new_session = true)
      * Load global functions related to localization
      */
     require_once __DIR__ . "/library/LocaleFunctions.php";
-    ini_set("memory_limit","1000M");
+    ini_set("memory_limit", C\INDEX_FILE_MEMORY_LIMIT);
     if (!empty($web_site)) {
         if ((empty($_REQUEST['c']) || $_REQUEST['c'] != 'resource')) {
             $web_site->header("X-FRAME-OPTIONS: DENY"); //prevent click-jacking
diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php
index 8e01b1e26..e7ef26937 100755
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@@ -249,7 +249,8 @@ class FetchUrl implements CrawlConstants
         $start = time();
         //Wait for responses
         $running = null;
-        $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
+        $memory_limit = metricToInt(ini_get("memory_limit")) *
+            C\MEMORY_FILL_FACTOR;
         $mrc_check = CURLM_CALL_MULTI_PERFORM;
         set_error_handler(null);
         do {
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index a84eaf628..cef420567 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -123,7 +123,7 @@ class IndexArchiveBundle implements CrawlConstants
     /**
      * Threshold index shard beyond which we force the generation to advance
      */
-    const FORCE_ADVANCE_SIZE = 150000000;
+    const FORCE_ADVANCE_SIZE = 120000000;
     /**
      * Makes or initializes an IndexArchiveBundle with the provided parameters
      *
@@ -200,7 +200,7 @@ class IndexArchiveBundle implements CrawlConstants
         crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
         $start_time = microtime(true);
         $this->getActiveShard()->appendIndexShard($index_shard);
-        crawlLog("Append Index Shard: Memory usage:".memory_get_usage() .
+        crawlLog("Append Index Shard: Memory usage:" . memory_get_usage() .
           " Time: ".(changeInMicrotime($start_time)));
     }
     /**
@@ -236,7 +236,8 @@ class IndexArchiveBundle implements CrawlConstants
             $this->num_docs_per_generation;
         $shard_size_too_big = (file_exists($active_file_name) &&
             filesize($active_file_name) > self::FORCE_ADVANCE_SIZE);
-        $too_close_to_memory_limit = 1.2 * $before_usage > $memory_limit;
+        $too_close_to_memory_limit = $before_usage >
+            C\MEMORY_FILL_FACTOR * $memory_limit;
         if ($too_many_docs || $shard_size_too_big ||
             $too_close_to_memory_limit) {
             if ($blocking == true) {
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 6ded427db..c16882e1b 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -739,12 +739,16 @@ class PhraseParser
     {
         $char_class = C\NS_LOCALE . $lang . "\\resources\\Tokenizer";
         mb_internal_encoding("UTF-8");
-        if ($pre_terms == []) { return [];}
+        if (empty($pre_terms)) {
+            return [];
+        }
         $terms = [];
         $tokenizer = PhraseParser::getTokenizer($lang);
         if (class_exists($char_class) && isset($char_class::$char_gram_len)) {
             foreach ($pre_terms as $pre_term) {
-                if ($pre_term == "") { continue; }
+                if (empty($pre_term)) {
+                    continue;
+                }
                 if (substr($pre_term, 0, 4) == 'http') {
                     $terms[]  = $pre_term; // don't chargram urls
                     continue;
@@ -755,7 +759,7 @@ class PhraseParser
                 }
             }
         } else {
-            $terms = & $pre_terms;
+            $terms = $pre_terms;
         }
         return $terms;
     }
diff --git a/src/models/SearchverticalsModel.php b/src/models/SearchverticalsModel.php
index 51f2d0a99..413c14533 100644
--- a/src/models/SearchverticalsModel.php
+++ b/src/models/SearchverticalsModel.php
@@ -246,13 +246,14 @@ class SearchverticalsModel extends GroupModel
      * when a given url appears in search results
      * @param int $id if the url has been edited previous then the id of the
      *      group item with the edit. If this is 0/empty then a new group item
-     *      for the edit is created
+     *      for the edit is created. If -1 then deletes the entry
      * @param int $type either SEARCH_FILTER_GROUP_ITEM or
      *  SEARCH_EDIT_GROUP_ITEM
      * @param string $url to change search result for
      * @param string $title new title for search result
      * @param string $description new snippet text for search result
-     * @return int id of edited/created result
+     * @return mixed integer id of edited/created result or if used
+     *      to delete then false
      */
     function updateUrlResult($id, $type, $url, $title, $description)
     {
@@ -265,7 +266,7 @@ class SearchverticalsModel extends GroupModel
         if (empty($id)) {
             $id = $this->addGroupItem($parent_id, C\SEARCH_GROUP_ID, $user_id,
                 $title, $description, $type, $this->last_change, $url);
-        } else if ($type == -1) {
+        } else {
             $item = $this->getEditedPageResult($url);
             $sql = "DELETE FROM GROUP_ITEM  WHERE ID = ?";
             $db->execute($sql, [$id]);
@@ -275,13 +276,10 @@ class SearchverticalsModel extends GroupModel
                 $sql = "DELETE FROM GROUP_ITEM  WHERE URL = ?";
                 $db->execute($sql, [$url . "/"]);
             }
-        } else {
-            $sql = "UPDATE GROUP_ITEM SET TYPE = ?, URL = ?, TITLE = ?, ".
-                "DESCRIPTION = ?, EDIT_DATE = ? WHERE ID = ?";
-            $db->execute($sql, [$type, $url, $title, $description,
-                $this->last_change, $id]);
+            $id = $this->addGroupItem($parent_id, C\SEARCH_GROUP_ID, $user_id,
+                $title, $description, $type, $this->last_change, $url);
         }
-        return $id;
+        return $id ?? false;
     }
     /**
      * Returns any edited search result associated with a url
ViewGit