viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Remove randomness from selecting next server in Fetcher, fix notices in UrlParser, zh_CN Tokenizer and Utility, a=chris

Chris Pollett [2020-07-19 22:Jul:th]
Remove randomness from selecting next server in Fetcher, fix notices in UrlParser, zh_CN Tokenizer and Utility, a=chris
Filename
src/configs/Config.php
src/executables/Fetcher.php
src/library/UrlParser.php
src/library/Utility.php
src/locale/zh_CN/resources/Tokenizer.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 5ac9b3a32..8e3a29eec 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -169,7 +169,7 @@ nsdefine('MIN_AD_VERSION', 36);
  * Version number for upgrading locale resource folders and for upgrading
  * public and help wikis
  */
-nsdefine('RESOURCES_WIKI_VERSION', 8);
+nsdefine('RESOURCES_WIKI_VERSION', 9);
 /**
  * nsdefine's the BASE_URL constant for this script
  * if run from the command line as part of index.php HTTP server scrip
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 2bb94b995..2aded9716 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1015,7 +1015,7 @@ class Fetcher implements CrawlConstants
         $time = time();
         $this->check_crawl_time = $time;
         $session = md5($time . C\AUTH_KEY);
-        $prefix = $this->fetcher_num . "-" . $this->channel ."-";
+        $prefix = $this->fetcher_num . "-" . $this->channel . "-";
         $robot_instance = $prefix . C\ROBOT_INSTANCE;
         $time_change = false;
         $crawl_time = !is_null($this->crawl_time) ? $this->crawl_time : 0;
@@ -1367,7 +1367,6 @@ class Fetcher implements CrawlConstants
         } else if (isset($info['ARCHIVE_BUNDLE_ERROR'])) {
             L\crawlLog("  ".$info['ARCHIVE_BUNDLE_ERROR']);
         }
-
         L\crawlLog("Time to fetch archive data from name server ".
             L\changeInMicrotime($start_time));
         return $info;
@@ -1384,11 +1383,11 @@ class Fetcher implements CrawlConstants
             ini_get("memory_limit")) * C\MEMORY_FILL_FACTOR);
     }
     /**
-     * At least once, and while memory is low picks at server at random and send
+     * At least once, and while memory is low, selects the next server and sends
      * any fetcher data we have to it.
      *
      * @param bool $at_least_once whether to send to the site info to at least
-     *     queue server or to send only if memory is above threshold
+     *     one queue server or to send only if memory is above threshold
      */
     public function selectCurrentServerAndUpdateIfNeeded($at_least_once)
     {
@@ -1397,11 +1396,9 @@ class Fetcher implements CrawlConstants
         /*  Make sure no queue server starves if to crawl data available.
             Try to keep memory foot print smaller.
          */
+        $cs = $this->current_server;
+        $next_server = ($cs + 1) % $num_servers;
         do {
-            if (!$at_least_once) {
-                $this->current_server = rand(0, $num_servers - 1);
-            }
-            $cs = $this->current_server;
             if ($at_least_once ||
                 (isset($this->found_sites[self::TO_CRAWL][$cs]) &&
                 count($this->found_sites[self::TO_CRAWL][$cs]) > 0) ||
@@ -1409,10 +1406,10 @@ class Fetcher implements CrawlConstants
                 $this->updateScheduler();
                 $at_least_once = false;
             }
+            $cs = ($cs + 1) % $num_servers;
             $i++;
-        } while($this->exceedMemoryThreshold() &&
-            $i < $num_servers * ceil(log($num_servers)) );
-            //coupon collecting expected i before have seen all
+        } while($this->exceedMemoryThreshold() && $i < $num_servers);
+        $this->current_server = $next_server;
     }
     /**
      * Sets parameters for fetching based on provided info struct
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 667c8d514..4d92a3529 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -932,10 +932,13 @@ class UrlParser
         $info_link = [];
         // choose the MAX_LINKS_PER_PAGE many pages with most info (crude)
         foreach ($links as $url => $text) {
-            $num_terms = count(preg_split("/\s+|\-|\_|\~/", $text));
+            $text = (is_string($text)) ? $text : "";
+            $terms = preg_split("/\s+|\-|\_|\~/", $text);
+            $num_terms = count($terms);
             $len_text = strlen($text) + 1;
             $compressed_len = strlen(gzcompress($text)) + 1;
-            $effective_num_terms = $num_terms * ($compressed_len/$len_text);
+            $effective_num_terms = $num_terms *
+                (min($compressed_len/$len_text, 1));
             if (!isset($info_link[$url])) {
                 $info_link[$url] = $effective_num_terms;
             } else {
diff --git a/src/library/Utility.php b/src/library/Utility.php
index d12ae90cc..be5dfbd00 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -1609,31 +1609,33 @@ function checkTimeInterval($start_time, $duration, $time = -1)
 function convertPixels($value)
 {
     $len = strlen($value);
-    if ($len < 2) return intval($value);
+    if (is_int($value) || $len < 2) {
+        return intval($value);
+    }
     if ($value[$len - 1] == "%") {
         $num = floatval(substr($value, 0, $len - 1));
-        return ($num > 0) ? floor(8*min(100, $num)) : 0;
+        return ($num > 0) ? floor(8 * min(100, $num)) : 0;
     }
     $num = floatval(substr($value, 0, $len - 2));
     $unit = substr($value, $len - 2);
     switch ($unit) {
         case "cm":
         case "pt":
-            return intval(28*$num);
+            return intval(28 * $num);
         break;
         case "em":
         case "pc":
-            return intval(6*$num);
+            return intval(6 * $num);
         break;
         case "ex":
-            return intval(12*$num);
+            return intval(12 * $num);
         break;
         case "in":
             //assume screen 72 dpi as on mac
-            return intval(72*$num);
+            return intval(72 * $num);
         break;
         case "mm":
-            return intval(2.8*$num);
+            return intval(2.8 * $num);
         break;
         case "px":
             return intval($num);
diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php
index f0769c5a7..5bf9ef0a4 100755
--- a/src/locale/zh_CN/resources/Tokenizer.php
+++ b/src/locale/zh_CN/resources/Tokenizer.php
@@ -894,7 +894,8 @@ class Tokenizer
             if (!empty($tree_vp['NP'])) {
                 $nb = $tree_vp['NP'];
                 $object['CONCISE'] = $tree_vp['NP'];
-                while (is_array($object['CONCISE']["ADD_NP"]["NP"])) {
+                while (isset($object['CONCISE']["ADD_NP"]["NP"]) &&
+                    is_array($object['CONCISE']["ADD_NP"]["NP"])) {
                     $object['CONCISE'] = $object['CONCISE']["ADD_NP"]["NP"];
                 }
                 $object['CONCISE']=$object['CONCISE']["NN"]["NN"];
@@ -961,10 +962,11 @@ class Tokenizer
         $subject = [];
         if (!empty($tree['NP'])) {
             $subject['CONCISE'] = $tree['NP'];
-            while (is_array($subject['CONCISE']["ADD_NP"]["NP"])) {
+            while (isset($subject['CONCISE']["ADD_NP"]["NP"]) &&
+                is_array($subject['CONCISE']["ADD_NP"]["NP"])) {
                 $subject['CONCISE'] = $subject['CONCISE']["ADD_NP"]["NP"];
             }
-            $subject['CONCISE']=$subject['CONCISE']["NN"]["NN"];
+            $subject['CONCISE'] = $subject['CONCISE']["NN"]["NN"] ?? "";
             $raw_subject = "";
             $it = new \RecursiveIteratorIterator(
                 new \RecursiveArrayIterator($tree['NP']));
ViewGit