viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Moves cache of whole web pages to the responsible queue server rather than the fetcher that crawled it, a=chris

Chris Pollett [2018-06-08 15:Jun:th]
Moves cache of whole web pages to the responsible queue server rather than the fetcher that crawled it, a=chris
Filename
src/configs/Config.php
src/controllers/SearchController.php
src/executables/Fetcher.php
src/library/Utility.php
src/models/PhraseModel.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index e6f9c07f2..47e137718 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -632,6 +632,10 @@ nsconddefine('MIN_QUEUE_WEIGHT', 1/100000);
 nsconddefine('MAX_ARCHIVE_OBJECT_SIZE', 100000000);
 /** Treat earlier timestamps as being indexes of format version 0 */
 nsconddefine('VERSION_0_TIMESTAMP', 1369754208);
+/** Treat earlier timestamps as being indexes of format version 1 */
+nsconddefine('VERSION_1_TIMESTAMP', 1528045371);
+/** What version format to use for default indexing **/
+nsconddefine('DEFAULT_CRAWL_FORMAT', 2);
 defineMemoryProfile();
 /**
  * Code to determine how much memory current machine has
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 009b490fa..febb9ab14 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -1404,7 +1404,10 @@ class SearchController extends Controller implements CrawlConstants
         if (count($instance_parts) > 1) {
             $instance_num = intval($instance_parts[0]);
         }
-        if (!empty($crawl_item[self::OFFSET])) {
+        if (!empty($crawl_item[self::PAGE])) {
+            // Version 2 or newer index doesn't store cache pages separately
+            $cache_item = $crawl_item;
+        } else if (!empty($crawl_item[self::OFFSET])) {
             $cache_partition = $crawl_item[self::CACHE_PAGE_PARTITION];
             $cache_item = $crawl_model->getCacheFile($machine,
                 $machine_uri, $cache_partition, $crawl_item[self::OFFSET],
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 346e50176..3d92faa31 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1780,16 +1780,22 @@ class Fetcher implements CrawlConstants
                     $processor->scrapers = $this->scrapers;
                 }
                 $page = $site[self::PAGE];
+                $empty_image = false;
                 if (L\generalIsA($page_processor, C\NS_PROCESSORS.
                     "ImageProcessor")) {
                     if (!empty($site[self::CONTENT_SIZE]) &&
                         !empty($site[self::SIZE]) && $site[self::CONTENT_SIZE] >
                         $site[self::SIZE]) {
                         $page = "";
+                        $empty_image = true;
                     }
                 }
-                $doc_info = $processor->handle($page,
-                    $site[self::URL]);
+                if ($empty_image) {
+                    $doc_info = null;
+                } else {
+                    $doc_info = $processor->handle($page,
+                        $site[self::URL]);
+                }
                 if (C\FETCHER_PROCESS_DELAY > 0 ) {
                     usleep(C\FETCHER_PROCESS_DELAY);
                 }
@@ -1967,10 +1973,12 @@ class Fetcher implements CrawlConstants
         } // end for
         $num_pages = count($stored_site_pages);
         $filter_stored = array_filter($stored_site_pages);
-        if ($num_pages > 0 && $this->cache_pages) {
+        if (C\DEFAULT_CRAWL_FORMAT < 2 &&
+            $num_pages > 0 && $this->cache_pages) {
             $cache_page_partition = $this->web_archive->addPages(
                 self::OFFSET, $filter_stored);
         } else if ($num_pages > 0) {
+            // In newer format fetcher archive only counts num cache pages
             $this->web_archive->addCount(count($filter_stored));
         }
         for ($i = 0; $i < $num_pages; $i++) {
@@ -1978,14 +1986,19 @@ class Fetcher implements CrawlConstants
         }
         foreach ($filter_stored as $stored) {
             if (!isset($stored[self::INDEX]) ) {
+                if (C\DEFAULT_CRAWL_FORMAT >= 2) {
+                    $summarized_site_pages[$i][self::PAGE] = "";
+                }
                 continue;
             }
             $i = $stored[self::INDEX];
-            if (isset($stored[self::OFFSET])) {
-                $summarized_site_pages[$i][self::OFFSET] =
-                    $stored[self::OFFSET];
-                $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] =
-                    $cache_page_partition;
+            if (C\DEFAULT_CRAWL_FORMAT < 2) {
+                if (isset($stored[self::OFFSET])) {
+                    $summarized_site_pages[$i][self::OFFSET] =
+                        $stored[self::OFFSET];
+                    $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] =
+                        $cache_page_partition;
+                }
             }
         }
         L\crawlLog("  Process pages time: ".L\changeInMicrotime($start_time).
@@ -2141,13 +2154,19 @@ class Fetcher implements CrawlConstants
             self::CACHE_PAGE_VALIDATORS];
         foreach ($summary_fields as $field) {
             if (isset($site[$field])) {
-                $stored_site_pages[$i][$field] = $site[$field];
+                if (C\DEFAULT_CRAWL_FORMAT < 2) {
+                    $stored_site_pages[$i][$field] = $site[$field];
+                }
                 $summarized_site_pages[$i][$field] = $site[$field];
             }
         }
         foreach ($stored_fields as $field) {
             if (isset($site[$field])) {
-                $stored_site_pages[$i][$field] = $site[$field];
+                if (C\DEFAULT_CRAWL_FORMAT < 2) {
+                    $stored_site_pages[$i][$field] = $site[$field];
+                } else {
+                    $summarized_site_pages[$i][$field] = $site[$field];
+                }
             }
         }
     }
@@ -2292,7 +2311,7 @@ class Fetcher implements CrawlConstants
             /* for log file get rid of non-utf-8 characters
                that latter make it hard to view the log
              */
-            L\crawlLog($site_index.". $subdoc_info ".
+            L\crawlLog($site_index . ". $subdoc_info ".
                 iconv("UTF-8", "ISO-8859-1//IGNORE", $site[self::URL]));
         } // end for
         L\crawlLog("  Done Update Found Sites Array Time ".
@@ -2392,7 +2411,7 @@ class Fetcher implements CrawlConstants
     {
         $current_server = $this->current_server;
         $queue_server = $this->queue_servers[$current_server];
-        L\crawlLog("Updating machine: ".$queue_server);
+        L\crawlLog("Updating machine: " . $queue_server);
         $prefix = $this->fetcher_num."-";
         if (count($this->to_crawl) <= 0) {
             $schedule_time = $this->schedule_time;
@@ -2925,7 +2944,7 @@ class Fetcher implements CrawlConstants
             }
             $interim_elapse = L\changeInMicrotime($interim_time);
             if ($interim_elapse > 5) {
-                L\crawlLog("..Inverting ".$site[self::URL]."...took > 5s.");
+                L\crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
             }
             L\crawlTimeoutLog("..Still building inverted index. Have ".
                 "processed %s of %s documents.\nLast url processed was %s.",
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 0c9529911..aadb01782 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -978,8 +978,8 @@ function crawlHashWord($string, $raw = false, $meta_string = "")
  * @param bool $raw whether to base64 the result
  * @return array of hashes with appropriates shifts if needed
  */
-function allCrawlHashPaths($string, $metas = [],
-    $encode_metas = [], $raw = false)
+function allCrawlHashPaths($string, $metas = [], $encode_metas = [],
+    $raw = false)
 {
     $mask = "";
     if ($encode_metas != []) {
@@ -1082,7 +1082,7 @@ function allCrawlHashPaths($string, $metas = [],
                 $hashes[] = $hash;
             }
             if ($j == 0) {break; }
-            $path_string .= " ".$zero;
+            $path_string .= " " . $zero;
         }
         $pos = mb_strpos($string, " ", $pos + 1);
         $encode_metas = [];
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 7258c1821..17dd9532a 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -322,7 +322,7 @@ class PhraseModel extends ParallelModel
                 $this->query_info['QUERY'] .= "$in2<i>Low</i>:".
                     $result_bounds[0][0]."<br />";
                 $this->query_info['QUERY'] .= $in2 .
-                    "<i>High</i>: ".$result_bounds[0][1]."<br />";
+                    "<i>High</i>: " . $result_bounds[0][1] . "<br />";
                 $prs_cnt++;
             }
             $cache_results = false;
@@ -787,7 +787,8 @@ class PhraseModel extends ParallelModel
         foreach ($meta_words as $meta_word) {
             $pattern = "/(\s)($meta_word(\S)+)/";
             preg_match_all($pattern, $phrase, $matches);
-            if (!in_array($meta_word, ['i:', 'index:', 'w:', 'weight:', '\-'])) {
+            if (!in_array($meta_word,
+                ['i:', 'index:', 'w:', 'weight:', '\-'])) {
                 $matches = $matches[2];
                 $found_metas = array_merge($found_metas, $matches);
                 if (in_array($meta_word, PhraseParser::$materialized_metas)) {
ViewGit