viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/executables/QueueServer.php | |
src/library/UrlParser.php | |
src/library/processors/JpgProcessor.php | |
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index c10ed9e94..c63975df1 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -239,7 +239,6 @@ class QueueServer implements CrawlConstants, Join * @var int */ public $index_dirty; - /** * This keeps track of the time the current archive info was last modified * This way the queue server knows if the user has changed the crawl @@ -285,6 +284,30 @@ class QueueServer implements CrawlConstants, Join * @var string */ public $process_name; + /** + * A mapping between class field names and parameters which might + * be sent to a queue server via an info associative array. + * @var array + */ + public static $info_parameter_map = [ + "crawl_order" => self::CRAWL_ORDER, + "crawl_type" => self::CRAWL_TYPE, + "crawl_index" => self::CRAWL_INDEX, + "cache_pages" => self::CACHE_PAGES, + "page_range_request" => self::PAGE_RANGE_REQUEST, + "max_depth" => self::MAX_DEPTH, + "repeat_type" => self::REPEAT_TYPE, + "robots_txt" => self::ROBOTS_TXT, + "max_description_len" => self::MAX_DESCRIPTION_LEN, + "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY, + "indexed_file_types" => self::INDEXED_FILE_TYPES, + "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL, + "allowed_sites" => self::ALLOWED_SITES, + "disallowed_sites" => self::DISALLOWED_SITES, + "page_rules" => self::PAGE_RULES, + "indexing_plugins" => self::INDEXING_PLUGINS, + "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA, + ]; /** * Creates a Queue Server Daemon */ @@ -1249,28 +1272,9 @@ class QueueServer implements CrawlConstants, Join { //to get here we at least have to have a crawl_time $this->crawl_time = $info[self::CRAWL_TIME]; - $read_from_info = [ - "crawl_order" => self::CRAWL_ORDER, - "crawl_type" => self::CRAWL_TYPE, - "crawl_index" => self::CRAWL_INDEX, - "cache_pages" => self::CACHE_PAGES, - "page_range_request" => self::PAGE_RANGE_REQUEST, - "max_depth" => self::MAX_DEPTH, - 
"repeat_type" => self::REPEAT_TYPE, - "robots_txt" => self::ROBOTS_TXT, - "max_description_len" => self::MAX_DESCRIPTION_LEN, - "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY, - "indexed_file_types" => self::INDEXED_FILE_TYPES, - "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL, - "allowed_sites" => self::ALLOWED_SITES, - "disallowed_sites" => self::DISALLOWED_SITES, - "page_rules" => self::PAGE_RULES, - "indexing_plugins" => self::INDEXING_PLUGINS, - "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA, - ]; $try_to_set_from_old_index = []; $update_disallow = false; - foreach ($read_from_info as $index_field => $info_field) { + foreach (self::$info_parameter_map as $index_field => $info_field) { if (isset($info[$info_field])) { if ($index_field == "disallowed_sites") { $update_disallow = true; @@ -1290,7 +1294,7 @@ class QueueServer implements CrawlConstants, Join $this->updateDisallowedQuotaSites(); } $this->initializeWebQueue(); - $this->initializeIndexBundle($info); + $this->initializeIndexBundle($info, $try_to_set_from_old_index); $info[self::STATUS] = self::CONTINUE_STATE; return $info; } @@ -1301,9 +1305,16 @@ class QueueServer implements CrawlConstants, Join * * @param array $info if initializing a new crawl this should contain * the crawl parameters + * @param array $try_to_set_from_old_index parameters of the crawl + * to try to set from values already stored in archive info, + * other parameters are assumed to have been updated since. */ - public function initializeIndexBundle($info = []) + public function initializeIndexBundle($info = [], + $try_to_set_from_old_index = null) { + if ($try_to_set_from_old_index === null) { + $try_to_set_from_old_index = array_keys(self::$info_parameter_map); + } if(empty($this->repeat_type) || $this->repeat_type < 0) { $class_name = C\NS_LIB . "IndexArchiveBundle"; $dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name . 
@@ -1318,9 +1329,10 @@ class QueueServer implements CrawlConstants, Join $archive_info = $class_name::getArchiveInfo($dir); $index_info = unserialize($archive_info['DESCRIPTION']); foreach ($try_to_set_from_old_index as $index_field) { - if (isset($index_info[$read_from_info[$index_field]]) ) { + if (isset($index_info[self::$info_parameter_map[$index_field]]) + ) { $this->$index_field = - $index_info[$read_from_info[$index_field]]; + $index_info[self::$info_parameter_map[$index_field]]; } } $archive_exists = true; diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index 60f66be37..3d98a970f 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -906,7 +906,7 @@ class UrlParser $info_link = []; // choose the MAX_LINKS_PER_PAGE many pages with most info (crude) foreach ($links as $url => $info) { - $num_terms = count(preg_split("/\s+/", $info)); + $num_terms = count(preg_split("/\s+|\-|\_|\~/", $info)); $text = serialize($info); $len_text = strlen($text) + 1; $compressed_len = strlen(gzcompress($text)) + 1; diff --git a/src/library/processors/JpgProcessor.php b/src/library/processors/JpgProcessor.php index d52f463e3..b8633e9a3 100755 --- a/src/library/processors/JpgProcessor.php +++ b/src/library/processors/JpgProcessor.php @@ -97,7 +97,7 @@ class JpgProcessor extends ImageProcessor file_put_contents($temp_file, $page); set_error_handler(null); $summary[self::DESCRIPTION] = "$file_name\nEXIF DATA\n". - print_r(exif_read_data($temp_file), true); + print_r(@exif_read_data($temp_file), true); set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); } else { $summary[self::DESCRIPTION] = $file_name;