viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/executables/QueueServer.php | |
src/library/UrlParser.php | |
src/library/processors/JpgProcessor.php | |
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index c10ed9e94..c63975df1 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -239,7 +239,6 @@ class QueueServer implements CrawlConstants, Join * @var int */ public $index_dirty; - /** * This keeps track of the time the current archive info was last modified * This way the queue server knows if the user has changed the crawl @@ -285,6 +284,30 @@ class QueueServer implements CrawlConstants, Join * @var string */ public $process_name; + /** + * A mapping between class field names and parameters which might + * be sent to a queue server via an info associative array. + * @var array + */ + public static $info_parameter_map = [ + "crawl_order" => self::CRAWL_ORDER, + "crawl_type" => self::CRAWL_TYPE, + "crawl_index" => self::CRAWL_INDEX, + "cache_pages" => self::CACHE_PAGES, + "page_range_request" => self::PAGE_RANGE_REQUEST, + "max_depth" => self::MAX_DEPTH, + "repeat_type" => self::REPEAT_TYPE, + "robots_txt" => self::ROBOTS_TXT, + "max_description_len" => self::MAX_DESCRIPTION_LEN, + "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY, + "indexed_file_types" => self::INDEXED_FILE_TYPES, + "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL, + "allowed_sites" => self::ALLOWED_SITES, + "disallowed_sites" => self::DISALLOWED_SITES, + "page_rules" => self::PAGE_RULES, + "indexing_plugins" => self::INDEXING_PLUGINS, + "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA, + ]; /** * Creates a Queue Server Daemon */ @@ -1249,28 +1272,9 @@ class QueueServer implements CrawlConstants, Join { //to get here we at least have to have a crawl_time $this->crawl_time = $info[self::CRAWL_TIME]; - $read_from_info = [ - "crawl_order" => self::CRAWL_ORDER, - "crawl_type" => self::CRAWL_TYPE, - "crawl_index" => self::CRAWL_INDEX, - "cache_pages" => self::CACHE_PAGES, - "page_range_request" => self::PAGE_RANGE_REQUEST, - "max_depth" => self::MAX_DEPTH, - 
"repeat_type" => self::REPEAT_TYPE, - "robots_txt" => self::ROBOTS_TXT, - "max_description_len" => self::MAX_DESCRIPTION_LEN, - "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY, - "indexed_file_types" => self::INDEXED_FILE_TYPES, - "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL, - "allowed_sites" => self::ALLOWED_SITES, - "disallowed_sites" => self::DISALLOWED_SITES, - "page_rules" => self::PAGE_RULES, - "indexing_plugins" => self::INDEXING_PLUGINS, - "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA, - ]; $try_to_set_from_old_index = []; $update_disallow = false; - foreach ($read_from_info as $index_field => $info_field) { + foreach (self::$info_parameter_map as $index_field => $info_field) { if (isset($info[$info_field])) { if ($index_field == "disallowed_sites") { $update_disallow = true; @@ -1290,7 +1294,7 @@ class QueueServer implements CrawlConstants, Join $this->updateDisallowedQuotaSites(); } $this->initializeWebQueue(); - $this->initializeIndexBundle($info); + $this->initializeIndexBundle($info, $try_to_set_from_old_index); $info[self::STATUS] = self::CONTINUE_STATE; return $info; } @@ -1301,9 +1305,16 @@ class QueueServer implements CrawlConstants, Join * * @param array $info if initializing a new crawl this should contain * the crawl parameters + * @param array $try_to_set_from_old_index parameters of the crawl + * to try to set from values already stored in archive info, + * other parameters are assumed to have been updated since. */ - public function initializeIndexBundle($info = []) + public function initializeIndexBundle($info = [], + $try_to_set_from_old_index = null) { + if ($try_to_set_from_old_index === null) { + $try_to_set_from_old_index = array_keys(self::$info_parameter_map); + } if(empty($this->repeat_type) || $this->repeat_type < 0) { $class_name = C\NS_LIB . "IndexArchiveBundle"; $dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name . 
@@ -1318,9 +1329,10 @@ class QueueServer implements CrawlConstants, Join $archive_info = $class_name::getArchiveInfo($dir); $index_info = unserialize($archive_info['DESCRIPTION']); foreach ($try_to_set_from_old_index as $index_field) { - if (isset($index_info[$read_from_info[$index_field]]) ) { + if (isset($index_info[self::$info_parameter_map[$index_field]]) + ) { $this->$index_field = - $index_info[$read_from_info[$index_field]]; + $index_info[self::$info_parameter_map[$index_field]]; } } $archive_exists = true; diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index 60f66be37..3d98a970f 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -906,7 +906,7 @@ class UrlParser $info_link = []; // choose the MAX_LINKS_PER_PAGE many pages with most info (crude) foreach ($links as $url => $info) { - $num_terms = count(preg_split("/\s+/", $info)); + $num_terms = count(preg_split("/\s+|\-|\_|\~/", $info)); $text = serialize($info); $len_text = strlen($text) + 1; $compressed_len = strlen(gzcompress($text)) + 1; diff --git a/src/library/processors/JpgProcessor.php b/src/library/processors/JpgProcessor.php index d52f463e3..b8633e9a3 100755 --- a/src/library/processors/JpgProcessor.php +++ b/src/library/processors/JpgProcessor.php @@ -97,7 +97,7 @@ class JpgProcessor extends ImageProcessor file_put_contents($temp_file, $page); set_error_handler(null); $summary[self::DESCRIPTION] = "$file_name\nEXIF DATA\n". - print_r(exif_read_data($temp_file), true); + print_r(@exif_read_data($temp_file), true); set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); } else { $summary[self::DESCRIPTION] = $file_name;