viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/configs/Config.php | |
src/controllers/SearchController.php | |
src/executables/Fetcher.php | |
src/library/Utility.php | |
src/models/PhraseModel.php | |
diff --git a/src/configs/Config.php b/src/configs/Config.php index e6f9c07f2..47e137718 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -632,6 +632,10 @@ nsconddefine('MIN_QUEUE_WEIGHT', 1/100000); nsconddefine('MAX_ARCHIVE_OBJECT_SIZE', 100000000); /** Treat earlier timestamps as being indexes of format version 0 */ nsconddefine('VERSION_0_TIMESTAMP', 1369754208); +/** Treat earlier timestamps as being indexes of format version 1 */ +nsconddefine('VERSION_1_TIMESTAMP', 1528045371); +/** What version format to use for default indexing **/ +nsconddefine('DEFAULT_CRAWL_FORMAT', 2); defineMemoryProfile(); /** * Code to determine how much memory current machine has diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index 009b490fa..febb9ab14 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -1404,7 +1404,10 @@ class SearchController extends Controller implements CrawlConstants if (count($instance_parts) > 1) { $instance_num = intval($instance_parts[0]); } - if (!empty($crawl_item[self::OFFSET])) { + if (!empty($crawl_item[self::PAGE])) { + // Version 2 or newer index doesn't store cache pages separately + $cache_item = $crawl_item; + } else if (!empty($crawl_item[self::OFFSET])) { $cache_partition = $crawl_item[self::CACHE_PAGE_PARTITION]; $cache_item = $crawl_model->getCacheFile($machine, $machine_uri, $cache_partition, $crawl_item[self::OFFSET], diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 346e50176..3d92faa31 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1780,16 +1780,22 @@ class Fetcher implements CrawlConstants $processor->scrapers = $this->scrapers; } $page = $site[self::PAGE]; + $empty_image = false; if (L\generalIsA($page_processor, C\NS_PROCESSORS. 
"ImageProcessor")) { if (!empty($site[self::CONTENT_SIZE]) && !empty($site[self::SIZE]) && $site[self::CONTENT_SIZE] > $site[self::SIZE]) { $page = ""; + $empty_image = true; } } - $doc_info = $processor->handle($page, - $site[self::URL]); + if ($empty_image) { + $doc_info = null; + } else { + $doc_info = $processor->handle($page, + $site[self::URL]); + } if (C\FETCHER_PROCESS_DELAY > 0 ) { usleep(C\FETCHER_PROCESS_DELAY); } @@ -1967,10 +1973,12 @@ class Fetcher implements CrawlConstants } // end for $num_pages = count($stored_site_pages); $filter_stored = array_filter($stored_site_pages); - if ($num_pages > 0 && $this->cache_pages) { + if (C\DEFAULT_CRAWL_FORMAT < 2 && + $num_pages > 0 && $this->cache_pages) { $cache_page_partition = $this->web_archive->addPages( self::OFFSET, $filter_stored); } else if ($num_pages > 0) { + // In newer format fetcher archive only counts num cache pages $this->web_archive->addCount(count($filter_stored)); } for ($i = 0; $i < $num_pages; $i++) { @@ -1978,14 +1986,19 @@ class Fetcher implements CrawlConstants } foreach ($filter_stored as $stored) { if (!isset($stored[self::INDEX]) ) { + if (C\DEFAULT_CRAWL_FORMAT >= 2) { + $summarized_site_pages[$i][self::PAGE] = ""; + } continue; } $i = $stored[self::INDEX]; - if (isset($stored[self::OFFSET])) { - $summarized_site_pages[$i][self::OFFSET] = - $stored[self::OFFSET]; - $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = - $cache_page_partition; + if (C\DEFAULT_CRAWL_FORMAT < 2) { + if (isset($stored[self::OFFSET])) { + $summarized_site_pages[$i][self::OFFSET] = + $stored[self::OFFSET]; + $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = + $cache_page_partition; + } } } L\crawlLog(" Process pages time: ".L\changeInMicrotime($start_time). 
@@ -2141,13 +2154,19 @@ class Fetcher implements CrawlConstants self::CACHE_PAGE_VALIDATORS]; foreach ($summary_fields as $field) { if (isset($site[$field])) { - $stored_site_pages[$i][$field] = $site[$field]; + if (C\DEFAULT_CRAWL_FORMAT < 2) { + $stored_site_pages[$i][$field] = $site[$field]; + } $summarized_site_pages[$i][$field] = $site[$field]; } } foreach ($stored_fields as $field) { if (isset($site[$field])) { - $stored_site_pages[$i][$field] = $site[$field]; + if (C\DEFAULT_CRAWL_FORMAT < 2) { + $stored_site_pages[$i][$field] = $site[$field]; + } else { + $summarized_site_pages[$i][$field] = $site[$field]; + } } } } @@ -2292,7 +2311,7 @@ class Fetcher implements CrawlConstants /* for log file get rid of non-utf-8 characters that latter make it hard to view the log */ - L\crawlLog($site_index.". $subdoc_info ". + L\crawlLog($site_index . ". $subdoc_info ". iconv("UTF-8", "ISO-8859-1//IGNORE", $site[self::URL])); } // end for L\crawlLog(" Done Update Found Sites Array Time ". @@ -2392,7 +2411,7 @@ class Fetcher implements CrawlConstants { $current_server = $this->current_server; $queue_server = $this->queue_servers[$current_server]; - L\crawlLog("Updating machine: ".$queue_server); + L\crawlLog("Updating machine: " . $queue_server); $prefix = $this->fetcher_num."-"; if (count($this->to_crawl) <= 0) { $schedule_time = $this->schedule_time; @@ -2925,7 +2944,7 @@ class Fetcher implements CrawlConstants } $interim_elapse = L\changeInMicrotime($interim_time); if ($interim_elapse > 5) { - L\crawlLog("..Inverting ".$site[self::URL]."...took > 5s."); + L\crawlLog("..Inverting " . $site[self::URL] . "...took > 5s."); } L\crawlTimeoutLog("..Still building inverted index. Have ". 
"processed %s of %s documents.\nLast url processed was %s.", diff --git a/src/library/Utility.php b/src/library/Utility.php index 0c9529911..aadb01782 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -978,8 +978,8 @@ function crawlHashWord($string, $raw = false, $meta_string = "") * @param bool $raw whether to base64 the result * @return array of hashes with appropriates shifts if needed */ -function allCrawlHashPaths($string, $metas = [], - $encode_metas = [], $raw = false) +function allCrawlHashPaths($string, $metas = [], $encode_metas = [], + $raw = false) { $mask = ""; if ($encode_metas != []) { @@ -1082,7 +1082,7 @@ function allCrawlHashPaths($string, $metas = [], $hashes[] = $hash; } if ($j == 0) {break; } - $path_string .= " ".$zero; + $path_string .= " " . $zero; } $pos = mb_strpos($string, " ", $pos + 1); $encode_metas = []; diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 7258c1821..17dd9532a 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -322,7 +322,7 @@ class PhraseModel extends ParallelModel $this->query_info['QUERY'] .= "$in2<i>Low</i>:". $result_bounds[0][0]."<br />"; $this->query_info['QUERY'] .= $in2 . - "<i>High</i>: ".$result_bounds[0][1]."<br />"; + "<i>High</i>: " . $result_bounds[0][1] . "<br />"; $prs_cnt++; } $cache_results = false; @@ -787,7 +787,8 @@ class PhraseModel extends ParallelModel foreach ($meta_words as $meta_word) { $pattern = "/(\s)($meta_word(\S)+)/"; preg_match_all($pattern, $phrase, $matches); - if (!in_array($meta_word, ['i:', 'index:', 'w:', 'weight:', '\-'])) { + if (!in_array($meta_word, + ['i:', 'index:', 'w:', 'weight:', '\-'])) { $matches = $matches[2]; $found_metas = array_merge($found_metas, $matches); if (in_array($meta_word, PhraseParser::$materialized_metas)) {