viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 0ddf7436a..9750c4475 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -687,7 +687,7 @@ class Fetcher implements CrawlConstants } $this->to_crawl_again = []; $this->found_sites = []; - gc_collect_cycles(); + L\garbageCollect(); $this->web_archive = new WebArchiveBundle($tmp_base_name, false); $this->crawl_time = $info[self::CRAWL_TIME]; @@ -2588,7 +2588,7 @@ class Fetcher implements CrawlConstants $current_server]; L\crawlLog(".....add inverted index string."); unset($this->found_sites[self::INVERTED_INDEX][$current_server]); - gc_collect_cycles(); + L\garbageCollect(); $data = L\webencode($out_string); L\crawlLog(".....web encode result."); // don't compress index data diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index de76315be..7f2d726d8 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -683,7 +683,7 @@ class QueueServer implements CrawlConstants, Join */ $this->deleteOrphanedBundles(); $this->processIndexData($blocking); - if (time() - $this->last_index_save_time > C\FORCE_SAVE_TIME){ + if (time() - $this->last_index_save_time > C\FORCE_SAVE_TIME) { L\crawlLog("Periodic Index Save... \n"); $start_time = microtime(true); $this->indexSave(); @@ -1281,7 +1281,10 @@ class QueueServer implements CrawlConstants, Join } } /* We now do further processing or disallowed sites to see if any - of them are really quota sites + of them are really quota sites. We want both indexer and scheduler + to be aware of the changes as the indexer is responsible for + storing the values persistently into the IndexArchiveBundle + in case we need to resume a crawl. */ if ($update_disallow == true) { $this->updateDisallowedQuotaSites(); @@ -1409,7 +1412,7 @@ class QueueServer implements CrawlConstants, Join } else { $this->index_archive = null; } - gc_collect_cycles(); // garbage collect old crawls + L\garbageCollect(); // garbage collect old crawls } /** * Delete all the urls from the web queue does not affect filters @@ -1491,7 +1494,10 @@ class QueueServer implements CrawlConstants, Join } } /* We now do further processing or disallowed sites to see if any - of them are really quota sites + of them are really quota sites. We want both indexer and scheduler + to be aware of the changes as the indexer is responsible for + storing the values persistently into the IndexArchiveBundle + in case we need to resume a crawl. */ if ($update_disallow == true) { $this->updateDisallowedQuotaSites(); @@ -1794,7 +1800,7 @@ class QueueServer implements CrawlConstants, Join L\crawlHash($link_url_parts[1], true) . L\crawlHash($seen_sites[$i][self::URL], true) . $reftype . substr(L\crawlHash( - UrlParser::getHost($link_url_parts[5])."/", true), 1); + UrlParser::getHost($link_url_parts[5]) . "/", true), 1); $seen_sites[$i][self::IS_DOC] = false; } else { $seen_sites[$i][self::IS_DOC] = true; @@ -1822,7 +1828,7 @@ class QueueServer implements CrawlConstants, Join $generation, self::SUMMARY_OFFSET, $seen_sites, $visited_urls_count); foreach ($seen_sites as $site) { - if ($site[self::IS_DOC]){ // so not link + if ($site[self::IS_DOC]) { // so not link $site_url = str_replace('|', "%7C", $site[self::URL]); $host = UrlParser::getHost($site_url); $hash = L\crawlHash($site_url, true). diff --git a/src/library/BloomFilterBundle.php b/src/library/BloomFilterBundle.php index 05b885947..1cdd0e989 100644 --- a/src/library/BloomFilterBundle.php +++ b/src/library/BloomFilterBundle.php @@ -30,6 +30,10 @@ */ namespace seekquarry\yioop\library; +/** + * Used for garbageCollect + */ +require_once __DIR__ . '/Utility.php'; /** * * A BloomFilterBundle is a directory of BloomFilterFile. @@ -97,7 +101,7 @@ class BloomFilterBundle } else { $last_filter = $this->num_filters - 1; $this->current_filter = - BloomFilterFile::load($dir_name."/filter_$last_filter.ftr"); + BloomFilterFile::load($dir_name . "/filter_$last_filter.ftr"); } } /** @@ -113,7 +117,7 @@ class BloomFilterBundle if ($this->current_filter_count >= $this->filter_size) { $this->current_filter->save(); $this->current_filter = null; - gc_collect_cycles(); + garbageCollect(); $last_filter = $this->num_filters; $this->current_filter = new BloomFilterFile($this->dir_name."/filter_$last_filter.ftr", diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index 680d8edd0..28fd86628 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -33,9 +33,9 @@ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; /** - * Used for crawlLog and crawlHash + * Used for crawlLog, crawlHash, and garbageCollect */ -require_once __DIR__.'/Utility.php'; +require_once __DIR__ . '/Utility.php'; /** * Encapsulates a set of web page summaries and an inverted word-index of terms * from these summaries which allow one to search for summaries containing a @@ -220,10 +220,11 @@ class IndexArchiveBundle implements CrawlConstants crawlLog("Current index shard has " . $current_num_docs . " documents."); $memory_limit = metricToInt(ini_get("memory_limit")); - crawlLog("Memory Indexer limit is ".$memory_limit.". Usage is ". - memory_get_usage()); + $before_usage = memory_get_usage(); + crawlLog("Indexer Memory limit is " . $memory_limit . ". Usage is ". + $before_usage); if ($current_num_docs + $add_num_docs > $this->num_docs_per_generation - || (0.55 * $memory_limit) < memory_get_usage() ) { + || (0.65 * $memory_limit) < $before_usage ) { if ($blocking == true) { return -1; } @@ -232,10 +233,16 @@ class IndexArchiveBundle implements CrawlConstants // Save current shard dictionary to main dictionary $this->forceSave(); $this->addAdvanceGeneration($callback); - $num_freed = gc_collect_cycles(); + $num_freed= garbageCollect(); crawlLog("Indexer force running garbage collector after generation". - "advance. This freed " . $num_freed . " bytes."); - + " advance. This freed " . $num_freed . " bytes."); + $after_usage = memory_get_usage(); + crawlLog("Indexer after switch memory usage: $after_usage"); + if ((0.65 * $memory_limit) < $after_usage) { + crawlLog("Index Shard Switching did not free sufficiently ". + "memory, exiting"); + exit(); + } crawlLog("Switch Index Shard time:". changeInMicrotime($switch_time)); } diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php index f362ee882..5f7bd3c48 100644 --- a/src/library/IndexDictionary.php +++ b/src/library/IndexDictionary.php @@ -33,7 +33,7 @@ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; /** For Yioop global defines */ -require_once __DIR__."/../configs/Config.php"; +require_once __DIR__ . "/../configs/Config.php"; /** * Data structure used to store for entries of the form: * word id, index shard generation, posting list offset, and length of @@ -167,7 +167,7 @@ class IndexDictionary implements CrawlConstants $this->max_tier = 0; } else { $this->max_tier = unserialize( - file_get_contents($this->dir_name."/max_tier.txt")); + file_get_contents($this->dir_name . "/max_tier.txt")); $this->calculateActiveTiers(); } $this->parent_archive_bundle = $parent_archive_bundle; @@ -182,7 +182,7 @@ class IndexDictionary implements CrawlConstants public function calculateActiveTiers() { $this->read_tier = $this->max_tier; - $tiers = glob($this->dir_name."/0/*A.dic"); + $tiers = glob($this->dir_name . "/0/*A.dic"); natsort($tiers); $this->active_tiers = []; foreach ($tiers as $tier) { @@ -328,8 +328,8 @@ class IndexDictionary implements CrawlConstants */ public function mergeTierFiles($prefix, $tier, $out_slot) { - $file_a = $this->dir_name."/$prefix/$tier"."A.dic"; - $file_b = $this->dir_name."/$prefix/$tier"."B.dic"; + $file_a = $this->dir_name . "/$prefix/$tier"."A.dic"; + $file_b = $this->dir_name . "/$prefix/$tier"."B.dic"; $size_a = filesize($file_a); $size_b = filesize($file_b); $prefix_header_size = self::PREFIX_HEADER_SIZE; diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index e00dbf893..b2a8eb17a 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -772,7 +772,6 @@ class UrlParser static $uncomputed = true; $host = UrlParser::getHost($url, false); if ($uncomputed) { - $localhosts = ["localhost", "127.0.0.1", "::1"]; if (isset($_SERVER["SERVER_NAME"])) { $localhosts[] = $_SERVER["SERVER_NAME"]; $localhosts[] = gethostbyname($_SERVER["SERVER_NAME"]); @@ -780,7 +779,7 @@ class UrlParser if (isset($_SERVER["SERVER_ADDR"])) { $localhosts[] = $_SERVER["SERVER_ADDR"]; } - $uncomputed = true; + $uncomputed = false; } foreach ($localhosts as $localhost) { if (stristr($host, $localhost)) { diff --git a/src/library/Utility.php b/src/library/Utility.php index 541ed1138..809df4bc2 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -2372,3 +2372,18 @@ function bchexdec($hex) } return $dec; } +/** + * Runs various system garbage collection functions and returns + * number of bytes freed. + * + * @return int number of bytes freed + */ +function garbageCollect() +{ + $bytes_collected = 0; + if (function_exists("gc_mem_caches")) { + $bytes_collected += gc_mem_caches(); + } + $bytes_collected += gc_collect_cycles(); + return $bytes_collected; +} diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php index b1cfdcd20..4fecdded2 100755 --- a/src/library/WebArchive.php +++ b/src/library/WebArchive.php @@ -162,7 +162,9 @@ class WebArchive */ public function writeInfoBlock($fh = null, &$data = null) { - if ($this->is_string) return; + if ($this->is_string) { + return; + } $compressed_int_len = $this->compressor->compressedIntLen(); $open_flag = false; if ($fh == null) { diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php index 4233d9200..6f0ef6960 100755 --- a/src/library/WebArchiveBundle.php +++ b/src/library/WebArchiveBundle.php @@ -156,7 +156,7 @@ class WebArchiveBundle $info['WRITE_PARTITION'] = $this->write_partition; } file_put_contents( - $this->dir_name."/description.txt", serialize($info), + $this->dir_name . "/description.txt", serialize($info), LOCK_EX); } } @@ -310,7 +310,7 @@ class WebArchiveBundle $this->count = $info[$field]; } if (!$this->read_only_archive) { - file_put_contents($this->dir_name."/description.txt", + file_put_contents($this->dir_name . "/description.txt", serialize($info), LOCK_EX); } } @@ -346,9 +346,9 @@ class WebArchiveBundle */ public static function setArchiveInfo($dir_name, $info) { - if (file_exists($dir_name."/description.txt") && ((isset($this) && + if (file_exists($dir_name . "/description.txt") && ((isset($this) && !$this->read_only_archive) || !isset($this))) { - file_put_contents($dir_name."/description.txt", serialize($info), + file_put_contents($dir_name . "/description.txt", serialize($info), LOCK_EX); } } @@ -359,9 +359,9 @@ class WebArchiveBundle */ public static function getParamModifiedTime($dir_name) { - if (file_exists($dir_name."/description.txt")) { + if (file_exists($dir_name . "/description.txt")) { clearstatcache(); - return filemtime($dir_name."/description.txt"); + return filemtime($dir_name . "/description.txt"); } return false; } diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php index a402740da..09b05ff98 100755 --- a/src/library/WebQueueBundle.php +++ b/src/library/WebQueueBundle.php @@ -36,7 +36,7 @@ use seekquarry\yioop\library\compressors\NonCompressor; /** * Used for the crawlHash function */ -require_once __DIR__.'/Utility.php'; +require_once __DIR__ . '/Utility.php'; /** * Encapsulates the data structures needed to have a queue of to crawl urls * @@ -825,7 +825,7 @@ class WebQueueBundle implements Notifier } } $this->to_crawl_table = null; - gc_collect_cycles(); + garbageCollect(); if (file_exists($this->dir_name."/hash_table.dat")) { unlink($this->dir_name."/hash_table.dat"); if (file_exists($this->dir_name."/tmp_table.dat")) { @@ -867,7 +867,7 @@ class WebQueueBundle implements Notifier $this->insertHashTable($hash_url, $data, $probe); } $this->to_crawl_archive = null; - gc_collect_cycles(); + garbageCollect(); $tmp_archive->filename = $url_archive_name; $this->to_crawl_archive = $tmp_archive; } @@ -895,7 +895,7 @@ class WebQueueBundle implements Notifier $this->crawl_delay_filter = null; $this->robot_archive = null; $this->robot_table = null; - gc_collect_cycles(); + garbageCollect(); $this->got_robottxt_filter = new BloomFilterFile( $this->dir_name."/got_robottxt.ftr", $this->filter_size); @@ -952,7 +952,7 @@ class WebQueueBundle implements Notifier unlink($this->dir_name . "/dns_table.dat"); } $this->dns_table = null; - gc_collect_cycles(); + garbageCollect(); $this->dns_table = new HashTable($this->dir_name . "/dns_table.dat", $num_values, self::HASH_KEY_SIZE, self::IP_SIZE); if ($this->robot_table) {