diff --git a/src/configs/Config.php b/src/configs/Config.php
index 233bf16b9..aa2be2f1c 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -656,6 +656,10 @@ nsconddefine('USE_ETAG_EXPIRES', true);
 nsconddefine('MAXIMUM_CRAWL_DELAY', 64);
 /** maximum number of active crawl-delayed hosts */
 nsconddefine('MAX_WAITING_HOSTS', 250);
+/** maximum fraction of URLS in the Queue that are crawl-delayed and waiting
+ * before delete from queue new crawl-delayed urls
+ */
+nsconddefine('WAITING_URL_FRACTION', 0.1);
 /** Minimum weight in priority queue before rebuild */
 nsconddefine('MIN_QUEUE_WEIGHT', 83);
 /** largest sized object allowed in a web archive (used to sanity check
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index a4c1b7890..b312340b9 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -659,6 +659,7 @@ class QueueServer implements CrawlConstants, Join
         foreach ($save_point_files as $save_point_file) {
             @unlink($save_point_file);
         }
+        $this->waiting_hosts = [];
         $this->initializeWebQueue();
         $dir_name = C\CRAWL_DIR . '/cache/' . self::double_index_base_name .
             $this->crawl_time;
@@ -1114,7 +1115,7 @@ class QueueServer implements CrawlConstants, Join
             return;
         }
         L\crawlLog("Writing queue contents back to schedules...");
-        $dir = C\CRAWL_DIR."/schedules/" . self::schedule_data_base_name .
+        $dir = C\CRAWL_DIR . "/schedules/" . self::schedule_data_base_name .
             $this->crawl_time;
         if (!file_exists($dir)) {
             mkdir($dir);
         }
@@ -1296,6 +1297,7 @@ class QueueServer implements CrawlConstants, Join
         if ($update_disallow == true) {
             $this->updateDisallowedQuotaSites();
         }
+        $this->waiting_hosts = [];
         $this->initializeWebQueue();
         $this->initializeIndexBundle($info, $try_to_set_from_old_index);
         $info[self::STATUS] = self::CONTINUE_STATE;
@@ -1957,7 +1959,7 @@ class QueueServer implements CrawlConstants, Join
         }
         L\crawlLog("Scheduler: Checking for robots.txt files to process...");
         $robot_dir = C\CRAWL_DIR."/schedules/".
-            self::robot_data_base_name.$this->crawl_time;
+            self::robot_data_base_name . $this->crawl_time;
         $this->processDataFile($robot_dir, "processRobotArchive");
         L\crawlLog("Scheduler done robot check and process. ");
     }
@@ -2242,7 +2244,8 @@ class QueueServer implements CrawlConstants, Join
                 $this->web_queue->adjustQueueWeight($url, $weight, false);
             } else if ($this->allowedToCrawlSite($url) &&
-                !$this->disallowedToCrawlSite($url)) {
+                !$this->disallowedToCrawlSite($url) &&
+                $this->withinQuota($url, 0)) {
                 if (!$this->web_queue->containsGotRobotTxt($host_url) &&
                     !$robots_in_queue &&
                     !isset($added_urls[$host_with_robots])
@@ -2310,7 +2313,7 @@ class QueueServer implements CrawlConstants, Join
             $data_string = L\webencode(
                 gzcompress(serialize($schedule_data)));
             $data_hash = L\crawlHash($data_string);
-            file_put_contents($dir."/At" . $this->crawl_time .
+            file_put_contents($dir . "/At" . $this->crawl_time .
                 "From127-0-0-1WithHash$data_hash.txt", $data_string);
             $data_string = "";
             $schedule_data[self::TO_CRAWL] = [];
@@ -2471,6 +2474,10 @@ class QueueServer implements CrawlConstants, Join
            call slots. Crawled-delayed host urls are spaced by a certain
            number of slots
         */
+        $num_waiting_urls = 0;
+        $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
+        $max_queue_size = C\NUM_URLS_QUEUE_RAM -
+            C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links;
         while ($i <= $count && $fetch_size < C\MAX_FETCH_SIZE) {
             L\crawlTimeoutLog("..Scheduler: still producing fetch batch. ".
                 "Examining location %s in queue of %s.", $i, $count);
@@ -2576,7 +2583,7 @@ class QueueServer implements CrawlConstants, Join
                 $delay = $this->web_queue->getCrawlDelay($host_url);
             }
             if (!$this->withinQuota($url)) {
-                //we've not allowed to schedule $url till next hour
+                //we're not allowed to schedule $url till next hour
                 $delete_urls[$i] = $url;
                 //delete from queue (so no clog) but don't mark seen
                 $i++;
@@ -2625,9 +2632,18 @@ class QueueServer implements CrawlConstants, Join
                        seen after only scheduling them
                     */
                     $fetch_size++;
-                } else if ($no_flags) {
-                    $this->web_queue->setQueueFlag($url,
-                        $delay + WebQueueBundle::SCHEDULABLE);
+                } else {
+                    if ($num_waiting_urls <
+                        C\WAITING_URL_FRACTION * $max_queue_size) {
+                        $num_waiting_urls++;
+                        if ($no_flags) {
+                            $this->web_queue->setQueueFlag($url,
+                                $delay + WebQueueBundle::SCHEDULABLE);
+                        }
+                    } else {
+                        // has crawl delay but too many already waiting
+                        $delete_urls[$i] = $url;
+                    }
                 }
             } else if (!$is_waiting_host) {
                 // has crawl delay but too many already waiting
@@ -2662,6 +2678,8 @@ class QueueServer implements CrawlConstants, Join
             "so far:". L\changeInMicrotime($start_time));
         L\crawlLog("...Scheduler: Examined urls while making fetch batch:" .
             ($i - 1));
+        L\crawlLog("...Scheduler: Number of waiting urls seen in queue:" .
+            $num_waiting_urls);
         $num_deletes = count($delete_urls);
         $k = 0;
         foreach ($delete_urls as $delete_url) {
@@ -2898,9 +2916,11 @@ class QueueServer implements CrawlConstants, Join
      * This method also resets the quota queue every over
      *
      * @param string $url to check if within quota
+     * @param int $bump_count how much to bump quota count if url is from a
+     *      site with a quota
      * @return bool whether $url exceeds the hourly quota of the site it is from
      */
-    public function withinQuota($url)
+    public function withinQuota($url, $bump_count = 1)
     {
         if (!($site = UrlParser::urlMemberSiteArray(
             $url, $this->quota_sites_keys,
@@ -2909,7 +2929,7 @@ class QueueServer implements CrawlConstants, Join
         }
         list($quota, $current_count) = $this->quota_sites[$site];
         if ($current_count < $quota) {
-            $this->quota_sites[$site] = [$quota, $current_count + 1];
+            $this->quota_sites[$site] = [$quota, $current_count + $bump_count];
             $flag = true;
         } else {
             L\crawlLog("Quota exceeded removing " .
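Note on the QueueServer.php hunks above: the new WAITING_URL_FRACTION constant caps what fraction of the queue may be taken up by crawl-delayed urls that are merely waiting for their delay window. Once the cap is reached, further crawl-delayed urls are deleted from the queue instead of being flagged SCHEDULABLE, and withinQuota($url, 0) is now called when a url is first queued so a site's hourly quota can be checked without consuming it. Below is a minimal standalone sketch of the cap decision, not Yioop code; every constant except WAITING_URL_FRACTION is an assumed placeholder, and the returned strings stand in for the real WebQueueBundle calls.

<?php
// Placeholder values assumed for illustration; in Yioop these come from
// src/configs/Config.php and may differ.
const NUM_URLS_QUEUE_RAM = 300000;
const SEEN_URLS_BEFORE_UPDATE_SCHEDULER = 1000;
const MAX_LINKS_PER_PAGE = 50;
const MAX_LINKS_PER_SITEMAP = 80;
const WAITING_URL_FRACTION = 0.1;   // the constant this patch adds

// Same arithmetic as in the produceFetchBatch() hunk above: estimate the
// usable queue size, then derive the cap on waiting crawl-delayed urls.
$max_links = max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP);
$max_queue_size = NUM_URLS_QUEUE_RAM -
    SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links;

/**
 * Decide what to do with a crawl-delayed url that cannot go into the current
 * fetch batch: keep it waiting in the queue or delete it to avoid clogging.
 */
function crawlDelayedAction(&$num_waiting_urls, $max_queue_size)
{
    if ($num_waiting_urls < WAITING_URL_FRACTION * $max_queue_size) {
        $num_waiting_urls++;
        return "wait";   // patch: setQueueFlag($url, $delay + SCHEDULABLE)
    }
    return "delete";     // patch: $delete_urls[$i] = $url
}
$num_waiting_urls = 0;
echo crawlDelayedAction($num_waiting_urls, $max_queue_size), "\n"; // "wait"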
"Examining location %s in queue of %s.", $i, $count); @@ -2576,7 +2583,7 @@ class QueueServer implements CrawlConstants, Join $delay = $this->web_queue->getCrawlDelay($host_url); } if (!$this->withinQuota($url)) { - //we've not allowed to schedule $url till next hour + //we're not allowed to schedule $url till next hour $delete_urls[$i] = $url; //delete from queue (so no clog) but don't mark seen $i++; @@ -2625,9 +2632,18 @@ class QueueServer implements CrawlConstants, Join seen after only scheduling them */ $fetch_size++; - } else if ($no_flags) { - $this->web_queue->setQueueFlag($url, - $delay + WebQueueBundle::SCHEDULABLE); + } else { + if ($num_waiting_urls < + C\WAITING_URL_FRACTION * $max_queue_size) { + $num_waiting_urls++; + if ($no_flags) { + $this->web_queue->setQueueFlag($url, + $delay + WebQueueBundle::SCHEDULABLE); + } + } else { + // has crawl delay but too many already waiting + $delete_urls[$i] = $url; + } } } else if (!$is_waiting_host) { // has crawl delay but too many already waiting @@ -2662,6 +2678,8 @@ class QueueServer implements CrawlConstants, Join "so far:". L\changeInMicrotime($start_time)); L\crawlLog("...Scheduler: Examined urls while making fetch batch:" . ($i - 1)); + L\crawlLog("...Scheduler: Number of waiting urls seen in queue:" . + $num_waiting_urls); $num_deletes = count($delete_urls); $k = 0; foreach ($delete_urls as $delete_url) { @@ -2898,9 +2916,11 @@ class QueueServer implements CrawlConstants, Join * This method also resets the quota queue every over * * @param string $url to check if within quota + * @param int $bump_count how much to bump quota count if url is from a + * site with a quota * @return bool whether $url exceeds the hourly quota of the site it is from */ - public function withinQuota($url) + public function withinQuota($url, $bump_count = 1) { if (!($site = UrlParser::urlMemberSiteArray( $url, $this->quota_sites_keys, @@ -2909,7 +2929,7 @@ class QueueServer implements CrawlConstants, Join } list($quota, $current_count) = $this->quota_sites[$site]; if ($current_count < $quota) { - $this->quota_sites[$site] = [$quota, $current_count + 1]; + $this->quota_sites[$site] = [$quota, $current_count + $bump_count]; $flag = true; } else { L\crawlLog("Quota exceeded removing " . diff --git a/src/library/BloomFilterBundle.php b/src/library/BloomFilterBundle.php index 1cdd0e989..04ae13dc0 100644 --- a/src/library/BloomFilterBundle.php +++ b/src/library/BloomFilterBundle.php @@ -93,7 +93,7 @@ class BloomFilterBundle $this->loadMetaData(); if ($this->num_filters == 0) { $this->current_filter = - new BloomFilterFile($dir_name."/filter_0.ftr", $filter_size); + new BloomFilterFile($dir_name . "/filter_0.ftr", $filter_size); $this->num_filters++; $this->filter_size = $filter_size; $this->current_filter->save(); @@ -120,8 +120,8 @@ class BloomFilterBundle garbageCollect(); $last_filter = $this->num_filters; $this->current_filter = - new BloomFilterFile($this->dir_name."/filter_$last_filter.ftr", - $this->filter_size); + new BloomFilterFile($this->dir_name . + "/filter_$last_filter.ftr", $this->filter_size); $this->current_filter_count = 0; $this->num_filters++; $this->saveMetaData(); @@ -146,13 +146,12 @@ class BloomFilterBundle if ($i == $num_filters - 1) { $tmp_filter = $this->current_filter; } else { - $tmp_filter = - BloomFilterFile::load($this->dir_name."/filter_$i.ftr"); + $tmp_filter = BloomFilterFile::load($this->dir_name . 
+ "/filter_$i.ftr"); } - for ($j = 0; $j < $count; $j++) { if ($field_names === null) { - $tmp = & $arr[$j]; + $tmp = $arr[$j]; if ($tmp !== false && $tmp_filter->contains($tmp)) { /* We deliberately don't try to add anything that has @@ -165,7 +164,7 @@ class BloomFilterBundle } } else { //now do the same strategy for the array of fields case foreach ($field_names as $field_name) { - $tmp = & $arr[$j][$field_name]; + $tmp = $arr[$j][$field_name]; if ($tmp !== false && $tmp_filter->contains($tmp)) { unset($arr[$j]); break; @@ -186,7 +185,7 @@ class BloomFilterBundle */ public function loadMetaData() { - if (file_exists($this->dir_name.'/meta.txt')) { + if (file_exists($this->dir_name . '/meta.txt')) { $meta = unserialize( file_get_contents($this->dir_name.'/meta.txt') ); $this->num_filters = $meta['NUM_FILTERS'];