Attempts to tweak the queue so it won't get clogged with waiting crawl-delayed urls or urls with quotas, a=chris

Chris Pollett [2019-07-04 02h]

Attempts to tweak the queue so it won't get clogged with waiting crawl-delayed urls or urls with quotas, a=chris

Filename
src/configs/Config.php
src/executables/QueueServer.php
src/library/BloomFilterBundle.php

diff --git a/src/configs/Config.php b/src/configs/Config.php
index 233bf16b9..aa2be2f1c 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -656,6 +656,10 @@ nsconddefine('USE_ETAG_EXPIRES', true);
 nsconddefine('MAXIMUM_CRAWL_DELAY', 64);
 /** maximum number of active crawl-delayed hosts */
 nsconddefine('MAX_WAITING_HOSTS', 250);
+/** maximum fraction of the queue that may consist of crawl-delayed urls
+ * that are waiting; past this, such urls get deleted from the queue
+ */
+nsconddefine('WAITING_URL_FRACTION', 0.1);
 /** Minimum weight in priority queue before rebuild */
 nsconddefine('MIN_QUEUE_WEIGHT', 83);
 /**  largest sized object allowed in a web archive (used to sanity check
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index a4c1b7890..b312340b9 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -659,6 +659,7 @@ class QueueServer implements CrawlConstants, Join
             foreach ($save_point_files as $save_point_file) {
                 @unlink($save_point_file);
             }
+            $this->waiting_hosts = [];
             $this->initializeWebQueue();
             $dir_name = C\CRAWL_DIR . '/cache/' . self::double_index_base_name .
                 $this->crawl_time;
@@ -1114,7 +1115,7 @@ class QueueServer implements CrawlConstants, Join
             return;
         }
         L\crawlLog("Writing queue contents back to schedules...");
-        $dir = C\CRAWL_DIR."/schedules/" . self::schedule_data_base_name .
+        $dir = C\CRAWL_DIR . "/schedules/" . self::schedule_data_base_name .
             $this->crawl_time;
         if (!file_exists($dir)) {
             mkdir($dir);
@@ -1296,6 +1297,7 @@ class QueueServer implements CrawlConstants, Join
         if ($update_disallow == true) {
             $this->updateDisallowedQuotaSites();
         }
+        $this->waiting_hosts = [];
         $this->initializeWebQueue();
         $this->initializeIndexBundle($info, $try_to_set_from_old_index);
         $info[self::STATUS] = self::CONTINUE_STATE;
@@ -1957,7 +1959,7 @@ class QueueServer implements CrawlConstants, Join
         }
         L\crawlLog("Scheduler: Checking for robots.txt files to process...");
         $robot_dir = C\CRAWL_DIR."/schedules/".
-            self::robot_data_base_name.$this->crawl_time;
+            self::robot_data_base_name . $this->crawl_time;
         $this->processDataFile($robot_dir, "processRobotArchive");
         L\crawlLog("Scheduler done robot check and process. ");
     }
@@ -2242,7 +2244,8 @@ class QueueServer implements CrawlConstants, Join
                     $this->web_queue->adjustQueueWeight($url,
                         $weight, false);
                 } else if ($this->allowedToCrawlSite($url) &&
-                    !$this->disallowedToCrawlSite($url)) {
+                    !$this->disallowedToCrawlSite($url) &&
+                    $this->withinQuota($url, 0)) {
                     if (!$this->web_queue->containsGotRobotTxt($host_url)
                         && !$robots_in_queue
                         && !isset($added_urls[$host_with_robots])
@@ -2310,7 +2313,7 @@ class QueueServer implements CrawlConstants, Join
                 $data_string = L\webencode(
                     gzcompress(serialize($schedule_data)));
                 $data_hash = L\crawlHash($data_string);
-                file_put_contents($dir."/At" . $this->crawl_time .
+                file_put_contents($dir . "/At" . $this->crawl_time .
                     "From127-0-0-1WithHash$data_hash.txt", $data_string);
                 $data_string = "";
                 $schedule_data[self::TO_CRAWL] = [];
@@ -2471,6 +2474,10 @@ class QueueServer implements CrawlConstants, Join
                 call slots. Crawled-delayed host urls are spaced by a certain
                 number of slots
         */
+        $num_waiting_urls = 0;
+        $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
+        $max_queue_size =  C\NUM_URLS_QUEUE_RAM -
+            C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links;
         while ($i <= $count && $fetch_size < C\MAX_FETCH_SIZE) {
             L\crawlTimeoutLog("..Scheduler: still producing fetch batch. ".
                 "Examining location %s in queue of %s.", $i, $count);
@@ -2576,7 +2583,7 @@ class QueueServer implements CrawlConstants, Join
                 $delay = $this->web_queue->getCrawlDelay($host_url);
             }
             if (!$this->withinQuota($url)) {
-                //we've not allowed to schedule $url till next hour
+                //we're not allowed to schedule $url till next hour
                 $delete_urls[$i] = $url;
                 //delete from queue (so no clog) but don't mark seen
                 $i++;
@@ -2625,9 +2632,18 @@ class QueueServer implements CrawlConstants, Join
                            seen after only scheduling them
                          */
                         $fetch_size++;
-                    } else if ($no_flags) {
-                        $this->web_queue->setQueueFlag($url,
-                            $delay + WebQueueBundle::SCHEDULABLE);
+                    } else {
+                        if ($num_waiting_urls <
+                            C\WAITING_URL_FRACTION * $max_queue_size) {
+                            $num_waiting_urls++;
+                            if ($no_flags) {
+                                $this->web_queue->setQueueFlag($url,
+                                    $delay + WebQueueBundle::SCHEDULABLE);
+                            }
+                        } else {
+                            // has crawl delay but too many already waiting
+                            $delete_urls[$i] = $url;
+                        }
                     }
                 } else if (!$is_waiting_host) {
                     // has crawl delay but too many already waiting
@@ -2662,6 +2678,8 @@ class QueueServer implements CrawlConstants, Join
             "so far:". L\changeInMicrotime($start_time));
         L\crawlLog("...Scheduler: Examined urls while making fetch batch:" .
             ($i - 1));
+        L\crawlLog("...Scheduler: Number of waiting urls seen in queue:" .
+            $num_waiting_urls);
         $num_deletes = count($delete_urls);
         $k = 0;
         foreach ($delete_urls as $delete_url) {
@@ -2898,9 +2916,11 @@ class QueueServer implements CrawlConstants, Join
      * This method also resets the quota queue every over
      *
      * @param string $url to check if within quota
+     * @param int $bump_count how much to bump quota count if url is from a
+     *      site with a quota
      * @return bool whether $url exceeds the hourly quota of the site it is from
      */
-    public function withinQuota($url)
+    public function withinQuota($url, $bump_count = 1)
     {
         if (!($site = UrlParser::urlMemberSiteArray(
             $url, $this->quota_sites_keys,
@@ -2909,7 +2929,7 @@ class QueueServer implements CrawlConstants, Join
         }
         list($quota, $current_count) = $this->quota_sites[$site];
         if ($current_count < $quota) {
-            $this->quota_sites[$site] = [$quota, $current_count + 1];
+            $this->quota_sites[$site] = [$quota, $current_count + $bump_count];
             $flag = true;
         } else {
             L\crawlLog("Quota exceeded removing " .
diff --git a/src/library/BloomFilterBundle.php b/src/library/BloomFilterBundle.php
index 1cdd0e989..04ae13dc0 100644
--- a/src/library/BloomFilterBundle.php
+++ b/src/library/BloomFilterBundle.php
@@ -93,7 +93,7 @@ class BloomFilterBundle
         $this->loadMetaData();
         if ($this->num_filters == 0) {
             $this->current_filter =
-                new BloomFilterFile($dir_name."/filter_0.ftr", $filter_size);
+                new BloomFilterFile($dir_name . "/filter_0.ftr", $filter_size);
             $this->num_filters++;
             $this->filter_size = $filter_size;
             $this->current_filter->save();
@@ -120,8 +120,8 @@ class BloomFilterBundle
             garbageCollect();
             $last_filter = $this->num_filters;
             $this->current_filter =
-                new BloomFilterFile($this->dir_name."/filter_$last_filter.ftr",
-                    $this->filter_size);
+                new BloomFilterFile($this->dir_name .
+                    "/filter_$last_filter.ftr", $this->filter_size);
             $this->current_filter_count = 0;
             $this->num_filters++;
             $this->saveMetaData();
@@ -146,13 +146,12 @@ class BloomFilterBundle
             if ($i == $num_filters - 1) {
                 $tmp_filter = $this->current_filter;
             } else {
-                $tmp_filter =
-                    BloomFilterFile::load($this->dir_name."/filter_$i.ftr");
+                $tmp_filter = BloomFilterFile::load($this->dir_name .
+                    "/filter_$i.ftr");
             }
-
             for ($j = 0; $j < $count; $j++) {
                 if ($field_names === null) {
-                    $tmp = & $arr[$j];
+                    $tmp = $arr[$j];
                     if ($tmp !== false && $tmp_filter->contains($tmp)) {
                     /*
                         We deliberately don't try to add anything that has
@@ -165,7 +164,7 @@ class BloomFilterBundle
                     }
                 } else { //now do the same strategy for the array of fields case
                     foreach ($field_names as $field_name) {
-                        $tmp = & $arr[$j][$field_name];
+                        $tmp = $arr[$j][$field_name];
                         if ($tmp !== false && $tmp_filter->contains($tmp)) {
                             unset($arr[$j]);
                             break;
@@ -186,7 +185,7 @@ class BloomFilterBundle
      */
     public function loadMetaData()
     {
-        if (file_exists($this->dir_name.'/meta.txt')) {
+        if (file_exists($this->dir_name . '/meta.txt')) {
             $meta = unserialize(
                 file_get_contents($this->dir_name.'/meta.txt') );
             $this->num_filters = $meta['NUM_FILTERS'];

ViewGit