diff --git a/src/configs/Config.php b/src/configs/Config.php
index b3c56e6ef..537b25f1b 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -820,8 +820,6 @@ nsconddefine('USE_ETAG_EXPIRES', true);
* maximum value for this is 255
*/
nsconddefine('MAXIMUM_CRAWL_DELAY', 64);
-/** maximum number of active crawl-delayed hosts */
-nsconddefine('MAX_WAITING_HOSTS', 250);
/** maximum fraction of URLS in the Queue that are crawl-delayed and waiting
 * before new crawl-delayed urls are deleted from the queue
*/
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index e87396a5a..89c28338a 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1775,8 +1775,8 @@ class Fetcher implements CrawlConstants
&& (is_numeric($site[self::HTTP_CODE] ) &&
$site[self::HTTP_CODE] > 0 && $site[self::HTTP_CODE] != 416) ) {
$downloaded[] = $site;
- } else if (substr($site[self::URL], -10) == "robots.txt" &&
- $time - $this->crawl_time > C\ONE_DAY) {
+ } else if (substr($site[self::URL], -strlen("robots.txt"))
+ == "robots.txt" && $time - $this->crawl_time > C\ONE_DAY) {
/*
Assume slow to respond robots sites after the first day
of crawling are probably spammy (slow to respond and
@@ -2345,6 +2345,12 @@ class Fetcher implements CrawlConstants
$doc_type = UrlParser::getDocumentType($url);
if (!in_array($doc_type, $this->all_file_types)) {
$doc_type = "unknown";
+ $url_parts = explode("#", $url);
+ $url = $url_parts[0];
+ $fragment = $url_parts[1] ?? "";
+ if ($fragment == "sitemap") {
+ $doc_type = "xml";
+ }
}
if (!in_array($doc_type, $this->indexed_file_types)) {
continue;
@@ -2593,7 +2599,10 @@ class Fetcher implements CrawlConstants
$url = $link_urls[$i];
if (strlen($url) > 0) {
-                    // explicitly get rid of any fragment component of url
+                    // remove any fragment component of url, except a #sitemap marker
- $url = explode("#", $url)[0];
+ $fragment_parts = explode("#", $url);
+ if (($fragment_parts[1] ?? "") != "sitemap") {
+ $url = $fragment_parts[0];
+ }
if (strlen($url) == 0) {
continue;
}
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index dd390f43a..7853a5988 100644
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -256,15 +256,6 @@ class QueueServer implements CrawlConstants
* @var int
*/
public $crawl_time;
- /**
- * This is a list of hosts whose robots.txt file had a Crawl-delay directive
- * and which we have produced a schedule with urls for, but we have not
- * heard back from the fetcher who was processing those urls. Hosts on
- * this list will not be scheduled for more downloads until the fetcher
- * with earlier urls has gotten back to the queue server.
- * @var array
- */
- public $waiting_hosts;
/**
* IP address as a string of the fetcher that most recently spoke with the
* queue server.
@@ -411,7 +402,6 @@ class QueueServer implements CrawlConstants
$this->messages_bundle = null;
$this->indexing_plugins = [];
$this->indexing_plugins_data = [];
- $this->waiting_hosts = [];
$this->server_name = "IndexerAndScheduler";
$this->process_name = "0-QueueServer";
$this->crawl_status_file_name =
@@ -760,7 +750,6 @@ class QueueServer implements CrawlConstants
foreach ($save_point_files as $save_point_file) {
@unlink($save_point_file);
}
- $this->waiting_hosts = [];
$this->initializeCrawlQueue();
$dir_name = C\CACHE_DIR . "/" . self::double_index_base_name .
$this->crawl_time;
@@ -1289,7 +1278,6 @@ class QueueServer implements CrawlConstants
if ($update_disallow == true) {
$this->updateDisallowedQuotaSites();
}
- $this->waiting_hosts = [];
$this->initializeCrawlQueue();
$this->initializeIndexBundle($info, $try_to_set_from_old_index);
$messages_folder = C\SCHEDULES_DIR . "/" .
@@ -1844,8 +1832,7 @@ class QueueServer implements CrawlConstants
$crawl_delay = $robot_info[self::CRAWL_DELAY] ?? 0;
$row = ["HOSTNAME" => $robot_host,
"CAPTURE_TIME" => $time, "CRAWL_DELAY" => $crawl_delay,
- "ROBOT_PATHS" => $robot_info[self::ROBOT_PATHS] ?? [],
- "FLAGS" => 0];
+ "ROBOT_PATHS" => $robot_info[self::ROBOT_PATHS] ?? []];
$robot_rows[] = $row;
if (isset($robot_info[self::IP_ADDRESSES])) {
$final_ip = array_pop($robot_info[self::IP_ADDRESSES]);
@@ -1985,9 +1972,9 @@ class QueueServer implements CrawlConstants
}
if (isset($sites[self::SCHEDULE_TIME])) {
$start_time = microtime(true);
- L\crawlLog("...Scheduler Notify Hosts Crawl Delayed By Schedule ".
+ L\crawlLog("...Scheduler Notify Hosts Delayed By Schedule ".
$sites[self::SCHEDULE_TIME] . "...");
- $this->crawl_queue->notifyCrawlDelayedHosts(
+ $this->crawl_queue->notifyDelayedHosts(
$sites[self::SCHEDULE_TIME]);
L\crawlLog(" time: ". L\changeInMicrotime($start_time));
}
@@ -2036,17 +2023,19 @@ class QueueServer implements CrawlConstants
if ($url[0] != 'h' && trim($url) == "localhost") {
$url = "http://localhost/";
}
- $this->crawl_queue->addSeenUrlFilter($url);
$hard_coded = (strpos($url, "###!") > 0 );
$host_url = UrlParser::getHost($url);
$hash_host = L\crawlHash($host_url);
if (strlen($host_url) < $http_scheme_len) {
continue;
}
+ $this->crawl_queue->addSeenUrlFilter($url);
if (!$hard_coded) {
- if (!$this->allowedToCrawlSite($url) ||
- $this->disallowedToCrawlSite($url) ||
- !$this->withinQuota($url, 0)) {
+                //sitemaps will have #sitemap added to the url, so strip it
+ $check_url = explode("#", $url)[0];
+ if (!$this->allowedToCrawlSite($check_url) ||
+ $this->disallowedToCrawlSite($check_url) ||
+ !$this->withinQuota($check_url, 0)) {
continue;
}
}
@@ -2057,7 +2046,7 @@ class QueueServer implements CrawlConstants
} else {
$robot_time = $seen_robot_time[$hash_host];
}
            if (!$hard_coded && empty($robot_time)) {
if($robot_time === false &&
!isset($seen_robot_time[$hash_host])) {
$host_with_robots = ($scheme == "gopher") ? $host_url .
@@ -2368,6 +2357,7 @@ class QueueServer implements CrawlConstants
$host_url = UrlParser::getHost($url);
$hash_host = L\crawlHash($host_url);
$hard_coded = (strpos($url, "###!") > 0 );
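+            // strip any #sitemap marker fragment before the robot checks below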
+ $url = ($hard_coded) ? $url : explode("#", $url)[0];
$scheme = UrlParser::getScheme($host_url);
if ($scheme == "gopher") {
$is_robot =
@@ -2400,17 +2390,17 @@ class QueueServer implements CrawlConstants
$robots_okay = true;
} else if (!isset($hard_coded) || !$hard_coded) {
$robots_okay = $this->crawl_queue->checkRobotOkay($url);
+ if (!$this->allowedToCrawlSite($url) ||
+ $this->disallowedToCrawlSite($url)) {
+ /* This is checked when added to queue,
+ we check again here in case allowed and disallowed
+ sites have changed since then
+ */
+ $robots_okay = false;
+ }
} else {
$robots_okay = true;
}
- if (!$this->allowedToCrawlSite($url) ||
- $this->disallowedToCrawlSite($url)) {
- /* This is checked when added to queue,
- we check again here in case allowed and disallowed
- sites have changed since then
- */
- $robots_okay = false;
- }
if (!$robots_okay) {
L\crawlLog("FB Scheduler: $url removed by robot".
" policy");
@@ -2418,27 +2408,32 @@ class QueueServer implements CrawlConstants
}
$robot_data = $this->crawl_queue->getRobotData($host_url);
$delay = $robot_data['CRAWL_DELAY'] ?? 0;
- $flags = $robot_data['FLAGS'] ?? 0;
+ $is_delayed = $this->crawl_queue->isDelayedHost($host_url);
+ if ($is_delayed) {
+ L\crawlLog("$host_url is delayed");
+ }
/* if company level domain delays, then we're delayed
as well
*/
$cld = UrlParser::getCompanyLevelDomain($host_url);
$cld_host = UrlParser::getScheme($host_url) . "://" . $cld;
- $cld_flags = 0;
$use_cld = false;
if ($cld_host != $host_url) {
$cld_robot_data = $this->crawl_queue->getRobotData($cld_host);
- $cld_flags = $cld_robot_data['FLAGS'] ?? 0;
if ($delay == 0) {
$delay = $cld_robot_data['CRAWL_DELAY'] ?? 0;
if ($delay > 0) {
$use_cld = true;
}
}
+ if (!$is_delayed) {
+ $is_delayed = $this->crawl_queue->isDelayedHost($cld_host);
+ if ($is_delayed) {
+ L\crawlLog("$cld_host is delayed");
+ }
+ }
}
- if (!$this->withinQuota($url) ||
- ($flags & CrawlQueueBundle::WAITING_HOST) > 0 ||
- ($cld_flags & CrawlQueueBundle::WAITING_HOST) > 0) {
+ if ($is_delayed || !$this->withinQuota($url)) {
L\crawlLog("FB Scheduler: $url rescheduled due to quota".
" or crawl delay.");
$reschedule_tuples[] = $current_tuple;
@@ -2494,8 +2489,7 @@ class QueueServer implements CrawlConstants
$add_delayed_hosts[L\crawlHash($host_url)] = $host_url;
}
if (count($add_delayed_hosts) > $max_buffer_before_write) {
- $add_delayed_hosts = array_values($add_delayed_hosts);
- $this->crawl_queue->addCrawlDelayedHosts($schedule_time,
+ $this->crawl_queue->addDelayedHosts($schedule_time,
$add_delayed_hosts);
$add_delayed_hosts = [];
}
@@ -2527,8 +2521,7 @@ class QueueServer implements CrawlConstants
$this->crawl_order);
}
if (!empty($add_delayed_hosts)) {
- $add_delayed_hosts = array_values($add_delayed_hosts);
- $this->crawl_queue->addCrawlDelayedHosts($schedule_time,
+ $this->crawl_queue->addDelayedHosts($schedule_time,
$add_delayed_hosts);
}
$new_time = microtime(true);
diff --git a/src/library/CrawlQueueBundle.php b/src/library/CrawlQueueBundle.php
index 679ce5fd7..8063def6d 100644
--- a/src/library/CrawlQueueBundle.php
+++ b/src/library/CrawlQueueBundle.php
@@ -117,14 +117,6 @@ class CrawlQueueBundle
* Length of an IPv6 ip address (IPv4 address are padded)
*/
const IP_SIZE = 16;
- /**
- * Url type flag
- */
- const NO_FLAGS = 0;
- /**
- * Url type flag
- */
- const WAITING_HOST = 1;
/** Size of int
*/
const INT_SIZE = 4;
@@ -145,9 +137,15 @@ class CrawlQueueBundle
*/
const URL_FILES_EXTENSION = ".txt.gz";
/**
- * Number of bytes in for hash table key
+     * Folder used to keep track of hosts that have been delayed, so
+     * urls from those hosts are not allowed to be scheduled
+ */
+ const DELAYED_FOLDER = "DelayedHosts";
+ /**
+ * Within DELAYED_FOLDER a folder containing hashes of host urls
+ * for each delayed host
*/
- const CRAWL_DELAYED_FOLDER = "CrawlDelayedHosts";
+ const HASHES_FOLDER = "Hashes";
/**
 * Number of bytes in a hash table key
*/
@@ -174,14 +172,21 @@ class CrawlQueueBundle
if (!file_exists($this->dir_name)) {
mkdir($this->dir_name);
}
- if (!file_exists($this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER)) {
- mkdir($this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER);
+ $delayed_path = $this->dir_name . "/" . self::DELAYED_FOLDER;
+ if (!file_exists($delayed_path)) {
+ mkdir($delayed_path);
+ }
+ $delayed_hash_path = $delayed_path . "/" . self::HASHES_FOLDER;
+ if (!file_exists($delayed_hash_path)) {
+ mkdir($delayed_hash_path);
}
- if (!file_exists($this->dir_name . "/" . self::ROBOT_WAIT_FOLDER)) {
- mkdir($this->dir_name . "/" . self::ROBOT_WAIT_FOLDER);
+ $robot_wait_path = $this->dir_name . "/" . self::ROBOT_WAIT_FOLDER;
+ if (!file_exists($robot_wait_path)) {
+ mkdir($robot_wait_path);
}
- if (!file_exists($this->dir_name . "/" . self::URL_QUEUE_FOLDER)) {
- mkdir($this->dir_name . "/" . self::URL_QUEUE_FOLDER);
+ $url_queue_path = $this->dir_name . "/" . self::URL_QUEUE_FOLDER;
+ if (!file_exists($url_queue_path)) {
+ mkdir($url_queue_path);
}
/* Hash table containing DNS cache this is cleared whenever robot
filters cleared
@@ -204,7 +209,7 @@ class CrawlQueueBundle
$this->robot_table = new LinearHashTable($dir_name .
"/RobotData", ["PRIMARY KEY" => "HOSTNAME",
"CAPTURE_TIME" => "INT", "CRAWL_DELAY" => "INT",
- "ROBOT_PATHS" => "SERIAL", "FLAGS" => "INT"],
+ "ROBOT_PATHS" => "SERIAL"],
LinearHashTable::MAX_ITEMS_PER_FILE,
LinearHashTable::PARTITION_SIZE_THRESHOLD,
C\NS_COMPRESSORS . "GzipCompressor");
@@ -286,7 +291,7 @@ class CrawlQueueBundle
* of tiers. Url tuples are sorted into a tier based on the number of
* urls that have been downloaded for that url's host and their weight.
* Naively, without weight, a url goes into tier
- * floor(log(# of urls downloaded already for its host))
+ * floor(log10(# of urls downloaded already for its host))
 * Within a tier, urls are stored in folders by day received and then into
* a file from a sequence of files according to order received. Each file
* in the sequence is able to store 1MB compressed many url tuples.
@@ -351,9 +356,19 @@ class CrawlQueueBundle
if ($crawl_order == CrawlConstants::BREADTH_FIRST) {
return $cld_data;
}
+        /* The more urls we see from a cld, the heavier we weigh them.
+           This will tend to assign future urls from that cld to a higher
+           tier, so they will have to wait longer to be crawled
+ */
$cld_data['WEIGHTED_SEEN_URLS'] += min(1, 1 + log(1 + $weight, 5));
$linking_cld = UrlParser::getCompanyLevelDomain($linking_url);
$linking_cld_data = $this->domain_table->get($linking_cld);
+ /*
+           WEIGHTED_INCOMING_URLS will be subtracted from WEIGHTED_SEEN_URLS
+ when determining the tier of a to-be-queued url. The code
+ below tries to estimate the quality of the referer of the
+ current url and updates WEIGHTED_INCOMING_URLS accordingly
+ */
if (!empty($linking_cld_data)) {
$linking_url_tier = floor(log10(min(1,
$linking_cld_data['WEIGHTED_SEEN_URLS'] -
@@ -388,8 +403,10 @@ class CrawlQueueBundle
$tier = floor(log10(max(1, $cld_data['WEIGHTED_SEEN_URLS'] -
$cld_data['WEIGHTED_INCOMING_URLS'])));
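+        /* for example, a cld with WEIGHTED_SEEN_URLS == 5000 and
+           WEIGHTED_INCOMING_URLS == 1000 gets tier floor(log10(4000)) == 3 */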
$robots_txt = "robots.txt";
+ $site_map_fragment = "#sitemap";
// put sitemaps in a higher queue
- if (in_array(substr($url, -3), [".gz", ".bz", "xml"])) {
+ if (in_array(substr($url, -3), [".gz", ".bz", "xml"]) ||
+ substr($url, -strlen($site_map_fragment)) == $site_map_fragment) {
$tier += C\SITEMAP_TIER_PENALTY;
} else if (UrlParser::guessMimeTypeFromFileName($url) != "text/html"
&& substr($url, -strlen($robots_txt)) != $robots_txt) {
@@ -404,11 +421,11 @@ class CrawlQueueBundle
return $tier;
}
/**
- * This method is used to send urls that are in the waiting hosts folder
- * for hosts listed in $this->robot_notify_hosts
- * to be received to be moved to the queue because host membership in
- * $this->robot_notify_hosts indicates that a robots.txt
- * file has just been received for the particular domain.
+ * This method is used to move urls that are in the ROBOT_WAIT_FOLDER
+     * whose hosts are listed in $this->robot_notify_hosts (i.e., whose
+     * robots.txt file has just been received) to the appropriate tier of
+     * the send-to-fetcher queue. It then empties $this->robot_notify_hosts.
+ *
* @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or
* CrawlConstants::HOST_BUDGETING
*/
@@ -526,7 +543,7 @@ class CrawlQueueBundle
}
$time = ($timestamp == 0) ? time() : $timestamp;
$day = floor($time/C\ONE_DAY);
- $dir .= "/$day";
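+        // zero-pad the day number to nine digits so day folders sort in
+        // numeric order and the day parses back from the last nine characters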
+ $dir .= "/" . sprintf("%'.09d", $day);
if (!file_exists($dir)) {
mkdir($dir);
chmod($dir, 0777);
@@ -581,7 +598,7 @@ class CrawlQueueBundle
*/
public function chooseFetchBatchQueueFolder($crawl_order)
{
- static $last_folder = 0;
+        static $last_folder = -1; // will be incremented before first use
static $exp_max_folder = 1;
$url_queue_folder = $this->dir_name . "/" . self::URL_QUEUE_FOLDER;
$sub_dirs = glob("$url_queue_folder/*", GLOB_ONLYDIR);
@@ -600,10 +617,14 @@ class CrawlQueueBundle
return ($is_empty) ? false : $sub_dir;
}
/* the hope of the following is to prevent looking at sitemaps
- too early in the crawl before all the seed sites are donwloaded
+ too early in the crawl before all the seed sites are downloaded
*/
$exp_max_folder++;
- $pre_max_folder = ceil(log($exp_max_folder, 2));
+        $pre_max_folder = floor(log10($exp_max_folder)); /* stays 0 until
+            $exp_max_folder reaches 10, is 1 until it reaches 100, 2 until
+            1000, etc.; once $pre_max_folder reaches C\SITEMAP_TIER_PENALTY,
+            all folders become available below */
if ($pre_max_folder >= C\SITEMAP_TIER_PENALTY) {
$pre_max_folder = count($sub_dirs);
}
@@ -612,25 +633,57 @@ class CrawlQueueBundle
$last_folder + 1 : 0;
return $sub_dirs[$last_folder];
}
+ /**
+     * Returns the folder in which a marker file recording that the host
+     * with hash $hash_url is crawl-delayed would be stored. Marker files
+     * are fanned out into two levels of subfolders keyed by the first and
+     * second characters of the hash, so no single folder grows too large.
+     *
+     * @param string $hash_url crawlHash of a host url
+     * @return string path of the folder for that host's marker file
+ */
+ public function getDelayedHashFolder($hash_url)
+ {
+ $path = $this->dir_name . "/" . self::DELAYED_FOLDER . "/" .
+ self::HASHES_FOLDER;
+ return "$path/{$hash_url[0]}/{$hash_url[1]}";
+ }
/**
* For a timestamp $schedule_time of a fetch batch of urls to be downloaded
- * and for a list of crawl-delayed hosts in that batch, add the hosts to
- * a a $schedule time file in the CrawlDelayedHosts queue so they can be
+ * and for a list of delayed hosts in that batch, add the hosts to
+     * a $schedule_time file in the DelayedHosts queue so they can be
* notified when that fetch batch is done processing. Until notified any
* url from one of these crawl delayed hosts will be rescheduled rather than
 * put in a fetch batch for download.
*
- * @param int schedule_time
- * @param array $host_urls array of urls for hosts that are crawl delayed
+     * @param int $schedule_time time of the schedule these urls are waiting on
+ * @param array $host_urls array of urls
+ * for hosts that are crawl delayed
* and for which there is a schedule currently running on fetchers
* which might download from that host
*/
- public function addCrawlDelayedHosts($schedule_time, $host_urls)
+ public function addDelayedHosts($schedule_time, $host_urls)
+ {
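+        /* each host is recorded twice: as a marker file named by the
+           host's hash, so isDelayedHost() is a single file_exists() check,
+           and as an entry in a $schedule_time url file, which
+           notifyDelayedHosts() uses to clear the markers once that fetch
+           batch completes */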
+ foreach ($host_urls as $host_url) {
+ $hash_host = crawlHash($host_url);
+ $hash_folder = $this->getDelayedHashFolder($hash_host);
+ if (!file_exists($hash_folder)) {
+ makePath($hash_folder);
+ }
+ file_put_contents("$hash_folder/$hash_host", "1");
+ }
+ $host_urls = array_values($host_urls);
+ $delayed_folder = $this->dir_name . "/" . self::DELAYED_FOLDER;
+ $this->addUrlsDirectory($delayed_folder, $host_urls, $schedule_time);
+ }
+ /**
+ * Checks whether urls for a particular $host_url are being delayed
+ * with respect to scheduling into fetch batches.
+ *
+     * @param string $host_url a host url, or the crawlHash of a host url
+ * @param boolean $is_hash_host if true then $host_url is treated as a
+ * crawlHash of host url rather than just a host url
+ * @return boolean whether urls for the given host are being delayed or not
+ */
+ public function isDelayedHost($host_url, $is_hash_host = false)
{
- $crawl_delayed_folder =
- $this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER;
- $this->addUrlsDirectory($crawl_delayed_folder, $host_urls,
- $schedule_time);
+ $hash_host = ($is_hash_host) ? $host_url : crawlHash($host_url);
+ return file_exists($this->getDelayedHashFolder($hash_host) .
+ "/$hash_host");
}
/**
* For each host in the crawl-delayed hosts queue waiting on the
@@ -641,88 +694,70 @@ class CrawlQueueBundle
* @param int $timestamp of a fetch batch schedule to notify
* crawl-delayed hosts that it has completed download.
*/
- public function notifyCrawlDelayedHosts($timestamp)
+ public function notifyDelayedHosts($timestamp)
{
crawlLog("Scheduler: Notifying hosts that were crawl delayed by ".
"Schedule $timestamp");
$db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
$db = new $db_class();
- $crawl_delayed_folder =
- $this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER;
- $day_folders = $this->getDayFolders($crawl_delayed_folder);
- //maximum crawl delay will honor is one day
- $yesterday = floor((time() - C\ONE_DAY)/C\ONE_DAY);
+ $delayed_folder = $this->dir_name . "/" . self::DELAYED_FOLDER;
+ $day_folders = $this->getDayFolders($delayed_folder);
if (empty($day_folders)) {
return; //no one is waiting
}
- $robot_rows = [];
+        /* the maximum crawl delay we will honor is one day,
+           so notify hosts that have been waiting longer than that
+ */
+ $yesterday = floor((time() - C\ONE_DAY)/C\ONE_DAY);
foreach ($day_folders as $day_folder) {
$day_timestamp = intval(substr($day_folder, -9));
if ($day_timestamp >= $yesterday) {
continue;
}
+ crawlLog(
+ "Scheduler: Notifying long waiting hosts in $day_folder!");
$waiting_host_files = $this->getUrlsFiles($day_folder);
if (!empty($waiting_host_files)) {
foreach ($waiting_host_files as $waiting_host_file) {
- $robot_rows = $this->processWaitingHostFile(
- $waiting_host_file, $robot_rows);
+ $this->processWaitingHostFile($waiting_host_file);
}
}
$db->unlinkRecursive($day_folder);
}
+ // Now let's notify those hosts waiting on $timestamp
$stamp_day = floor($timestamp/C\ONE_DAY);
- $file_name = "$crawl_delayed_folder/$stamp_day/".
+ $file_name = "$delayed_folder/" . sprintf("%'.09d", $stamp_day). "/".
sprintf("%'.09d", $timestamp) . self::URL_FILES_EXTENSION;
if (file_exists($file_name)) {
- $robot_rows = $this->processWaitingHostFile($file_name,
- $robot_rows);
- if (!empty($robot_rows)) {
- if (C\nsdefined('VERBOSE_LOGGING') && C\VERBOSE_LOGGING) {
- crawlLog(
- "Scheduler: Notifying the following list of hosts:");
- $i = 0;
- foreach ($robot_rows as $robot_row) {
- $i++;
- crawlLog("$i. ". $robot_row['HOSTNAME'] ?? "");
- }
- }
- $this->robot_table->put($robot_rows);
- }
+ $this->processWaitingHostFile($file_name);
}
}
/**
- * Used by @see notifyCrawlDelayedHosts($timestamp).
- * For each host listed in the file $file_name get its robot info from
- * robot_table, clear its FLAG column, store the update into
- * a temporary array $robot_rows. Every MAX_URL_BUFFER_BEFORE_WRITE
- * many such hosts, write the updates in $robot_rows back to the
- * robot_table on disk. If last batch of modified rows has been written
- * when done file, return these in $robot_rows
+ * Used by @see notifyDelayedHosts($timestamp).
+     * For each host listed in the file $file_name, deletes that host's
+     * marker file from the delayed hash folder. Once all host urls in the
+     * file have been processed, deletes $file_name itself.
*
- * @param string $file_name to get hosts to clear flag columns of
- * @param array $robot_rows rows of updated hosts potentially from a
- * previously processed file.
- * @return array $robot_rows leftover updated robot host rows that haven't
- * been written to disk yet
+     * @param string $file_name name of a file containing a list of hosts.
*/
- public function processWaitingHostFile($file_name, $robot_rows)
+ public function processWaitingHostFile($file_name)
{
$waiting_hosts = $this->getUrlsFileContents($file_name);
if (empty($waiting_hosts)) {
-            return [];
+            return;
}
+ crawlLog("Scheduler: Notifying the following list of hosts:");
+ $i = 0;
foreach ($waiting_hosts as $waiting_host) {
- $robot_data = $this->robot_table->get($waiting_host);
- $robot_data["FLAGS"] = 0;
- $robot_rows[] = $robot_data;
- if (count($robot_rows) > self::MAX_URL_BUFFER_BEFORE_WRITE) {
- crawlLog("Scheduler: Notifying the following list of hosts ".
- print_r($robot_rows, true));
- $this->robot_table->put($robot_rows);
- $robot_rows = [];
+ $hash_host = crawlHash($waiting_host);
+ if ($this->isDelayedHost($hash_host, true)) {
+ $i++;
+ crawlLog("$i. $waiting_host");
+ unlink($this->getDelayedHashFolder($hash_host) .
+ "/$hash_host");
}
}
- return $robot_rows;
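+        // every host in this file has been notified, so remove the file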
+ unlink($file_name);
}
/**
* Checks if the given $url is allowed to be crawled based on stored
@@ -760,7 +795,7 @@ class CrawlQueueBundle
* For a provided hostname, returns the robots.txt
 * information stored in the robot table: [HOSTNAME,
* CAPTURE_TIME, CRAWL_DELAY, ROBOT_PATHS => [ALLOWED_SITES,
-     * DISALLOWED_SITES], FLAGS (for not whether should wait for notification
-     * from a schedule being downloaded before continuing crawling the site).
+     * DISALLOWED_SITES]]. (The FLAGS field is no longer stored; whether a
+     * host must wait on a schedule download before being crawled further is
+     * now tracked via the DelayedHosts folder.)
*
* @param string $host hostname to look up robots.tx info for.
diff --git a/src/library/processors/RobotProcessor.php b/src/library/processors/RobotProcessor.php
index e4d4c453e..c7db7bc68 100644
--- a/src/library/processors/RobotProcessor.php
+++ b/src/library/processors/RobotProcessor.php
@@ -149,7 +149,12 @@ class RobotProcessor extends PageProcessor
$tmp_url = UrlParser::canonicalLink($value, $host_url);
if (!UrlParser::checkRecursiveUrl($tmp_url)
&& strlen($tmp_url) < C\MAX_URL_LEN) {
- $summary[self::LINKS][] = $tmp_url;
+ /*
+                            Sometimes sitemap file urls don't end in xml,
+                            so add a #sitemap fragment to flag them as
+                            sitemaps when determining the queue tier
+ */
+ $summary[self::LINKS][] = $tmp_url . "#sitemap";
}
break;
case "allow":