diff --git a/src/configs/Config.php b/src/configs/Config.php
index b3c56e6ef..537b25f1b 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -820,8 +820,6 @@ nsconddefine('USE_ETAG_EXPIRES', true);
* maximum value for this is 255
*/
nsconddefine('MAXIMUM_CRAWL_DELAY', 64);
-/** maximum number of active crawl-delayed hosts */
-nsconddefine('MAX_WAITING_HOSTS', 250);
/** maximum fraction of URLS in the Queue that are crawl-delayed and waiting
 * before new crawl-delayed urls are deleted from the queue
*/
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index e87396a5a..89c28338a 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1775,8 +1775,8 @@ class Fetcher implements CrawlConstants
&& (is_numeric($site[self::HTTP_CODE] ) &&
$site[self::HTTP_CODE] > 0 && $site[self::HTTP_CODE] != 416) ) {
$downloaded[] = $site;
- } else if (substr($site[self::URL], -10) == "robots.txt" &&
- $time - $this->crawl_time > C\ONE_DAY) {
+ } else if (substr($site[self::URL], -strlen("robots.txt"))
+ == "robots.txt" && $time - $this->crawl_time > C\ONE_DAY) {
/*
Assume slow to respond robots sites after the first day
of crawling are probably spammy (slow to respond and
@@ -2345,6 +2345,12 @@ class Fetcher implements CrawlConstants
$doc_type = UrlParser::getDocumentType($url);
if (!in_array($doc_type, $this->all_file_types)) {
$doc_type = "unknown";
+ $url_parts = explode("#", $url);
+ $url = $url_parts[0];
+ $fragment = $url_parts[1] ?? "";
+ if ($fragment == "sitemap") {
+ $doc_type = "xml";
+ }
}
if (!in_array($doc_type, $this->indexed_file_types)) {
continue;
@@ -2593,7 +2599,10 @@ class Fetcher implements CrawlConstants
$url = $link_urls[$i];
if (strlen($url) > 0) {
-                    // explicitly get rid of any fragment component of url
+                    // remove any fragment component of url, except a #sitemap marker
- $url = explode("#", $url)[0];
+ $fragment_parts = explode("#", $url);
+ if (($fragment_parts[1] ?? "") != "sitemap") {
+ $url = $fragment_parts[0];
+ }
if (strlen($url) == 0) {
continue;
}
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index dd390f43a..7853a5988 100644
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -256,15 +256,6 @@ class QueueServer implements CrawlConstants
* @var int
*/
public $crawl_time;
- /**
- * This is a list of hosts whose robots.txt file had a Crawl-delay directive
- * and which we have produced a schedule with urls for, but we have not
- * heard back from the fetcher who was processing those urls. Hosts on
- * this list will not be scheduled for more downloads until the fetcher
- * with earlier urls has gotten back to the queue server.
- * @var array
- */
- public $waiting_hosts;
/**
* IP address as a string of the fetcher that most recently spoke with the
* queue server.
@@ -411,7 +402,6 @@ class QueueServer implements CrawlConstants
$this->messages_bundle = null;
$this->indexing_plugins = [];
$this->indexing_plugins_data = [];
- $this->waiting_hosts = [];
$this->server_name = "IndexerAndScheduler";
$this->process_name = "0-QueueServer";
$this->crawl_status_file_name =
@@ -760,7 +750,6 @@ class QueueServer implements CrawlConstants
foreach ($save_point_files as $save_point_file) {
@unlink($save_point_file);
}
- $this->waiting_hosts = [];
$this->initializeCrawlQueue();
$dir_name = C\CACHE_DIR . "/" . self::double_index_base_name .
$this->crawl_time;
@@ -1289,7 +1278,6 @@ class QueueServer implements CrawlConstants
if ($update_disallow == true) {
$this->updateDisallowedQuotaSites();
}
- $this->waiting_hosts = [];
$this->initializeCrawlQueue();
$this->initializeIndexBundle($info, $try_to_set_from_old_index);
$messages_folder = C\SCHEDULES_DIR . "/" .
@@ -1844,8 +1832,7 @@ class QueueServer implements CrawlConstants
$crawl_delay = $robot_info[self::CRAWL_DELAY] ?? 0;
$row = ["HOSTNAME" => $robot_host,
"CAPTURE_TIME" => $time, "CRAWL_DELAY" => $crawl_delay,
- "ROBOT_PATHS" => $robot_info[self::ROBOT_PATHS] ?? [],
- "FLAGS" => 0];
+ "ROBOT_PATHS" => $robot_info[self::ROBOT_PATHS] ?? []];
$robot_rows[] = $row;
if (isset($robot_info[self::IP_ADDRESSES])) {
$final_ip = array_pop($robot_info[self::IP_ADDRESSES]);
@@ -1985,9 +1972,9 @@ class QueueServer implements CrawlConstants
}
if (isset($sites[self::SCHEDULE_TIME])) {
$start_time = microtime(true);
- L\crawlLog("...Scheduler Notify Hosts Crawl Delayed By Schedule ".
+ L\crawlLog("...Scheduler Notify Hosts Delayed By Schedule ".
$sites[self::SCHEDULE_TIME] . "...");
- $this->crawl_queue->notifyCrawlDelayedHosts(
+ $this->crawl_queue->notifyDelayedHosts(
$sites[self::SCHEDULE_TIME]);
L\crawlLog(" time: ". L\changeInMicrotime($start_time));
}
@@ -2036,17 +2023,19 @@ class QueueServer implements CrawlConstants
if ($url[0] != 'h' && trim($url) == "localhost") {
$url = "http://localhost/";
}
- $this->crawl_queue->addSeenUrlFilter($url);
$hard_coded = (strpos($url, "###!") > 0 );
$host_url = UrlParser::getHost($url);
$hash_host = L\crawlHash($host_url);
if (strlen($host_url) < $http_scheme_len) {
continue;
}
+ $this->crawl_queue->addSeenUrlFilter($url);
if (!$hard_coded) {
- if (!$this->allowedToCrawlSite($url) ||
- $this->disallowedToCrawlSite($url) ||
- !$this->withinQuota($url, 0)) {
+                //sitemaps will have #sitemap added to the url, so strip it
+ $check_url = explode("#", $url)[0];
+ if (!$this->allowedToCrawlSite($check_url) ||
+ $this->disallowedToCrawlSite($check_url) ||
+ !$this->withinQuota($check_url, 0)) {
continue;
}
}
@@ -2057,7 +2046,7 @@ class QueueServer implements CrawlConstants
} else {
$robot_time = $seen_robot_time[$hash_host];
}
            if (!$hard_coded && empty($robot_time)) {
if($robot_time === false &&
!isset($seen_robot_time[$hash_host])) {
$host_with_robots = ($scheme == "gopher") ? $host_url .
@@ -2368,6 +2357,7 @@ class QueueServer implements CrawlConstants
$host_url = UrlParser::getHost($url);
$hash_host = L\crawlHash($host_url);
$hard_coded = (strpos($url, "###!") > 0 );
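+            // strip any #sitemap marker fragment before the robot checks below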
+ $url = ($hard_coded) ? $url : explode("#", $url)[0];
$scheme = UrlParser::getScheme($host_url);
if ($scheme == "gopher") {
$is_robot =
@@ -2400,17 +2390,17 @@ class QueueServer implements CrawlConstants
$robots_okay = true;
} else if (!isset($hard_coded) || !$hard_coded) {
$robots_okay = $this->crawl_queue->checkRobotOkay($url);
+ if (!$this->allowedToCrawlSite($url) ||
+ $this->disallowedToCrawlSite($url)) {
+ /* This is checked when added to queue,
+ we check again here in case allowed and disallowed
+ sites have changed since then
+ */
+ $robots_okay = false;
+ }
} else {
$robots_okay = true;
}
- if (!$this->allowedToCrawlSite($url) ||
- $this->disallowedToCrawlSite($url)) {
- /* This is checked when added to queue,
- we check again here in case allowed and disallowed
- sites have changed since then
- */
- $robots_okay = false;
- }
if (!$robots_okay) {
L\crawlLog("FB Scheduler: $url removed by robot".
" policy");
@@ -2418,27 +2408,32 @@ class QueueServer implements CrawlConstants
}
$robot_data = $this->crawl_queue->getRobotData($host_url);
$delay = $robot_data['CRAWL_DELAY'] ?? 0;
- $flags = $robot_data['FLAGS'] ?? 0;
+ $is_delayed = $this->crawl_queue->isDelayedHost($host_url);
+ if ($is_delayed) {
+ L\crawlLog("$host_url is delayed");
+ }
/* if company level domain delays, then we're delayed
as well
*/
$cld = UrlParser::getCompanyLevelDomain($host_url);
$cld_host = UrlParser::getScheme($host_url) . "://" . $cld;
- $cld_flags = 0;
$use_cld = false;
if ($cld_host != $host_url) {
$cld_robot_data = $this->crawl_queue->getRobotData($cld_host);
- $cld_flags = $cld_robot_data['FLAGS'] ?? 0;
if ($delay == 0) {
$delay = $cld_robot_data['CRAWL_DELAY'] ?? 0;
if ($delay > 0) {
$use_cld = true;
}
}
+ if (!$is_delayed) {
+ $is_delayed = $this->crawl_queue->isDelayedHost($cld_host);
+ if ($is_delayed) {
+ L\crawlLog("$cld_host is delayed");
+ }
+ }
}
- if (!$this->withinQuota($url) ||
- ($flags & CrawlQueueBundle::WAITING_HOST) > 0 ||
- ($cld_flags & CrawlQueueBundle::WAITING_HOST) > 0) {
+ if ($is_delayed || !$this->withinQuota($url)) {
L\crawlLog("FB Scheduler: $url rescheduled due to quota".
" or crawl delay.");
$reschedule_tuples[] = $current_tuple;
@@ -2494,8 +2489,7 @@ class QueueServer implements CrawlConstants
$add_delayed_hosts[L\crawlHash($host_url)] = $host_url;
}
if (count($add_delayed_hosts) > $max_buffer_before_write) {
- $add_delayed_hosts = array_values($add_delayed_hosts);
- $this->crawl_queue->addCrawlDelayedHosts($schedule_time,
+ $this->crawl_queue->addDelayedHosts($schedule_time,
$add_delayed_hosts);
$add_delayed_hosts = [];
}
@@ -2527,8 +2521,7 @@ class QueueServer implements CrawlConstants
$this->crawl_order);
}
if (!empty($add_delayed_hosts)) {
- $add_delayed_hosts = array_values($add_delayed_hosts);
- $this->crawl_queue->addCrawlDelayedHosts($schedule_time,
+ $this->crawl_queue->addDelayedHosts($schedule_time,
$add_delayed_hosts);
}
$new_time = microtime(true);
diff --git a/src/library/CrawlQueueBundle.php b/src/library/CrawlQueueBundle.php
index 679ce5fd7..8063def6d 100644
--- a/src/library/CrawlQueueBundle.php
+++ b/src/library/CrawlQueueBundle.php
@@ -117,14 +117,6 @@ class CrawlQueueBundle
* Length of an IPv6 ip address (IPv4 address are padded)
*/
const IP_SIZE = 16;
- /**
- * Url type flag
- */
- const NO_FLAGS = 0;
- /**
- * Url type flag
- */
- const WAITING_HOST = 1;
/** Size of int
*/
const INT_SIZE = 4;
@@ -145,9 +137,15 @@ class CrawlQueueBundle
*/
const URL_FILES_EXTENSION = ".txt.gz";
/**
- * Number of bytes in for hash table key
+     * Folder used to keep track of hosts that have been delayed, so
+     * urls from those hosts are not allowed to be scheduled
+ */
+ const DELAYED_FOLDER = "DelayedHosts";
+ /**
+ * Within DELAYED_FOLDER a folder containing hashes of host urls
+ * for each delayed host
*/
- const CRAWL_DELAYED_FOLDER = "CrawlDelayedHosts";
+ const HASHES_FOLDER = "Hashes";
/**
 * Number of bytes in a hash table key
*/
@@ -174,14 +172,21 @@ class CrawlQueueBundle
if (!file_exists($this->dir_name)) {
mkdir($this->dir_name);
}
- if (!file_exists($this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER)) {
- mkdir($this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER);
+ $delayed_path = $this->dir_name . "/" . self::DELAYED_FOLDER;
+ if (!file_exists($delayed_path)) {
+ mkdir($delayed_path);
+ }
+ $delayed_hash_path = $delayed_path . "/" . self::HASHES_FOLDER;
+ if (!file_exists($delayed_hash_path)) {
+ mkdir($delayed_hash_path);
}
- if (!file_exists($this->dir_name . "/" . self::ROBOT_WAIT_FOLDER)) {
- mkdir($this->dir_name . "/" . self::ROBOT_WAIT_FOLDER);
+ $robot_wait_path = $this->dir_name . "/" . self::ROBOT_WAIT_FOLDER;
+ if (!file_exists($robot_wait_path)) {
+ mkdir($robot_wait_path);
}
- if (!file_exists($this->dir_name . "/" . self::URL_QUEUE_FOLDER)) {
- mkdir($this->dir_name . "/" . self::URL_QUEUE_FOLDER);
+ $url_queue_path = $this->dir_name . "/" . self::URL_QUEUE_FOLDER;
+ if (!file_exists($url_queue_path)) {
+ mkdir($url_queue_path);
}
/* Hash table containing DNS cache this is cleared whenever robot
filters cleared
@@ -204,7 +209,7 @@ class CrawlQueueBundle
$this->robot_table = new LinearHashTable($dir_name .
"/RobotData", ["PRIMARY KEY" => "HOSTNAME",
"CAPTURE_TIME" => "INT", "CRAWL_DELAY" => "INT",
- "ROBOT_PATHS" => "SERIAL", "FLAGS" => "INT"],
+ "ROBOT_PATHS" => "SERIAL"],
LinearHashTable::MAX_ITEMS_PER_FILE,
LinearHashTable::PARTITION_SIZE_THRESHOLD,
C\NS_COMPRESSORS . "GzipCompressor");
@@ -286,7 +291,7 @@ class CrawlQueueBundle
* of tiers. Url tuples are sorted into a tier based on the number of
* urls that have been downloaded for that url's host and their weight.
* Naively, without weight, a url goes into tier
- * floor(log(# of urls downloaded already for its host))
+ * floor(log10(# of urls downloaded already for its host))
 * Within a tier, urls are stored in folders by day received and then into
* a file from a sequence of files according to order received. Each file
* in the sequence is able to store 1MB compressed many url tuples.
@@ -351,9 +356,19 @@ class CrawlQueueBundle
if ($crawl_order == CrawlConstants::BREADTH_FIRST) {
return $cld_data;
}
+        /* The more urls we see from a cld, the heavier we weigh them.
+           This will tend to assign future urls from that cld to a higher
+           tier, so they will have to wait longer to be crawled
+ */
$cld_data['WEIGHTED_SEEN_URLS'] += min(1, 1 + log(1 + $weight, 5));
$linking_cld = UrlParser::getCompanyLevelDomain($linking_url);
$linking_cld_data = $this->domain_table->get($linking_cld);
+ /*
+           WEIGHTED_INCOMING_URLS will be subtracted from WEIGHTED_SEEN_URLS
+ when determining the tier of a to-be-queued url. The code
+ below tries to estimate the quality of the referer of the
+ current url and updates WEIGHTED_INCOMING_URLS accordingly
+ */
if (!empty($linking_cld_data)) {
$linking_url_tier = floor(log10(min(1,
$linking_cld_data['WEIGHTED_SEEN_URLS'] -
@@ -388,8 +403,10 @@ class CrawlQueueBundle
$tier = floor(log10(max(1, $cld_data['WEIGHTED_SEEN_URLS'] -
$cld_data['WEIGHTED_INCOMING_URLS'])));
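+        /* for example, a cld with WEIGHTED_SEEN_URLS == 5000 and
+           WEIGHTED_INCOMING_URLS == 1000 gets tier floor(log10(4000)) == 3 */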
$robots_txt = "robots.txt";
+ $site_map_fragment = "#sitemap";
// put sitemaps in a higher queue
- if (in_array(substr($url, -3), [".gz", ".bz", "xml"])) {
+ if (in_array(substr($url, -3), [".gz", ".bz", "xml"]) ||
+ substr($url, -strlen($site_map_fragment)) == $site_map_fragment) {
$tier += C\SITEMAP_TIER_PENALTY;
} else if (UrlParser::guessMimeTypeFromFileName($url) != "text/html"
&& substr($url, -strlen($robots_txt)) != $robots_txt) {
@@ -404,11 +421,11 @@ class CrawlQueueBundle
return $tier;
}
/**
- * This method is used to send urls that are in the waiting hosts folder
- * for hosts listed in $this->robot_notify_hosts
- * to be received to be moved to the queue because host membership in
- * $this->robot_notify_hosts indicates that a robots.txt
- * file has just been received for the particular domain.
+ * This method is used to move urls that are in the ROBOT_WAIT_FOLDER
+     * whose hosts are listed in $this->robot_notify_hosts (i.e., whose
+     * robots.txt file has just been received) to the appropriate tier of
+     * the send-to-fetcher queue. It then empties $this->robot_notify_hosts.
+ *
* @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or
* CrawlConstants::HOST_BUDGETING
*/
@@ -526,7 +543,7 @@ class CrawlQueueBundle
}
$time = ($timestamp == 0) ? time() : $timestamp;
$day = floor($time/C\ONE_DAY);
- $dir .= "/$day";
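+        // zero-pad the day number to nine digits so day folders sort in
+        // numeric order and the day parses back from the last nine characters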
+ $dir .= "/" . sprintf("%'.09d", $day);
if (!file_exists($dir)) {
mkdir($dir);
chmod($dir, 0777);
@@ -581,7 +598,7 @@ class CrawlQueueBundle
*/
public function chooseFetchBatchQueueFolder($crawl_order)
{
- static $last_folder = 0;
+        static $last_folder = -1; // will be incremented before first use
static $exp_max_folder = 1;
$url_queue_folder = $this->dir_name . "/" . self::URL_QUEUE_FOLDER;
$sub_dirs = glob("$url_queue_folder/*", GLOB_ONLYDIR);
@@ -600,10 +617,14 @@ class CrawlQueueBundle
return ($is_empty) ? false : $sub_dir;
}
/* the hope of the following is to prevent looking at sitemaps
- too early in the crawl before all the seed sites are donwloaded
+ too early in the crawl before all the seed sites are downloaded
*/
$exp_max_folder++;
- $pre_max_folder = ceil(log($exp_max_folder, 2));
+        $pre_max_folder = floor(log10($exp_max_folder)); /* stays 0 until
+            $exp_max_folder reaches 10, is 1 until it reaches 100, 2 until
+            1000, etc.; once $pre_max_folder reaches C\SITEMAP_TIER_PENALTY,
+            all folders become available below */
if ($pre_max_folder >= C\SITEMAP_TIER_PENALTY) {
$pre_max_folder = count($sub_dirs);
}
@@ -612,25 +633,57 @@ class CrawlQueueBundle
$last_folder + 1 : 0;
return $sub_dirs[$last_folder];
}
+ /**
+     * Returns the folder in which a marker file recording that the host
+     * with hash $hash_url is crawl-delayed would be stored. Marker files
+     * are fanned out into two levels of subfolders keyed by the first and
+     * second characters of the hash, so no single folder grows too large.
+     *
+     * @param string $hash_url crawlHash of a host url
+     * @return string path of the folder for that host's marker file
+ */
+ public function getDelayedHashFolder($hash_url)
+ {
+ $path = $this->dir_name . "/" . self::DELAYED_FOLDER . "/" .
+ self::HASHES_FOLDER;
+ return "$path/{$hash_url[0]}/{$hash_url[1]}";
+ }
/**
* For a timestamp $schedule_time of a fetch batch of urls to be downloaded
- * and for a list of crawl-delayed hosts in that batch, add the hosts to
- * a a $schedule time file in the CrawlDelayedHosts queue so they can be
+ * and for a list of delayed hosts in that batch, add the hosts to
+     * a $schedule_time file in the DelayedHosts queue so they can be
* notified when that fetch batch is done processing. Until notified any
* url from one of these crawl delayed hosts will be rescheduled rather than
 * put in a fetch batch for download.
*
- * @param int schedule_time
- * @param array $host_urls array of urls for hosts that are crawl delayed
+     * @param int $schedule_time time of the schedule these urls are waiting on
+ * @param array $host_urls array of urls
+ * for hosts that are crawl delayed
* and for which there is a schedule currently running on fetchers
* which might download from that host
*/
- public function addCrawlDelayedHosts($schedule_time, $host_urls)
+ public function addDelayedHosts($schedule_time, $host_urls)
+ {
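+        /* each host is recorded twice: as a marker file named by the
+           host's hash, so isDelayedHost() is a single file_exists() check,
+           and as an entry in a $schedule_time url file, which
+           notifyDelayedHosts() uses to clear the markers once that fetch
+           batch completes */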
+ foreach ($host_urls as $host_url) {
+ $hash_host = crawlHash($host_url);
+ $hash_folder = $this->getDelayedHashFolder($hash_host);
+ if (!file_exists($hash_folder)) {
+ makePath($hash_folder);
+ }
+ file_put_contents("$hash_folder/$hash_host", "1");
+ }
+ $host_urls = array_values($host_urls);
+ $delayed_folder = $this->dir_name . "/" . self::DELAYED_FOLDER;
+ $this->addUrlsDirectory($delayed_folder, $host_urls, $schedule_time);
+ }
+ /**
+ * Checks whether urls for a particular $host_url are being delayed
+ * with respect to scheduling into fetch batches.
+ *
+     * @param string $host_url a host url, or the crawlHash of a host url
+ * @param boolean $is_hash_host if true then $host_url is treated as a
+ * crawlHash of host url rather than just a host url
+ * @return boolean whether urls for the given host are being delayed or not
+ */
+ public function isDelayedHost($host_url, $is_hash_host = false)
{
- $crawl_delayed_folder =
- $this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER;
- $this->addUrlsDirectory($crawl_delayed_folder, $host_urls,
- $schedule_time);
+ $hash_host = ($is_hash_host) ? $host_url : crawlHash($host_url);
+ return file_exists($this->getDelayedHashFolder($hash_host) .
+ "/$hash_host");
}
/**
* For each host in the crawl-delayed hosts queue waiting on the
@@ -641,88 +694,70 @@ class CrawlQueueBundle
* @param int $timestamp of a fetch batch schedule to notify
* crawl-delayed hosts that it has completed download.
*/
- public function notifyCrawlDelayedHosts($timestamp)
+ public function notifyDelayedHosts($timestamp)
{
crawlLog("Scheduler: Notifying hosts that were crawl delayed by ".
"Schedule $timestamp");
$db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
$db = new $db_class();
- $crawl_delayed_folder =
- $this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER;
- $day_folders = $this->getDayFolders($crawl_delayed_folder);
- //maximum crawl delay will honor is one day
- $yesterday = floor((time() - C\ONE_DAY)/C\ONE_DAY);
+ $delayed_folder = $this->dir_name . "/" . self::DELAYED_FOLDER;
+ $day_folders = $this->getDayFolders($delayed_folder);
if (empty($day_folders)) {
return; //no one is waiting
}
- $robot_rows = [];
+        /* the maximum crawl delay we will honor is one day,
+           so notify hosts that have been waiting longer than that
+ */
+ $yesterday = floor((time() - C\ONE_DAY)/C\ONE_DAY);
foreach ($day_folders as $day_folder) {
$day_timestamp = intval(substr($day_folder, -9));
if ($day_timestamp >= $yesterday) {
continue;
}
+ crawlLog(
+ "Scheduler: Notifying long waiting hosts in $day_folder!");
$waiting_host_files = $this->getUrlsFiles($day_folder);
if (!empty($waiting_host_files)) {
foreach ($waiting_host_files as $waiting_host_file) {
- $robot_rows = $this->processWaitingHostFile(
- $waiting_host_file, $robot_rows);
+ $this->processWaitingHostFile($waiting_host_file);
}
}
$db->unlinkRecursive($day_folder);
}
+ // Now let's notify those hosts waiting on $timestamp
$stamp_day = floor($timestamp/C\ONE_DAY);
- $file_name = "$crawl_delayed_folder/$stamp_day/".
+ $file_name = "$delayed_folder/" . sprintf("%'.09d", $stamp_day). "/".
sprintf("%'.09d", $timestamp) . self::URL_FILES_EXTENSION;
if (file_exists($file_name)) {
- $robot_rows = $this->processWaitingHostFile($file_name,
- $robot_rows);
- if (!empty($robot_rows)) {
- if (C\nsdefined('VERBOSE_LOGGING') && C\VERBOSE_LOGGING) {
- crawlLog(
- "Scheduler: Notifying the following list of hosts:");
- $i = 0;
- foreach ($robot_rows as $robot_row) {
- $i++;
- crawlLog("$i. ". $robot_row['HOSTNAME'] ?? "");
- }
- }
- $this->robot_table->put($robot_rows);
- }
+ $this->processWaitingHostFile($file_name);
}
}
/**
- * Used by @see notifyCrawlDelayedHosts($timestamp).
- * For each host listed in the file $file_name get its robot info from
- * robot_table, clear its FLAG column, store the update into
- * a temporary array $robot_rows. Every MAX_URL_BUFFER_BEFORE_WRITE
- * many such hosts, write the updates in $robot_rows back to the
- * robot_table on disk. If last batch of modified rows has been written
- * when done file, return these in $robot_rows
+ * Used by @see notifyDelayedHosts($timestamp).
+     * For each host listed in the file $file_name, deletes that host's
+     * marker file from the delayed hash folder. Once all host urls in the
+     * file have been processed, deletes $file_name itself.
*
- * @param string $file_name to get hosts to clear flag columns of
- * @param array $robot_rows rows of updated hosts potentially from a
- * previously processed file.
- * @return array $robot_rows leftover updated robot host rows that haven't
- * been written to disk yet
+     * @param string $file_name name of a file containing a list of hosts.
*/
- public function processWaitingHostFile($file_name, $robot_rows)
+ public function processWaitingHostFile($file_name)
{
$waiting_hosts = $this->getUrlsFileContents($file_name);
if (empty($waiting_hosts)) {
-            return [];
+            return;
}
+ crawlLog("Scheduler: Notifying the following list of hosts:");
+ $i = 0;
foreach ($waiting_hosts as $waiting_host) {
- $robot_data = $this->robot_table->get($waiting_host);
- $robot_data["FLAGS"] = 0;
- $robot_rows[] = $robot_data;
- if (count($robot_rows) > self::MAX_URL_BUFFER_BEFORE_WRITE) {
- crawlLog("Scheduler: Notifying the following list of hosts ".
- print_r($robot_rows, true));
- $this->robot_table->put($robot_rows);
- $robot_rows = [];
+ $hash_host = crawlHash($waiting_host);
+ if ($this->isDelayedHost($hash_host, true)) {
+ $i++;
+ crawlLog("$i. $waiting_host");
+ unlink($this->getDelayedHashFolder($hash_host) .
+ "/$hash_host");
}
}
- return $robot_rows;
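+        // every host in this file has been notified, so remove the file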
+ unlink($file_name);
}
/**
* Checks if the given $url is allowed to be crawled based on stored
@@ -760,7 +795,7 @@ class CrawlQueueBundle
* For a provided hostname, returns the robots.txt
 * information stored in the robot table: [HOSTNAME,
* CAPTURE_TIME, CRAWL_DELAY, ROBOT_PATHS => [ALLOWED_SITES,
-     * DISALLOWED_SITES], FLAGS (for not whether should wait for notification
-     * from a schedule being downloaded before continuing crawling the site).
+     * DISALLOWED_SITES]]. (The FLAGS field is no longer stored; whether a
+     * host must wait on a schedule download before being crawled further is
+     * now tracked via the DelayedHosts folder.)
*
* @param string $host hostname to look up robots.tx info for.
diff --git a/src/library/processors/RobotProcessor.php b/src/library/processors/RobotProcessor.php
index e4d4c453e..c7db7bc68 100644
--- a/src/library/processors/RobotProcessor.php
+++ b/src/library/processors/RobotProcessor.php
@@ -149,7 +149,12 @@ class RobotProcessor extends PageProcessor
$tmp_url = UrlParser::canonicalLink($value, $host_url);
if (!UrlParser::checkRecursiveUrl($tmp_url)
&& strlen($tmp_url) < C\MAX_URL_LEN) {
- $summary[self::LINKS][] = $tmp_url;
+ /*
+                            Sometimes sitemap file urls don't end in xml,
+                            so add a #sitemap fragment to flag them as
+                            sitemaps when determining the queue tier
+ */
+ $summary[self::LINKS][] = $tmp_url . "#sitemap";
}
break;
case "allow":