viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php /** * SeekQuarry/Yioop -- * Open Source Pure PHP Search Engine, Crawler, and Indexer * * Copyright (C) 2009 - 2022 Chris Pollett chris@pollett.org * * LICENSE: * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <https://www.gnu.org/licenses/>. * * END LICENSE * * @author Chris Pollett chris@pollett.org * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ * @copyright 2009 - 2022 * @filesource */ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; use seekquarry\yioop\library\compressors\NonCompressor; /** * Used for the crawlHash function */ require_once __DIR__ . '/Utility.php'; /** * Encapsulates the data structures needed to have a queue of to crawl urls * * @author Chris Pollett */ class CrawlQueueBundle { /** * The folder name of this CrawlQueueBundle * @var string */ public $dir_name; /** * number of entries the priority queue used by this web queue bundle * can store * @var int */ public $num_urls_ram; /** * Number items that can be stored in a partition of the page exists filter * bundle * @var int */ public $filter_size; /** * Array of hosts for which a robots.txt file has just been received and * processed for which urls from that host are still waiting to be notified * for queueing. * @var array */ public $robot_notify_hosts; /** * LinearHashTable of information about company level domains that have * been crawled. Information includes number of SEEN_URLS, number of * WEIGHTED_SEEN_URLS, number of WEIGHTED_INCOMING_URLS. * (A company level domain is google.com or google.co.uk, but not * fo.la.google.com, www.google.com, foo.google.com or foo.google.co.uk) * @var LinearHashTable */ public $domain_table; /** * host-ip table used for dns look-up, comes from robot.txt data and * deleted with same frequency * @var object */ public $dns_table; /** * RAM cache of recent robot.txt stuff crawlHash(host) => robot.txt info * @var array */ public $robot_cache = []; /** * Time when cache of recent robot.txt for host done * crawlHash(host) => timestamp * @var array */ public $robot_cache_times = []; /** * BloomFilter used to keep track of which urls we've already seen * @var object */ public $url_exists_filter_bundle; /** * HashTable used to store offsets into WebArchive that stores robot paths * @var LinearHashTable */ public $robot_table; /** * Holds etag and expires http data * @var LinearHashTable */ public $etag_table; /** * Number of bytes in for hash table key */ const HASH_KEY_SIZE = 8; /** * Length of an IPv6 ip address (IPv4 address are padded) */ const IP_SIZE = 16; /** * Url type flag */ const NO_FLAGS = 0; /** * Url type flag */ const WAITING_HOST = 1; /** Size of int */ const INT_SIZE = 4; /** Size of notify buffer */ const MAX_URL_FILE_SIZE = 1000000; /** * When writing urls to robot_table, how many to buffer at a time and * then bulk put. */ const MAX_URL_BUFFER_BEFORE_WRITE = 500; /** * File extension to used for files of serialized url data */ const URL_FILES_EXTENSION = ".txt.gz"; /** * Number of bytes in for hash table key */ const CRAWL_DELAYED_FOLDER = "CrawlDelayedHosts"; /** * Number of bytes in for hash table key */ const URL_QUEUE_FOLDER = "UrlQueue"; /** * Number of bytes in for hash table key */ const ROBOT_WAIT_FOLDER = "WaitRobotUrls"; /** * Makes a CrawlQueueBundle with the provided parameters * * @param string $dir_name folder name used by this CrawlQueueBundle * @param int $filter_size size of each partition in the page exists * BloomFilterBundle * @param int $num_urls_ram number of entries in ram for the priority queue */ public function __construct($dir_name, $filter_size, $num_urls_ram) { $this->dir_name = $dir_name; $this->filter_size = $filter_size; $this->num_urls_ram = $num_urls_ram; $this->robot_notify_hosts = []; if (!file_exists($this->dir_name)) { mkdir($this->dir_name); } if (!file_exists($this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER)) { mkdir($this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER); } if (!file_exists($this->dir_name . "/" . self::ROBOT_WAIT_FOLDER)) { mkdir($this->dir_name . "/" . self::ROBOT_WAIT_FOLDER); } if (!file_exists($this->dir_name . "/" . self::URL_QUEUE_FOLDER)) { mkdir($this->dir_name . "/" . self::URL_QUEUE_FOLDER); } /* Hash table containing DNS cache this is cleared whenever robot filters cleared */ if (file_exists($dir_name . "/dns_table.dat")) { $this->dns_table = HashTable::load($dir_name . "/dns_table.dat"); } else { $this->dns_table = new HashTable($dir_name . "/dns_table.dat", 4 * $num_urls_ram, self::HASH_KEY_SIZE, self::IP_SIZE); } //filter bundle to check if we have already visited a URL $this->url_exists_filter_bundle = new BloomFilterBundle( $dir_name . "/UrlExistsFilterBundle", $filter_size); // set-up table for each host to keep track of number of urls downloaded $this->domain_table = new LinearHashTable($dir_name . "/CLDData", ["PRIMARY KEY" => "COMPANY_LEVEL_DOMAIN", "SEEN_URLS" => "INT", "WEIGHTED_SEEN_URLS" => "DOUBLE", "WEIGHTED_INCOMING_URLS" => "DOUBLE"]); //set up storage for robots.txt info $this->robot_table = new LinearHashTable($dir_name . "/RobotData", ["PRIMARY KEY" => "HOSTNAME", "CAPTURE_TIME" => "INT", "CRAWL_DELAY" => "INT", "ROBOT_PATHS" => "SERIAL", "FLAGS" => "INT"], LinearHashTable::MAX_ITEMS_PER_FILE, LinearHashTable::PARTITION_SIZE_THRESHOLD, C\NS_COMPRESSORS . "GzipCompressor"); //Initialize table for cache page validation data $this->etag_table = new LinearHashTable($dir_name . '/EtagExpiresInfo', ["PRIMARY KEY" => "URL", "ETAG" => "TEXT", "EXPIRES" => "INT"]); $this->notify_buffer = []; } //Filter and Filter Bundle Methods /** * Adds the supplied url to the url_exists_filter_bundle * @param string $url url to add */ public function addSeenUrlFilter($url) { $this->url_exists_filter_bundle->add($url); } /** * Removes all url objects from $url_array which have been seen * @param array &$url_array objects to check if have been seen * @param array $field_names an array of components of a url_array element * which contain a url to check if seen. If null, assumes url_array * is just and array of urls not an array of url infos (i.e., array of * array), and just directly checks those strings */ public function differenceSeenUrls(&$url_array, $field_names = null) { $this->url_exists_filter_bundle->differenceFilter( $url_array, $field_names); } /** * Returns the timestamp of the last time host's robots.txt file was * downloaded * @param string $host url to check * @return int|bool returns false if no capture of robots.txt yet, * otherwise returns an integer timestamp */ public function gotRobotTxtTime($host) { $row = $this->robot_table->get($host, ["CAPTURE_TIME"]); if ($row !== false) { return $row["CAPTURE_TIME"]; } return false; } /** * Adds an array of url tuples to the queue of urls of waiting for * robots.txt files to be received. This queue consists of a folder * CrawlQueueBundle::ROBOT_WAIT_FOLDER with subfolders the hash of the * name of a host that doesn't have a robots.txt file received yet. * $url_tuple are then sorted to the appropriate host subfolder and are * stored in subfolders by the day recieved and then a file in a sequence * files according to order received. Each file in the sequence is able * to store 1MB compressed many url tuples. * * @param array $url_tuples array of tuples of the form * (url, weight, referer) */ public function addWaitRobotQueue($url_tuples) { $robot_wait_dir = $this->dir_name . "/" . self::ROBOT_WAIT_FOLDER; //Group by host $tuples_by_host = []; foreach ($url_tuples as $url_tuple) { list($url, ) = $url_tuple; $host = UrlParser::getHost($url); $hash_host = crawlHash($host); $tuples_by_host[$hash_host] ??= []; $tuples_by_host[$hash_host][] = $url_tuple; } foreach($tuples_by_host as $hash_host => $host_tuples) { $hash_host_dir = $robot_wait_dir . "/$hash_host"; $this->addUrlsDirectory($hash_host_dir, $host_tuples); } } /** * Adds an array of url tuples to the queue of urls about to be scheduled * into fetches batches to be downloaded by fetchers. This queue consists * of tiers. Url tuples are sorted into a tier based on the number of * urls that have been downloaded for that url's host and their weight. * Naively, without weight, a url goes into tier * floor(log(# of urls downloaded already for its host)) * Within a tier, urls are stored in folders by day recieved and then into * a file from a sequence of files according to order received. Each file * in the sequence is able to store 1MB compressed many url tuples. * * @param array $url_tuples array of tuples of the form * (url, weight, referer) * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or * CrawlConstants::HOST_BUDGETING */ public function addSendFetcherQueue($url_tuples, $crawl_order) { $url_queue_folder = $this->dir_name . "/" . self::URL_QUEUE_FOLDER; //Group by host $seen_clds = []; $out_queues = []; foreach ($url_tuples as $url_tuple) { list($url, ) = $url_tuple; $cld = UrlParser::getCompanyLevelDomain($url); if (isset($seen_clds[$cld])) { $cld_data = $seen_clds[$cld]; } else { $cld_data = $this->domain_table->get($cld); } if (empty($cld_data)) { $cld_data = ['COMPANY_LEVEL_DOMAIN' => $cld, 'SEEN_URLS' => 0, 'WEIGHTED_SEEN_URLS' => 0.0, 'WEIGHTED_INCOMING_URLS' => 0.0]; } $cld_data = $this->updateCompanyLevelDomainData($url_tuple, $cld_data, $crawl_order); $tier = $this->computeTierUrl($url_tuple, $cld_data, $crawl_order); $out_queues[$tier] ??= []; $out_queues[$tier][] = $url_tuple; $seen_clds[$cld] = $cld_data; } foreach ($seen_clds as $cld => $cld_data) { $this->domain_table->put($cld_data); } foreach ($out_queues as $tier => $url_tuples) { $out_folder = "$url_queue_folder/Tier$tier"; $this->addUrlsDirectory($out_folder, $url_tuples); } } /** * Computes an update to the company level domain data provided in * cld_data, updating the WEIGHTED_SEEN_URLS and WEIGHTED_INCOMING_URLS * fields according to information about a discovered url in $url_tuple * * @param array $url_tuple $url_tuple 5-tuple contains a url, its weight, * the depth in the crawl where it was found, the url that refered to it, * and thaturl's weight * @param array $cld_data company level domain data to update * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or * CrawlConstants::HOST_BUDGETING * @return int tier $url should be queue into */ public function updateCompanyLevelDomainData($url_tuple, $cld_data, $crawl_order) { list($url, $weight, $depth, $linking_url, $linking_weight) = $url_tuple; $cld_data['SEEN_URLS']++; if ($crawl_order == CrawlConstants::BREADTH_FIRST) { return $cld_data; } $cld_data['WEIGHTED_SEEN_URLS'] += min(1, 1 + log(1 + $weight, 5)); $linking_cld = UrlParser::getCompanyLevelDomain($linking_url); $linking_cld_data = $this->domain_table->get($linking_cld); if (!empty($linking_cld_data)) { $linking_url_tier = floor(log10(min(1, $linking_cld_data['WEIGHTED_SEEN_URLS'] - $linking_cld_data['WEIGHTED_INCOMING_URLS']))); if ($cld_data['COMPANY_LEVEL_DOMAIN'] != $linking_cld_data['COMPANY_LEVEL_DOMAIN']) { $cld_data['WEIGHTED_INCOMING_URLS'] += 1/( (1.1 + $linking_url_tier + log(1 + $linking_weight, 5))); } } return $cld_data; } /** * Used to compute which send-fetcher-queue tier a url should be added * to based, on the data related to the url in $url_tuple, * its company level domain data, and the crawl order being used * * @param array $url_tuple 5-tuple contains a url, its weight, the depth * in the crawl where it was found, the url that refered to it, and that * url's weight * @param array $cld_data * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or * CrawlConstants::HOST_BUDGETING * @return int tier $url should be queue into */ public function computeTierUrl($url_tuple, $cld_data, $crawl_order) { list($url, $weight, $depth, ) = $url_tuple; if ($crawl_order == CrawlConstants::BREADTH_FIRST) { return $depth; } $tier = floor(log10(max(1, $cld_data['WEIGHTED_SEEN_URLS'] - $cld_data['WEIGHTED_INCOMING_URLS']))); $robots_txt = "robots.txt"; // put sitemaps in a higher queue if (in_array(substr($url, -3), [".gz", ".bz", "xml"])) { $tier += C\SITEMAP_TIER_PENALTY; } else if (UrlParser::guessMimeTypeFromFileName($url) != "text/html" && substr($url, -strlen($robots_txt)) != $robots_txt) { //slightly penalize non html documents $tier++; } if (C\nsdefined('VERBOSE_LOGGING') && C\VERBOSE_LOGGING) { crawlLog("Computed tier $tier for $url based on seen score " . $cld_data['WEIGHTED_SEEN_URLS'] . " and incoming score " . $cld_data['WEIGHTED_INCOMING_URLS']); } return $tier; } /** * This method is used to send urls that are in the waiting hosts folder * for hosts listed in $this->robot_notify_hosts * to be received to be moved to the queue because host membership in * $this->robot_notify_hosts indicates that a robots.txt * file has just been received for the particular domain. * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or * CrawlConstants::HOST_BUDGETING */ public function processReceivedRobotTxtUrls($crawl_order) { $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; $db = new $db_class(); $robot_wait_dir = $this->dir_name . "/" . self::ROBOT_WAIT_FOLDER; $robot_notify_hosts = $this->robot_notify_hosts; $len_dot_txt_gz = strlen(self::URL_FILES_EXTENSION); foreach ($robot_notify_hosts as $host) { $hash_host = crawlHash($host); $host_subfolder = "$robot_wait_dir/$hash_host"; if (!file_exists($host_subfolder)) { continue; } $day_folders = $this->getDayFolders($host_subfolder); if (empty($day_folders)) { continue; } foreach ($day_folders as $day_folder) { $url_files = $this->getUrlsFiles($day_folder); if (empty($url_files)) { continue; } foreach ($url_files as $url_file) { $url_info = $this->getUrlsFileContents($url_file); if (is_array($url_info)) { $this->addSendFetcherQueue($url_info, $crawl_order); } unlink($url_file); } } $db->unlinkRecursive("$robot_wait_dir/$hash_host", true); } $this->robot_notify_hosts = []; } /** * Returns an array of all the days folders for a crawl queue. * By design queues in a CrawlQueueBundle consist of a sequence of * subfolders with day timestamps (floor(unixstamp/86400)), and then * files within these folders. This function returns a list of the * day folder paths for such a queue. * Note this function assumes that there aren't too many days to exceed * memory. If a crawl runs at most a few years, this should be the case * @param string $dir folder qhich is acting as a CrawlQueueBundle queue * @return array of paths to day folders */ public function getDayFolders($dir) { $digit = "[0123456789]"; $folders = glob("$dir/$digit*", GLOB_ONLYDIR); return $folders; } /** * Returns an array of all the url info files in a queue subfolder of * a queue for a CrawlQueueBundle. Url info files are usually stored * in a file with a nine digit number followed by the queues file * extension (usually .txt.gz) and store up to 1MB of compressed url info. * This function assumes the paths to the number of url info files in the * provided can fit in memory * * @param string $dir folder containing url info files * @return array of paths to each url info file found. */ public function getUrlsFiles($dir) { $digit = "[0123456789]"; $files = glob("$dir/$digit*" . self::URL_FILES_EXTENSION); return $files; } /** * Returns the unserialized contents of a url info file after decompression. * Assumes the resulting structure is small enough to fit in memory * * @param string $file_name name of url info file * @return array of uncompressed, unserialized contents of this file. */ public function getUrlsFileContents($file_name) { $contents = gzdecode(file_get_contents($file_name)); return unserialize($contents); } /** * Serializes and compress the url info (such as url tuples (url, weight, * referer)) provided in $url_data and save the results into $file_name * @param string $file_name name of file to store unrl info into * @param array $url_data data to be serialized, compressed, and stored. */ public function putUrlsFileContents($file_name, $url_data) { $contents = gzencode(serialize($url_data)); return file_put_contents($file_name, $contents); } /** * Adds the url info (such as url tuples (url, weight, referer)) to * the appropriate file in a subfolder of the folder $dir used to * implement a CrawlBundleQueue. If $timestamp is 0, then will store * data in $dir/current day time stamp/last_file_in_folder.txt.gz . If * last file exceed 1MB a new last file is started. If $timestamp > 0 * then data is stored in * $dir/$timestamp's day time stamp/$timestamp.txt.gz * * @param string $dir folder to store data into a subfolder of * @param array $url_info information to serialized, compress, and store * @param int $timestamp to use during storage to determine path as * described above */ public function addUrlsDirectory($dir, $url_info, $timestamp = 0) { if (!file_exists($dir)) { mkdir($dir); chmod($dir, 0777); } $time = ($timestamp == 0) ? time() : $timestamp; $day = floor($time/C\ONE_DAY); $dir .= "/$day"; if (!file_exists($dir)) { mkdir($dir); chmod($dir, 0777); } $out_file_data = []; if ($timestamp > 0) { $out_file = "$dir/" . sprintf("%'.09d", $timestamp). self::URL_FILES_EXTENSION; if (file_exists($out_file)) { $out_file_data = unserialize(gzdecode( file_get_contents($out_file))); if (!is_array($out_file_data)) { $out_file_data = []; } } } else { $count_file = "$dir/count.txt"; //using numbers rather than timestamps prevents race conditions $original_file_num = 1; if (file_exists($count_file)) { $original_file_num = max(intval(file_get_contents( $count_file)), 1); } $file_num = $original_file_num; $out_file = "$dir/" . sprintf("%'.09d", $file_num) . self::URL_FILES_EXTENSION; while (file_exists($out_file)) { $out_file_size = filesize($out_file); if ($out_file_size < self::MAX_URL_FILE_SIZE) { $out_file_data = unserialize(gzdecode( file_get_contents($out_file))); if (!is_array($out_file_data)) { $out_file_data = []; } break; } $file_num++; $out_file = "$dir/" . sprintf("%'.09d", $file_num) . self::URL_FILES_EXTENSION; } } $out_file_data = array_merge($out_file_data, $url_info); $this->putUrlsFileContents($out_file, $out_file_data); } /** * Returns the path to the send-fetcher-queue tier to use to make the next * fetch batch of urls to download. * @param string $crawl_order one of CrawlConstants::BREADTH_FIRST or * CrawlConstants::HOST_BUDGETING * * @return string path to send-fetcher-queue tier */ public function chooseFetchBatchQueueFolder($crawl_order) { static $last_folder = 0; static $exp_max_folder = 1; $url_queue_folder = $this->dir_name . "/" . self::URL_QUEUE_FOLDER; $sub_dirs = glob("$url_queue_folder/*", GLOB_ONLYDIR); if (empty($sub_dirs)) { return false; } if ($crawl_order == CrawlConstants::BREADTH_FIRST) { $is_empty = true; foreach($sub_dirs as $sub_dir) { $day_folders = $this->getDayFolders($sub_dir); if (!empty($day_folders)) { $is_empty = false; break; } } return ($is_empty) ? false : $sub_dir; } /* the hope of the following is to prevent looking at sitemaps too early in the crawl before all the seed sites are donwloaded */ $exp_max_folder++; $pre_max_folder = ceil(log($exp_max_folder, 2)); if ($pre_max_folder >= C\SITEMAP_TIER_PENALTY) { $pre_max_folder = count($sub_dirs); } $max_folder = min(count($sub_dirs), $pre_max_folder); $last_folder = ($last_folder < $max_folder - 1) ? $last_folder + 1 : 0; return $sub_dirs[$last_folder]; } /** * For a timestamp $schedule_time of a fetch batch of urls to be downloaded * and for a list of crawl-delayed hosts in that batch, add the hosts to * a a $schedule time file in the CrawlDelayedHosts queue so they can be * notified when that fetch batch is done processing. Until notified any * url from one of these crawl delayed hosts will be rescheduled rather than * but in a fetch batch for download. * * @param int schedule_time * @param array $host_urls array of urls for hosts that are crawl delayed * and for which there is a schedule currently running on fetchers * which might download from that host */ public function addCrawlDelayedHosts($schedule_time, $host_urls) { $crawl_delayed_folder = $this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER; $this->addUrlsDirectory($crawl_delayed_folder, $host_urls, $schedule_time); } /** * For each host in the crawl-delayed hosts queue waiting on the * fetch batch schedule with $timestamp timestamp, clear their FLAGS * variable in the robot table so that urls with this host are allowed to * be scheduled into future fetch batches for download. * * @param int $timestamp of a fetch batch schedule to notify * crawl-delayed hosts that it has completed download. */ public function notifyCrawlDelayedHosts($timestamp) { crawlLog("Scheduler: Notifying hosts that were crawl delayed by ". "Schedule $timestamp"); $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; $db = new $db_class(); $crawl_delayed_folder = $this->dir_name . "/" . self::CRAWL_DELAYED_FOLDER; $day_folders = $this->getDayFolders($crawl_delayed_folder); //maximum crawl delay will honor is one day $yesterday = floor((time() - C\ONE_DAY)/C\ONE_DAY); if(empty($day_folders)) { return; //no one is waiting } $robot_rows = []; foreach ($day_folders as $day_folder) { $day_timestamp = intval(substr($day_folder, -9)); if ($day_timestamp >= $yesterday) { continue; } $waiting_host_files = $this->getUrlsFiles($day_folder); if (!empty($waiting_host_files)) { foreach ($waiting_host_files as $waiting_host_file) { $robot_rows = $this->processWaitingHostFile( $waiting_host_file, $robot_rows); } } $db->unlinkRecursive($day_folder); } $stamp_day = floor($timestamp/C\ONE_DAY); $file_name = "$crawl_delayed_folder/$stamp_day/". sprintf("%'.09d", $timestamp) . self::URL_FILES_EXTENSION; if (file_exists($file_name)) { $robot_rows = $this->processWaitingHostFile($file_name, $robot_rows); if (!empty($robot_rows)) { if (C\nsdefined('VERBOSE_LOGGING') && C\VERBOSE_LOGGING) { crawlLog( "Scheduler: Notifying the following list of hosts:"); $i = 0; foreach ($robot_rows as $robot_row) { $i++; crawlLog("$i. ". $robot_row['HOSTNAME'] ?? ""); } } $this->robot_table->put($robot_rows); } } } /** * Used by @see notifyCrawlDelayedHosts($timestamp). * For each host listed in the file $file_name get its robot info from * robot_table, clear its FLAG column, store the update into * a temporary array $robot_rows. Every MAX_URL_BUFFER_BEFORE_WRITE * many such hosts, write the updates in $robot_rows back to the * robot_table on disk. If last batch of modified rows has been written * when done file, return these in $robot_rows * * @param string $file_name to get hosts to clear flag columns of * @param array $robot_rows rows of updated hosts potentially from a * previously processed file. * @return array $robot_rows leftover updated robot host rows that haven't * been written to disk yet */ public function processWaitingHostFile($file_name, $robot_rows) { $waiting_hosts = $this->getUrlsFileContents($file_name); if (empty($waiting_hosts)) { return []; } foreach ($waiting_hosts as $waiting_host) { $robot_data = $this->robot_table->get($waiting_host); $robot_data["FLAGS"] = 0; $robot_rows[] = $robot_data; if (count($robot_rows) > self::MAX_URL_BUFFER_BEFORE_WRITE) { crawlLog("Scheduler: Notifying the following list of hosts ". print_r($robot_rows, true)); $this->robot_table->put($robot_rows); $robot_rows = []; } } return $robot_rows; } /** * Checks if the given $url is allowed to be crawled based on stored * robots.txt info. * * @param string $url to check * @return bool whether it was allowed or not */ public function checkRobotOkay($url) { list($host, $path) = UrlParser::getHostAndPath($url, true, true); $path = urldecode($path); $robot_data = $this->getRobotData($host); $robot_paths = $robot_data["ROBOT_PATHS"] ?? []; //these should have been urldecoded in RobotProcessor if (empty($robot_paths)) { $robots_okay = false; $robots_not_okay = true; } else { $robots_okay = true; $robots_not_okay = false; } if (!empty($robot_paths[CrawlConstants::DISALLOWED_SITES])) { $robots_not_okay = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::DISALLOWED_SITES]); $robots_okay = !$robots_not_okay; } if (!empty($robot_paths[CrawlConstants::ALLOWED_SITES])) { $robots_okay = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::ALLOWED_SITES]); } return $robots_okay || !$robots_not_okay; } /** * For a provided hostname, returns the robots.txt * information stored in the the robot table: [HOSTNAME, * CAPTURE_TIME, CRAWL_DELAY, ROBOT_PATHS => [ALLOWED_SITES, * DISALLOWED_SITES], FLAGS (for not whether should wait for notification * from a schedule being downloaded before continuing crawling the site). * * @param string $host hostname to look up robots.tx info for. * (no trailing / in hostname. i.e., https:/www.yahoo.com, not * https:/www.yahoo.com/) * @return array robot table row as described above */ public function getRobotData($host) { $key = crawlHash($host, true); if (isset($this->robot_cache[$key])) { $robot_data = $this->robot_cache[$key]; $this->robot_cache_times[$key] = microtime(true); } else { $robot_data = $this->robot_table->get($host); if (!empty($robot_data)) { $this->robot_cache[$key] = $robot_data; $cache_now = microtime(true); $this->robot_cache_times[$key] = $cache_now; if (count($this->robot_cache) > C\SIZE_ROBOT_TXT_CACHE) { asort($this->robot_cache_times); reset($this->robot_cache_times); $evict_key = key($this->robot_cache_times); unset($this->robot_cache_times[$evict_key], $this->robot_cache[$evict_key]); } } } return $robot_data; } /** * Gets the timestamp of the oldest dns address still stored in * the queue bundle * @return int a Unix timestamp */ public function getDnsAge() { $time = time(); if (file_exists($this->dir_name . "/dns_timestamp.txt")) { $creation_time = intval( file_get_contents($this->dir_name . "/dns_timestamp.txt")); } else { $creation_time = $time; } return ($time - $creation_time); } /** * Add an entry to the web_queue_bundles DNS cache * * @param string $host hostname to add to DNS Lookup table * @param string $ip_address in presentation format (not as int) to add * to table */ public function addDNSCache($host, $ip_address) { $pad = "000000000000"; $hash_host = crawlHash($host, true); $packed_ip = inet_pton($ip_address); if (strlen($packed_ip) == 4) { $packed_ip .= $pad; } $this->dns_table->insert($hash_host, $packed_ip); } /** * Used to lookup an entry in the DNS cache * * @param string $host hostname to add to DNS Lookup table * @return string ipv4 or ipv6 address written as a string */ public function dnsLookup($host) { $pad = "000000000000"; $hash_host = crawlHash($host, true); $packed_ip = $this->dns_table->lookup($hash_host); if (!$packed_ip) return false; $maybe_pad = substr($packed_ip, 4); $maybe_ip4 = substr($packed_ip, 0, 4); if (strcmp($maybe_pad, $pad) == 0) { $ip_address = inet_ntop($maybe_ip4); } else { $ip_address = inet_ntop($packed_ip); } if (strcmp($ip_address, "0.0.0.0") == 0) { return false; } return $ip_address; } /** * Gets the timestamp of the oldest url filter data still stored in * the queue bundle * @return int a Unix timestamp */ public function getUrlFilterAge() { $creation_time = intval( file_get_contents($this->dir_name."/url_timestamp.txt")); return (time() - $creation_time); } /** * Delete the Hash table used to store DNS lookup info. * Then construct an empty new one. * This is called roughly once a day at the same time as * @see emptyRobotFilters() * * @return string $message with what happened during empty process */ public function emptyDNSCache() { $num_values = $this->dns_table->num_values; if (file_exists($this->dir_name . "/dns_table.dat") ) { unlink($this->dir_name . "/dns_table.dat"); } $this->dns_table = null; garbageCollect(); $this->dns_table = new HashTable($this->dir_name . "/dns_table.dat", $num_values, self::HASH_KEY_SIZE, self::IP_SIZE); if ($this->dns_table) { $message = "Robot Emptier: dns_table empty now ". "and not null\n"; } else { $message = "Robot Emptier: dns_table could not be ". "reinitialized\n"; } return $message; } /** * Empty the crawled url filter for this web queue bundle; resets the * the timestamp of the last time this filter was emptied. */ public function emptyUrlFilter() { file_put_contents($this->dir_name."/url_timestamp.txt", time()); $this->url_exists_filter_bundle->reset(); } }