<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2014 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage library
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2014
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
 * We use a variety of bloom filters for handling robots.txt data
 */
require_once 'bloom_filter_file.php';
/**
 * Data on which urls we've already crawled is stored in a bloom filter bundle
 */
require_once 'bloom_filter_bundle.php';
/**
 * Priority queue is used to store the 8-byte ids of urls to crawl next
 */
require_once 'priority_queue.php';
/**
 * Hash table is used to store, for each id in the priority queue, an offset
 * into a web archive where that id's actual complete url is stored
 */
require_once 'hash_table.php';
/**
 * Urls are stored in a web archive using a compressor that does no compression
 */
require_once BASE_DIR.'/lib/compressors/non_compressor.php';
/**
 * Used to store to-crawl urls
 */
require_once 'web_archive.php';
/**
 * Used for the getHost function
 */
require_once 'url_parser.php';
/**
 * Used for the crawlHash function
 */
require_once 'utility.php';
/**
 * B-Tree class for storing Etag and Expires http header data
 */
require_once "btree.php";
/**
 * Needed for the robots.txt related crawl constants
 */
require_once 'crawl_constants.php';
/**
 * Encapsulates the data structures needed to have a queue of to-crawl urls
 *
 * <pre>
 * (hash of url, weight) pairs are stored in a PriorityQueue,
 * (hash of url, index in PriorityQueue, offset of url in WebArchive) triples
 * are stored in a HashTable,
 * urls are stored in a WebArchive in an uncompressed format
 * </pre>
 *
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage library
 */
class WebQueueBundle implements Notifier
{
    /**
     * The folder name of this WebQueueBundle
     * @var string
     */
    var $dir_name;
    /**
     * Number of items that can be stored in a partition of the page exists
     * filter bundle
     * @var int
     */
    var $filter_size;
    /**
     * Number of entries the priority queue used by this web queue bundle
     * can store
     * @var int
     */
    var $num_urls_ram;
    /**
     * Whether polling the first element of the priority queue returns the
     * smallest or largest weighted element. This is set to a constant
     * specified in PriorityQueue
     * @var int
     */
    var $min_or_max;
    /**
     * The PriorityQueue used by this WebQueueBundle
     * @var object
     */
    var $to_crawl_queue;
    /**
     * The HashTable used by this WebQueueBundle
     * @var object
     */
    var $to_crawl_table;
    /**
     * Current count of the number of non-read operations performed on the
     * WebQueueBundle's hash table since the last time it was rebuilt.
     * @var int
     */
    var $hash_rebuild_count;
    /**
     * Number of non-read operations on the hash table before it needs to be
     * rebuilt.
     * @var int
     */
    var $max_hash_ops_before_rebuild;
    /**
     * WebArchive used to store urls that are to be crawled
     * @var object
     */
    var $to_crawl_archive;
    /**
     * BloomFilter used to keep track of which urls we've already seen
     * @var object
     */
    var $url_exists_filter_bundle;
    /**
     * BloomFilter used to store which hosts' robots.txt files we have
     * already downloaded
     * @var object
     */
    var $got_robottxt_filter;
    /**
     * Host-ip table used for dns look-up; comes from robots.txt data and
     * is deleted with the same frequency
     * @var object
     */
    var $dns_table;
    /**
     * HashTable used to store offsets into the WebArchive that stores robot
     * paths
     * @var object
     */
    var $robot_table;
    /**
     * WebArchive used to store paths coming from robots.txt files
     * @var object
     */
    var $robot_archive;
    /**
     * BloomFilter used to keep track of the crawl delay in seconds for a
     * given host
     * @var object
     */
    var $crawl_delay_filter;
    /**
     * Holds the B-Tree used for saving etag and expires http data
     */
    var $etag_btree;
    /**
     * Associative array of (hash_url => index) pairs saying where a hash_url
     * has moved in the priority queue. These moves are buffered and later
     * re-stored in the hash table when notifyFlush is called
     * @var array
     */
    var $notify_buffer;
    /**
     * The largest offset for the url WebArchive before we rebuild it.
     * Entries are never deleted from the url WebArchive even if they are
     * deleted from the priority queue. So when we pass this value we
     * make a new WebArchive containing only those urls which are still in
     * the queue.
     */
    const max_url_archive_offset = 200000000;
    /**
     * Number of bytes in a hash table key
     */
    const HASH_KEY_SIZE = 8;
    /**
     * 4 bytes offset, 4 bytes index, 4 bytes flags
     */
    const HASH_VALUE_SIZE = 12;
    /**
     * Length of an IPv6 ip address (IPv4 addresses are padded)
     */
    const IP_SIZE = 16;
    /**
     * Url type flag
     */
    const NO_FLAGS = 0;
    /**
     * Url type flag
     */
    const ROBOT = 1;
    /**
     * Url type flag
     */
    const SCHEDULABLE = 2;
    /** Size of an int in bytes **/
    const INT_SIZE = 4;
    /** Size of the notify buffer **/
    const NOTIFY_BUFFER_SIZE = 1000;
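    /*
     * Illustrative sketch: HASH_VALUE_SIZE reflects how this class packs each
     * hash table value with the packInt()/unpackInt() helpers from
     * utility.php, assuming they pack and unpack 4-byte integers:
     *
     *   $value = packInt($offset) . packInt($queue_index) . packInt($flag);
     *   $offset      = unpackInt(substr($value, 0, 4)); // into url WebArchive
     *   $queue_index = unpackInt(substr($value, 4, 4)); // slot in PriorityQueue
     *   $flag        = unpackInt(substr($value, 8, 4)); // NO_FLAGS/ROBOT/...
     *
     * addUrlsQueue(), peekQueue(), setQueueFlag(), and notifyFlush() below all
     * read or write slices of this 12-byte layout.
     */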
    /**
     * Makes a WebQueueBundle with the provided parameters
     *
     * @param string $dir_name folder name used by this WebQueueBundle
     * @param int $filter_size size of each partition in the page exists
     *     BloomFilterBundle
     * @param int $num_urls_ram number of entries in ram for the priority
     *     queue
     * @param string $min_or_max whether the priority queue maintains the heap
     *     property with respect to the least or the largest weight
     */
    function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max)
    {
        $this->dir_name = $dir_name;
        $this->filter_size = $filter_size;
        $this->num_urls_ram = $num_urls_ram;
        $this->min_or_max = $min_or_max;
        if(!file_exists($this->dir_name)) {
            mkdir($this->dir_name);
        }
        /*
            if we are resuming a crawl we discard the old priority queue and
            associated hash table and archive; new queue data will be read in
            from any existing schedule
        */
        // set up the priority queue... stores (hash(url), weight) pairs.
        $this->to_crawl_queue = new PriorityQueue($dir_name."/queue.dat",
            $num_urls_ram, self::HASH_KEY_SIZE, $min_or_max, $this, 0);
        /* set up the hash table... stores (hash(url), offset into url archive,
           index in priority queue) triples.

           to ensure we can always insert into the table (because of how
           deletions work) we will periodically want to rebuild it; we will
           also want to give a little more slack than the usual twice the
           number of entries we want to insert
         */
        $this->to_crawl_table = $this->constructHashTable(
            $dir_name."/hash_table.dat", 8 * $num_urls_ram);
        /* set up the url archive, used to store the full text of the urls
           which are on the priority queue
         */
        $url_archive_name = $dir_name."/url_archive".
            NonCompressor::fileExtension();
        if(file_exists($url_archive_name)) {
            unlink($url_archive_name);
        }
        $this->to_crawl_archive = new WebArchive(
            $url_archive_name, new NonCompressor(), false, true);
        //timestamp for url filters (so can delete if they get too old)
        if(!file_exists($dir_name."/url_timestamp.txt")) {
            file_put_contents($dir_name."/url_timestamp.txt", time());
        }
        //filter bundle to check if we have already visited a URL
        $this->url_exists_filter_bundle = new BloomFilterBundle(
            $dir_name."/UrlExistsFilterBundle", $filter_size);
        //timestamp for robot filters (so can delete if they get too old)
        if(!file_exists($dir_name."/robot_timestamp.txt")) {
            file_put_contents($dir_name."/robot_timestamp.txt", time());
        }
        //filter to check if we already have a copy of a robots.txt file
        if(file_exists($dir_name."/got_robottxt.ftr")) {
            $this->got_robottxt_filter = BloomFilterFile::load(
                $dir_name."/got_robottxt.ftr");
        } else {
            $this->got_robottxt_filter = new BloomFilterFile(
                $dir_name."/got_robottxt.ftr", $filter_size);
        }
        /* Hash table containing the DNS cache; this is cleared whenever the
           robot filters are cleared
         */
        if(file_exists($dir_name."/dns_table.dat")) {
            $this->dns_table = HashTable::load($dir_name."/dns_table.dat");
        } else {
            $this->dns_table = new HashTable($dir_name."/dns_table.dat",
                4 * $num_urls_ram, self::HASH_KEY_SIZE, self::IP_SIZE);
        }
        //set up storage for robots.txt info
        $robot_archive_name = $dir_name."/robot_archive".
            NonCompressor::fileExtension();
        $this->robot_archive = new WebArchive(
            $robot_archive_name, new NonCompressor(), false, true);
        if(file_exists($dir_name."/robot.dat")) {
            $this->robot_table = HashTable::load($dir_name."/robot.dat");
        } else {
            $this->robot_table = new HashTable($dir_name."/robot.dat",
                16 * $num_urls_ram, self::HASH_KEY_SIZE, self::INT_SIZE);
        }
        //filter to check for and determine crawl delay
        if(file_exists($dir_name."/crawl_delay.ftr")) {
            $this->crawl_delay_filter =
                BloomFilterFile::load($dir_name."/crawl_delay.ftr");
        } else {
            $this->crawl_delay_filter =
                new BloomFilterFile($dir_name."/crawl_delay.ftr",
                    $filter_size);
        }
        //Initialize B-Tree for storing cache page validation data
        $this->etag_btree = new BTree($dir_name.'/EtagExpiresTree');
        $this->notify_buffer = array();
    }
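    /*
     * Illustrative usage sketch with hypothetical values (the directory,
     * sizes, and url below are made up; $min_or_max should be one of the
     * min/max constants PriorityQueue defines):
     *
     *   $web_queue = new WebQueueBundle(WORK_DIRECTORY."/example_queue",
     *       1000000, NUM_URLS_QUEUE_RAM, $min_or_max);
     *   $url_pairs = array(array("http://www.example.com/", 1.0));
     *   $web_queue->addUrlsQueue($url_pairs);
     *   list($url, $weight, $flag, $probe) = $web_queue->peekQueue(1);
     */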
    /**
     * Adds an array of (url, weight) pairs to the WebQueueBundle.
     *
     * @param array $url_pairs a list of pairs to add
     */
    function addUrlsQueue(&$url_pairs)
    {
        $add_urls = array();
        $count = count($url_pairs);
        if($count < 1) return;
        for($i = 0; $i < $count; $i++) {
            $add_urls[$i][0] = & $url_pairs[$i][0];
        }
        $objects = $this->to_crawl_archive->addObjects("offset", $add_urls);
        for($i = 0; $i < $count; $i++) {
            $url = & $url_pairs[$i][0];
            $weight = $url_pairs[$i][1];
            if(isset($objects[$i]['offset'])) {
                $offset = $objects[$i]['offset'];
                $data = packInt($offset).packInt(0).packInt(self::NO_FLAGS);
                if($this->insertHashTable(crawlHash($url, true), $data)) {
                    /* we will change 0 to the priority queue index in the
                       notify callback
                     */
                    $loc = $this->to_crawl_queue->insert(
                        crawlHash($url, true), $weight);
                } else {
                    crawlLog("Error inserting $url into hash table !!");
                }
            } else {
                crawlLog("Error inserting $url into web archive !!");
            }
        }
        $this->notifyFlush();
        if(isset($offset) && $offset > self::max_url_archive_offset) {
            $this->rebuildUrlTable();
        }
    }
    /**
     * Checks if the url queue already contains the given url
     *
     * @param string $url what to check
     * @return bool whether it is contained in the queue yet or not
     */
    function containsUrlQueue(&$url)
    {
        $hash_url = crawlHash($url, true);
        $lookup_url = $this->lookupHashTable($hash_url);
        return ($lookup_url == false) ? false : true;
    }
    /**
     * Adjusts the weight of the given url in the priority queue by amount
     * $delta
     *
     * In a page importance crawl, a given web page casts its votes on who
     * to crawl next by splitting its crawl money amongst its child links.
     * This requires a mechanism for periodically adjusting the weights of
     * elements in the priority queue. This function is used to solve that
     * problem.
     *
     * @param string $url url whose weight in queue we want to adjust
     * @param float $delta change in weight (usually positive).
     * @param bool $flush whether to flush the notify buffer after the
     *     adjustment
     */
    function adjustQueueWeight(&$url, $delta, $flush = true)
    {
        $hash_url = crawlHash($url, true);
        $data = $this->lookupHashTable($hash_url);
        if($data !== false) {
            $queue_index = unpackInt(substr($data, 4, 4));
            $this->to_crawl_queue->adjustWeight($queue_index, $delta);
            if($flush) {
                $this->notifyFlush();
            }
        } else {
            crawlLog("Can't adjust weight. Not in queue $url");
        }
    }
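    /*
     * Illustrative sketch of how a page importance crawl might use
     * adjustQueueWeight(): a hypothetical fetched page with "crawl money" $w
     * and $k child links (all assumed to already be in the queue) could split
     * its vote evenly, flushing the buffered index changes once at the end:
     *
     *   $share = $w / $k;
     *   foreach($child_urls as $child_url) {
     *       $web_queue->adjustQueueWeight($child_url, $share, false);
     *   }
     *   $web_queue->notifyFlush();
     */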
    /**
     * Sets the flag which provides additional information about the
     * kind of url, for a url already stored in the queue. For instance, it
     * might say if it is a robots.txt url, or if the url has already
     * passed the robots.txt test, or if it has a crawl-delay
     *
     * @param string $url url whose flag in the queue we want to set
     * @param int $flag should be one of self::ROBOT, self::NO_FLAGS,
     *     self::SCHEDULABLE or self::SCHEDULABLE + crawl_delay
     */
    function setQueueFlag(&$url, $flag)
    {
        $hash_url = crawlHash($url, true);
        $both = $this->lookupHashTable($hash_url, HashTable::RETURN_BOTH);
        if($both !== false) {
            list($probe, $data) = $both;
            $non_flag = substr($data, 0, 8);
            $new_data = $non_flag . packInt($flag);
            $this->insertHashTable($hash_url, $new_data, $probe);
        } else {
            crawlLog("Can't set flag. Not in queue $url");
        }
    }
    /**
     * Removes a url from the priority queue.
     *
     * This method would typically be called during a crawl after the given
     * url is scheduled to be crawled. It only deletes the item from
     * the bundle's priority queue and hash table -- not from the web archive.
     *
     * @param string $url the url or hash of url to delete
     * @param bool $isHash flag to say whether or not $url is the hash of a
     *     url
     */
    function removeQueue($url, $isHash = false)
    {
        if($isHash == true) {
            $hash_url = $url;
        } else {
            $hash_url = crawlHash($url, true);
        }
        $both = $this->lookupHashTable($hash_url, HashTable::RETURN_BOTH);
        if(!$both) {
            crawlLog("Not in queue $url");
            return;
        }
        list($probe, $data) = $both;
        $queue_index = unpackInt(substr($data, 4, 4));
        $this->to_crawl_queue->poll($queue_index);
        $this->notifyFlush();
        $this->deleteHashTable($hash_url, $probe);
    }
    /**
     * Gets the url and weight of the ith entry in the priority queue
     *
     * @param int $i entry to look up
     * @param resource $fh a file handle to the WebArchive for urls
     * @return mixed false on error, otherwise the ordered 4-tuple
     *     (url, weight, flag, probe) in an array
     */
    function peekQueue($i = 1, $fh = NULL)
    {
        $tmp = $this->to_crawl_queue->peek($i);
        if(!$tmp) {
            crawlLog("web queue peek error on index $i");
            return false;
        }
        list($hash_url, $weight) = $tmp;
        $both = $this->lookupHashTable($hash_url, HashTable::RETURN_BOTH);
        if($both === false) {
            crawlLog("web queue hash lookup error $hash_url");
            return false;
        }
        list($probe, $data) = $both;
        $offset = unpackInt(substr($data, 0, 4));
        $flag = unpackInt(substr($data, 8, 4));
        $url_obj = $this->to_crawl_archive->getObjects($offset, 1, true, $fh);
        if(isset($url_obj[0][1][0])) {
            $url = $url_obj[0][1][0];
        } else {
            $url = "LOOKUP ERROR";
        }
        return array($url, $weight, $flag, $probe);
    }
    /**
     * Pretty prints the contents of the queue bundle in order
     */
    function printContents()
    {
        $count = $this->to_crawl_queue->count;
        for($i = 1; $i <= $count; $i++) {
            list($url, $weight, $flag, $probe) = $this->peekQueue($i);
            print "$i URL: $url WEIGHT:$weight FLAG: $flag PROBE: $probe\n";
        }
    }
    /**
     * Gets the contents of the queue bundle as an array of ordered
     * (url, weight, flag, probe) 4-tuples
     *
     * @return array a list of ordered url, weight, flag, probe 4-tuples
     */
    function getContents()
    {
        $count = $this->to_crawl_queue->count;
        $contents = array();
        for($i = 1; $i <= $count; $i++) {
            $contents[] = $this->peekQueue($i);
        }
        return $contents;
    }
    /**
     * Makes the weight sum of the to-crawl priority queue sum to $new_total
     *
     * @param int $new_total amount weights should sum to. All weights will be
     *     scaled by the same factor.
     */
    function normalize($new_total = NUM_URLS_QUEUE_RAM)
    {
        $this->to_crawl_queue->normalize();
        // we don't change positions when we normalize
        $this->to_crawl_queue->notify_buffer = array();
    }

    //Filter and Filter Bundle Methods

    /**
     * Opens the url WebArchive associated with this queue bundle in the
     * given read/write mode
     *
     * @param string $mode the read/write mode to open the archive with
     * @return resource a file handle to the WebArchive file
     */
    function openUrlArchive($mode = "r")
    {
        return $this->to_crawl_archive->open($mode);
    }
    /**
     * Closes a file handle to the url WebArchive
     *
     * @param resource $fh a valid handle to the url WebArchive file
     */
    function closeUrlArchive($fh)
    {
        $this->to_crawl_archive->close($fh);
    }
    /**
     * Adds the supplied url to the url_exists_filter_bundle
     *
     * @param string $url url to add
     */
    function addSeenUrlFilter($url)
    {
        $this->url_exists_filter_bundle->add($url);
    }
    /**
     * Removes all url objects from $url_array which have been seen
     *
     * @param array &$url_array objects to check if they have been seen
     * @param array $field_names an array of components of a url_array element
     *     which contain a url to check if seen
     */
    function differenceSeenUrls(&$url_array, $field_names = NULL)
    {
        $this->url_exists_filter_bundle->differenceFilter(
            $url_array, $field_names);
    }
    /**
     * Adds the supplied $host to the got_robottxt_filter
     *
     * @param string $host url to add
     */
    function addGotRobotTxtFilter($host)
    {
        $this->got_robottxt_filter->add($host);
    }
    /**
     * Checks if we have a fresh copy of robots.txt info for $host
     *
     * @param string $host url to check
     * @return bool whether we do or not
     */
    function containsGotRobotTxt($host)
    {
        return $this->got_robottxt_filter->contains($host);
    }
    /**
     * Adds all the paths for a host to the Robots Web Archive.
     *
     * @param string $host host name that the paths are to be added for
     * @param array $paths an array with the two keys
     *     CrawlConstants::ALLOWED_SITES and CrawlConstants::DISALLOWED_SITES;
     *     for each key one has an array of paths
     */
    function addRobotPaths($host, $paths)
    {
        $paths_container = array($paths);
        $objects = $this->robot_archive->addObjects("offset",
            $paths_container);
        if(isset($objects[0]['offset'])) {
            $offset = $objects[0]['offset'];
            $data = packInt($offset);
            $this->robot_table->insert(crawlHash($host, true), $data);
        }
    }
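    /*
     * Illustrative sketch with hypothetical data: the $paths argument to
     * addRobotPaths() is expected to look roughly like
     *
     *   $paths = array(
     *       CrawlConstants::ALLOWED_SITES => array("/public/"),
     *       CrawlConstants::DISALLOWED_SITES => array("/private/",
     *           "/cgi-bin/")
     *   );
     *   $web_queue->addRobotPaths("http://www.example.com/", $paths);
     *
     * checkRobotOkay() below then matches a url's path against these entries
     * using UrlParser::isPathMemberRegexPaths().
     */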
    /**
     * Checks if the given $url is allowed to be crawled based on stored
     * robots.txt info.
     *
     * @param string $url to check
     * @return bool whether it was allowed or not
     */
    function checkRobotOkay($url)
    {
        // local cache of recent robots.txt stuff
        static $robot_cache = array();
        $cache_size = 2000;
        list($host, $path) = UrlParser::getHostAndPath($url, true, true);
        $path = urldecode($path);
        $key = crawlHash($host, true);
        if(isset($robot_cache[$key])) {
            $robot_object = $robot_cache[$key];
        } else {
            $data = $this->robot_table->lookup($key);
            $offset = unpackInt($data);
            $robot_object = $this->robot_archive->getObjects($offset, 1);
            $robot_cache[$key] = $robot_object;
            if(count($robot_cache) > $cache_size) {
                array_shift($robot_cache);
            }
        }
        $robot_paths = (isset($robot_object[0][1])) ?
            $robot_object[0][1] : array();
            //these should have been urldecoded in RobotProcessor
        $robots_okay = true;
        $robots_not_okay = false;
        if(isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
            $robots_not_okay = UrlParser::isPathMemberRegexPaths($path,
                $robot_paths[CrawlConstants::DISALLOWED_SITES]);
            $robots_okay = !$robots_not_okay;
        }
        if(isset($robot_paths[CrawlConstants::ALLOWED_SITES])) {
            $robots_okay = UrlParser::isPathMemberRegexPaths($path,
                $robot_paths[CrawlConstants::ALLOWED_SITES]);
        }
        return $robots_okay || !$robots_not_okay;
    }
    /**
     * Gets the timestamp of the oldest robot data still stored in
     * the queue bundle
     *
     * @return int a Unix timestamp
     */
    function getRobotTxtAge()
    {
        $creation_time = intval(
            file_get_contents($this->dir_name."/robot_timestamp.txt"));
        return (time() - $creation_time);
    }
    /**
     * Adds an entry to the web queue bundle's DNS cache
     *
     * @param string $host hostname to add to the DNS Lookup table
     * @param string $ip_address in presentation format (not as int) to add
     *     to the table
     */
    function addDNSCache($host, $ip_address)
    {
        $pad = "000000000000";
        $hash_host = crawlHash($host, true);
        $packed_ip = inet_pton($ip_address);
        if(strlen($packed_ip) == 4) {
            $packed_ip .= $pad;
        }
        $this->dns_table->insert($hash_host, $packed_ip);
    }
    /**
     * Looks up the ip address of $host in the web queue bundle's DNS cache
     *
     * @param string $host hostname to look up in the DNS Lookup table
     * @return mixed the ip address in presentation format if found,
     *     false otherwise
     */
    function dnsLookup($host)
    {
        $pad = "000000000000";
        $hash_host = crawlHash($host, true);
        $packed_ip = $this->dns_table->lookup($hash_host);
        if(!$packed_ip) return false;
        $maybe_pad = substr($packed_ip, 4);
        $maybe_ip4 = substr($packed_ip, 0, 4);
        if(strcmp($maybe_pad, $pad) == 0) {
            $ip_address = inet_ntop($maybe_ip4);
        } else {
            $ip_address = inet_ntop($packed_ip);
        }
        if(strcmp($ip_address, "0.0.0.0") == 0) {
            return false;
        }
        return $ip_address;
    }
    /**
     * Gets the timestamp of the oldest url filter data still stored in
     * the queue bundle
     *
     * @return int a Unix timestamp
     */
    function getUrlFilterAge()
    {
        $creation_time = intval(
            file_get_contents($this->dir_name."/url_timestamp.txt"));
        return (time() - $creation_time);
    }
    /**
     * Sets the Crawl-delay of $host to the passed $value in seconds
     *
     * @param string $host a host to set the Crawl-delay for
     * @param int $value a delay in seconds up to 255
     */
    function setCrawlDelay($host, $value)
    {
        $this->crawl_delay_filter->add("-1".$host);
            //used to say a crawl delay has been set
        for($i = 0; $i < 8; $i++) {
            if(($value & 1) == 1) {
                $this->crawl_delay_filter->add("$i".$host);
            }
            $value = $value >> 1;
        }
    }
    /**
     * Gets the Crawl-delay of $host from the crawl delay bloom filter
     *
     * @param string $host site to check for a Crawl-delay
     * @return int the crawl-delay in seconds or -1 if $host has no delay
     */
    function getCrawlDelay($host)
    {
        if(!$this->crawl_delay_filter->contains("-1".$host)) {
            return -1;
        }
        $value = 0;
        for($i = 0; $i < 8; $i++) {
            if($this->crawl_delay_filter->contains("$i".$host)) {
                $value += (1 << $i);
            }
        }
        return $value;
    }
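    /*
     * Illustrative sketch of the crawl delay encoding used by the two methods
     * above, for a hypothetical host: a delay of 20 seconds (binary 10100)
     * for "http://www.example.com" is recorded by adding
     * "-1http://www.example.com" (a delay exists) together with
     * "2http://www.example.com" and "4http://www.example.com" (bits 2 and 4)
     * to the crawl_delay_filter; getCrawlDelay() then reassembles
     * (1 << 2) + (1 << 4) = 20.
     */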
    /**
     * Mainly, a Factory style wrapper around the HashTable's constructor.
     * However, this function also sets up a rebuild frequency. It is used
     * as part of the process of keeping the to-crawl table from having too
     * many entries
     *
     * @param string $name filename to store the hash table persistently
     * @param int $num_values size of the HashTable's array
     * @return object the newly built hash table
     * @see rebuildHashTable()
     */
    function constructHashTable($name, $num_values)
    {
        $this->hash_rebuild_count = 0;
        $this->max_hash_ops_before_rebuild = floor($num_values/4);
        return new HashTable($name, $num_values,
            self::HASH_KEY_SIZE, self::HASH_VALUE_SIZE);
    }
    /**
     * Looks up $key in the to-crawl hash table
     *
     * @param string $key the thing to look up
     * @param int $return_probe_value one of HashTable::ALWAYS_RETURN_PROBE,
     *     HashTable::RETURN_PROBE_ON_KEY_FOUND, HashTable::RETURN_VALUE, or
     *     HashTable::RETURN_BOTH. Here value means the value associated with
     *     the key and probe is either the location in the array where the
     *     key was found or the first location in the array where it was
     *     determined the key could not be found.
     * @return mixed would be string if the value is being returned,
     *     otherwise, false if the key is not found
     */
    function lookupHashTable($key, $return_probe_value =
        HashTable::RETURN_VALUE)
    {
        return $this->to_crawl_table->lookup($key, $return_probe_value);
    }
    /**
     * Removes an entry from the to-crawl hash table
     *
     * @param string $key usually a hash of a url
     * @param int $probe if the location in the hash table is already known
     *     to be $probe then this variable can be used to save a lookup
     */
    function deleteHashTable($key, $probe = false)
    {
        $this->to_crawl_table->delete($key, $probe);
        $this->hash_rebuild_count++;
        if($this->hash_rebuild_count > $this->max_hash_ops_before_rebuild) {
            $this->rebuildHashTable();
        }
    }
    /**
     * Inserts the $key, $value pair into this web queue's to-crawl table
     *
     * @param string $key intended to be a hash of a url
     * @param string $value intended to be an offset into a webarchive for
     *     urls together with an index into the priority queue
     * @param int $probe if the location in the hash table is already known
     *     to be $probe then this variable can be used to save a lookup
     * @return bool whether the insert was a success or not
     */
    function insertHashTable($key, $value, $probe = false)
    {
        $this->hash_rebuild_count++;
        if($this->hash_rebuild_count > $this->max_hash_ops_before_rebuild) {
            $this->rebuildHashTable();
        }
        return $this->to_crawl_table->insert($key, $value, $probe);
    }
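    /*
     * Illustrative sketch, with hypothetical numbers, of the rebuild
     * bookkeeping used by the two methods above: constructHashTable($name,
     * 8000) sets $max_hash_ops_before_rebuild to floor(8000/4) = 2000, so
     * after 2000 inserts and deletes rebuildHashTable() below copies the
     * surviving entries into a fresh table, clearing out slots left behind
     * by deletions.
     */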
    /**
     * Makes a new HashTable without deleted rows
     *
     * The hash table in Yioop is implemented using open addressing. i.e.,
     * we store the key value pair in the table itself and if there is a
     * collision we look for the next available slot. Two codes are used to
     * indicate space available in the table: one to indicate empty and never
     * used, the other to indicate empty but previously used and deleted. The
     * reason you need two codes is to ensure that if somebody inserted an
     * item B which hashes to the same value as A and we moved to the next
     * empty slot to store B, then if we delete A we should still be able to
     * look up B. The problem is that as the table gets reused a lot, it tends
     * to fill up with a lot of deleted entries, making lookup times more and
     * more linear in the hash table size. By rebuilding the table we mitigate
     * this problem. By choosing the rebuild frequency appropriately,
     * the amortized cost of this operation is only O(1).
     */
    function rebuildHashTable()
    {
        crawlLog("Rebuilding Hash table");
        $num_values = $this->to_crawl_table->num_values;
        $tmp_table = $this->constructHashTable(
            $this->dir_name."/tmp_table.dat", $num_values);
        $null = $this->to_crawl_table->null;
        $deleted = $this->to_crawl_table->deleted;
        for($i = 0; $i < $num_values; $i++) {
            crawlTimeoutLog("..still rebuilding hash table. At entry %s of %s",
                $i, $num_values);
            list($key, $value) = $this->to_crawl_table->getEntry($i);
            if(strcmp($key, $null) != 0 && strcmp($key, $deleted) != 0) {
                $tmp_table->insert($key, $value);
            }
        }
        $this->to_crawl_table = NULL;
        gc_collect_cycles();
        if(file_exists($this->dir_name."/hash_table.dat")) {
            unlink($this->dir_name."/hash_table.dat");
            if(file_exists($this->dir_name."/tmp_table.dat")) {
                rename($this->dir_name."/tmp_table.dat",
                    $this->dir_name."/hash_table.dat");
            }
        }
        $tmp_table->filename = $this->dir_name."/hash_table.dat";
        $this->to_crawl_table = $tmp_table;
    }
    /**
     * Since offsets are integers, even if the queue is kept relatively small,
     * periodically we will need to rebuild the archive for storing urls.
     */
    function rebuildUrlTable()
    {
        crawlLog("Rebuilding URL table");
        $dir_name = $this->dir_name;
        $count = $this->to_crawl_queue->count;
        $tmp_archive_name = $dir_name."/tmp_archive" .
            NonCompressor::fileExtension();
        $url_archive_name = $dir_name."/url_archive" .
            NonCompressor::fileExtension();
        $tmp_archive = new WebArchive($tmp_archive_name,
            new NonCompressor(), false, true);
        for($i = 1; $i <= $count; $i++) {
            list($url, $weight, $flag, $probe) = $this->peekQueue($i);
            $url_container = array(array($url));
            $objects = $tmp_archive->addObjects("offset", $url_container);
            if(isset($objects[0]['offset'])) {
                $offset = $objects[0]['offset'];
            } else {
                crawlLog("Error inserting $url into rebuild url archive file");
                continue;
            }
            $hash_url = crawlHash($url, true);
            $data = packInt($offset).packInt($i).packInt($flag);
            $this->insertHashTable($hash_url, $data, $probe);
        }
        $this->to_crawl_archive = NULL;
        gc_collect_cycles();
        $tmp_archive->filename = $url_archive_name;
        $this->to_crawl_archive = $tmp_archive;
    }
    /**
     * Delete the Bloom filters used to store robots.txt file info.
     * Then construct empty new ones.
     * This is called roughly once a day so that robots files will be
     * reloaded and the policies used won't be too old.
     */
    function emptyRobotData()
    {
        $robot_archive_name = $this->dir_name."/robot_archive".
            NonCompressor::fileExtension();
        @unlink($this->dir_name."/got_robottxt.ftr");
        @unlink($this->dir_name."/crawl_delay.ftr");
        @unlink($this->dir_name."/robot.dat");
        @unlink($robot_archive_name);
        $this->crawl_delay_table = array();
        file_put_contents($this->dir_name."/robot_timestamp.txt", time());
        $this->got_robottxt_filter = NULL;
        $this->crawl_delay_filter = NULL;
        $this->robot_archive = NULL;
        $this->robot_table = NULL;
        gc_collect_cycles();
        $this->got_robottxt_filter = new BloomFilterFile(
            $this->dir_name."/got_robottxt.ftr", $this->filter_size);
        $this->crawl_delay_filter = new BloomFilterFile(
            $this->dir_name."/crawl_delay.ftr", $this->filter_size);
        $this->robot_archive = new WebArchive(
            $robot_archive_name, new NonCompressor(), false, true);
        $this->robot_table = new HashTable($this->dir_name."/robot.dat",
            8 * $this->num_urls_ram, self::HASH_KEY_SIZE,
            self::HASH_VALUE_SIZE);
    }
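    /*
     * Illustrative sketch (the one-day threshold is hypothetical): a queue
     * server might trigger the daily refresh described above with something
     * like
     *
     *   if($web_queue->getRobotTxtAge() > 86400) {
     *       $web_queue->emptyRobotData();
     *       $web_queue->emptyDNSCache();
     *   }
     */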
    /**
     * Delete the Hash table used to store DNS lookup info.
     * Then construct an empty new one.
     * This is called roughly once a day at the same time as
     * @see emptyRobotData()
     */
    function emptyDNSCache()
    {
        $num_values = $this->dns_table->num_values;
        unlink($this->dir_name."/dns_table.dat");
        $this->dns_table = NULL;
        gc_collect_cycles();
        $this->dns_table = new HashTable($this->dir_name."/dns_table.dat",
            $num_values, self::HASH_KEY_SIZE, self::IP_SIZE);
    }
    /**
     * Empty the crawled url filter for this web queue bundle; resets the
     * timestamp of the last time this filter was emptied.
     */
    function emptyUrlFilter()
    {
        file_put_contents($this->dir_name."/url_timestamp.txt", time());
        $this->url_exists_filter_bundle->reset();
    }
    /**
     * Callback which is called when an item in the priority queue changes
     * position. The position is updated in the hash table.
     * The priority queue stores (hash of url, weight). The hash table
     * stores (hash of url, web_archive offset to url, index in priority
     * queue). This method actually buffers changes in $this->notify_buffer;
     * they are applied when notifyFlush is called or when the buffer reaches
     * self::NOTIFY_BUFFER_SIZE.
     *
     * @param int $index new index in the priority queue
     * @param array $data (hash url, weight)
     */
    function notify($index, $data)
    {
        $this->notify_buffer[$data[0]] = $index;
        if(count($this->notify_buffer) > self::NOTIFY_BUFFER_SIZE) {
            $this->notifyFlush(); //prevent memory from being exhausted
        }
    }
    /**
     * Used to flush changes of hash_url indexes caused by adjusting weights
     * in the bundle's priority queue to its hash table.
     */
    function notifyFlush()
    {
        foreach($this->notify_buffer as $hash_url => $index) {
            $both = $this->lookupHashTable($hash_url, HashTable::RETURN_BOTH);
            if($both !== false) {
                list($probe, $value) = $both;
                $packed_offset = substr($value, 0, 4);
                $packed_flag = substr($value, 8, 4);
                $new_data = $packed_offset.packInt($index).$packed_flag;
                $this->insertHashTable($hash_url, $new_data, $probe);
            } else {
                crawlLog("NOTIFY LOOKUP FAILED. INDEX WAS $index. DATA WAS ".
                    bin2hex($hash_url));
            }
        }
        $this->notify_buffer = array();
    }
}
?>