<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2012 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage model
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2012
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/** For crawlHash function */
require_once BASE_DIR."/lib/utility.php";
/**
 * Loads common constants for web crawling, used for index_data_base_name and
 * schedule_data_base_name
 */
require_once BASE_DIR."/lib/crawl_constants.php";
/**
 * Crawl data is stored in an IndexArchiveBundle,
 * so load the definition of this class
 */
require_once BASE_DIR."/lib/index_archive_bundle.php";
/**
 * Needed to be able to send data via http to remote queue_servers
 */
require_once BASE_DIR.'/lib/fetch_url.php';

/** Used to prevent cache page requests from being logged */
define("NO_LOGGING", true);

/**
 * This class is used to handle
 * db results for a given phrase search
 *
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage model
 */
class CrawlModel extends Model implements CrawlConstants
{
    /**
     * Stores the name of the current index archive to use to get search
     * results from
     * @var string
     */
    var $index_name;

    /**
     * {@inheritdoc}
     */
    function __construct($db_name = DB_NAME)
    {
        parent::__construct($db_name);
    }

    /**
     * Gets a summary of a document by the generation it is in
     * and its offset into the corresponding WebArchive.
     *
     * @param int $summary_offset offset in $generation WebArchive
     * @param int $generation the index of the WebArchive in the
     *      IndexArchiveBundle to find the item in
     * @return array summary data of the matching document
     */
    function getCrawlItem($summary_offset, $generation)
    {
        $index_archive_name = self::index_data_base_name . $this->index_name;
        $index_archive =
            new IndexArchiveBundle(CRAWL_DIR.'/cache/'.$index_archive_name);
        $summary = $index_archive->getPage($summary_offset, $generation);
        return $summary;
    }
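    /* Usage sketch (not part of the original file; the timestamp, offset,
     * and generation below are hypothetical): a controller that has looked
     * up a document's generation and offset could fetch its summary so:
     *
     *   $crawl_model = new CrawlModel();
     *   $crawl_model->index_name = "1325135267"; // hypothetical crawl time
     *   $summary = $crawl_model->getCrawlItem(42, 0);
     */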
    /**
     * Gets the cached version of a web page from the machine on which it was
     * fetched.
     *
     * Complete cached versions of web pages typically only live on a fetcher
     * machine. The queue server machine typically only maintains summaries.
     * This method makes a REST request of a fetcher machine for a cached page
     * and gets the results back.
     *
     * @param string $machine the ip address or domain name of the machine the
     *      cached page lives on
     * @param string $machine_uri the path from document root on $machine
     *      where the yioop scripts live
     * @param int $partition the partition in the WebArchiveBundle the page is
     *      in
     * @param int $offset the offset in bytes into the WebArchive partition in
     *      the WebArchiveBundle at which the cached page lives
     * @param string $crawl_time the timestamp of the crawl the cached page is
     *      from
     * @param int $instance_num which fetcher instance for the particular
     *      fetcher crawled the page (if more than one), false otherwise
     * @return array page data of the cached page
     */
    function getCacheFile($machine, $machine_uri, $partition,
        $offset, $crawl_time, $instance_num = false)
    {
        $time = time();
        $session = md5($time . AUTH_KEY);
        if($machine == '::1') { //IPv6 :(
            $machine = "[::1]/";
            //used if the fetching and queue serving were on the same machine
        }
        $request = "http://$machine$machine_uri?c=archive&a=cache&time=$time".
            "&session=$session&partition=$partition&offset=$offset".
            "&crawl_time=$crawl_time";
        if($instance_num !== false) {
            $request .= "&instance_num=$instance_num";
        }
        $tmp = FetchUrl::getPage($request);
        $page = @unserialize(base64_decode($tmp));
        $page['REQUEST'] = $request;
        return $page;
    }

    /**
     * Gets the name (aka timestamp) of the current index archive to be used
     * to handle search queries
     *
     * @return string the timestamp of the archive
     */
    function getCurrentIndexDatabaseName()
    {
        $this->db->selectDB(DB_NAME);
        $sql = "SELECT CRAWL_TIME FROM CURRENT_WEB_INDEX";
        $result = $this->db->execute($sql);
        $row = $this->db->fetchArray($result);
        return $row['CRAWL_TIME'];
    }

    /**
     * Sets the IndexArchive that will be used for search results
     *
     * @param $timestamp the timestamp of the index archive. The timestamp is
     *      when the crawl was started. Currently, the timestamp appears as a
     *      substring of the index archive's directory name
     */
    function setCurrentIndexDatabaseName($timestamp)
    {
        $this->db->selectDB(DB_NAME);
        $this->db->execute("DELETE FROM CURRENT_WEB_INDEX");
        $sql = "INSERT INTO CURRENT_WEB_INDEX VALUES ('".$timestamp."')";
        $this->db->execute($sql);
    }
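    /* Illustration of the request getCacheFile() builds (hypothetical
     * values): with $machine = "192.168.1.5", $machine_uri = "/yioop/",
     * $partition = 3, $offset = 1048576, $crawl_time = "1325135267", the
     * GET request issued is roughly:
     *
     *   http://192.168.1.5/yioop/?c=archive&a=cache&time=<now>
     *       &session=<md5 of <now>.AUTH_KEY>&partition=3&offset=1048576
     *       &crawl_time=1325135267
     */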
    /**
     * Returns all the files in $dir or its subdirectories with modified
     * times more recent than $timestamp. Files which have in their path or
     * name a string in the $excludes array will be excluded.
     *
     * @param string $dir a directory to traverse
     * @param int $timestamp used to check modified times against
     * @param array $excludes an array of path substrings to exclude
     * @return array of file structs consisting of name, modified time and
     *      size
     */
    function getDeltaFileInfo($dir, $timestamp = 0, $excludes = array())
    {
        $dir_path_len = strlen($dir) + 1;
        $files = $this->db->fileInfoRecursive($dir, true);
        $names = array();
        $results = array();
        foreach ($files as $file) {
            $file["name"] = substr($file["name"], $dir_path_len);
            if($file["modified"] > $timestamp && $file["name"] != "") {
                $flag = true;
                foreach($excludes as $exclude) {
                    if(stristr($file["name"], $exclude)) {
                        $flag = false;
                        break;
                    }
                }
                if($flag) {
                    $results[$file["name"]] = $file;
                }
            }
        }
        $results = array_values($results);
        return $results;
    }

    /**
     * Gets a list of all mixes of available crawls
     *
     * @param bool $components if false then don't load the factors
     *      that make up the crawl mix, just load the names of the mixes
     *      and their timestamps; otherwise, if true, load everything
     * @return array list of available crawl mixes
     */
    function getMixList($components = false)
    {
        $this->db->selectDB(DB_NAME);
        $sql = "SELECT MIX_TIMESTAMP, MIX_NAME FROM CRAWL_MIXES";
        $result = $this->db->execute($sql);
        $rows = array();
        while($row = $this->db->fetchArray($result)) {
            if($components) {
                $mix = $this->getCrawlMix($row['MIX_TIMESTAMP'], true);
                $row['GROUPS'] = $mix['GROUPS'];
            }
            $rows[] = $row;
        }
        return $rows;
    }

    /**
     * Retrieves the weighting component of the requested crawl mix
     *
     * @param string $timestamp of the requested crawl mix
     * @param bool $just_components says whether to find the mix name or
     *      just the components array
     * @return array the crawls and their weights that make up the
     *      requested crawl mix
     */
    function getCrawlMix($timestamp, $just_components = false)
    {
        $this->db->selectDB(DB_NAME);
        if(!$just_components) {
            $sql = "SELECT MIX_TIMESTAMP, MIX_NAME FROM CRAWL_MIXES WHERE ".
                " MIX_TIMESTAMP='$timestamp'";
            $result = $this->db->execute($sql);
            $mix = $this->db->fetchArray($result);
        } else {
            $mix = array();
        }
        $sql = "SELECT GROUP_ID, RESULT_BOUND".
            " FROM MIX_GROUPS WHERE ".
            " MIX_TIMESTAMP='$timestamp'";
        $result = $this->db->execute($sql);
        $mix['GROUPS'] = array();
        while($row = $this->db->fetchArray($result)) {
            $mix['GROUPS'][$row['GROUP_ID']]['RESULT_BOUND'] =
                $row['RESULT_BOUND'];
        }
        foreach($mix['GROUPS'] as $group_id => $data) {
            $sql = "SELECT CRAWL_TIMESTAMP, WEIGHT, KEYWORDS ".
                " FROM MIX_COMPONENTS WHERE ".
                " MIX_TIMESTAMP='$timestamp' AND GROUP_ID='$group_id'";
            $result = $this->db->execute($sql);
            $mix['GROUPS'][$group_id]['COMPONENTS'] = array();
            $count = 0;
            while($row = $this->db->fetchArray($result)) {
                $mix['GROUPS'][$group_id]['COMPONENTS'][$count] = $row;
                $count++;
            }
        }
        return $mix;
    }

    /**
     * Returns the timestamp associated with a mix name
     *
     * @param string $mix_name name to look up
     * @return mixed timestamp associated with the name if it exists; false
     *      otherwise
     */
    function getCrawlMixTimestamp($mix_name)
    {
        $this->db->selectDB(DB_NAME);
        $sql = "SELECT MIX_TIMESTAMP, MIX_NAME FROM CRAWL_MIXES WHERE ".
            " MIX_NAME='$mix_name'";
        $result = $this->db->execute($sql);
        $mix = $this->db->fetchArray($result);
        if(isset($mix["MIX_TIMESTAMP"])) {
            return $mix["MIX_TIMESTAMP"];
        }
        return false;
    }
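    /* Sketch of the array shape getCrawlMix() returns, inferred from the
     * queries above (timestamps, weights, and keywords are hypothetical):
     *
     *   array(
     *       'MIX_TIMESTAMP' => '1325135267',
     *       'MIX_NAME' => 'news-mix',
     *       'GROUPS' => array(
     *           0 => array(
     *               'RESULT_BOUND' => 10,
     *               'COMPONENTS' => array(
     *                   0 => array('CRAWL_TIMESTAMP' => '1325000000',
     *                       'WEIGHT' => 0.75, 'KEYWORDS' => 'media:news')
     *               )
     *           )
     *       )
     *   )
     */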
" MIX_TIMESTAMP='$timestamp'"; $result = $this->db->execute($sql); if($result) { if($mix = $this->db->fetchArray($result)) { return true; } else { return false; } } } /** * Stores in DB the supplied crawl mix object * * @param array $mix an associative array repreenting the crawl mix object */ function setCrawlMix($mix) { $this->db->selectDB(DB_NAME); //although maybe slower, we first get rid of any old data $timestamp = $mix['MIX_TIMESTAMP']; $this->deleteCrawlMix($timestamp); //next we store the new data $sql = "INSERT INTO CRAWL_MIXES VALUES ('$timestamp', '". $mix['MIX_NAME']."')"; $this->db->execute($sql); $gid = 0; foreach($mix['GROUPS'] as $group_id => $group_data) { $sql = "INSERT INTO MIX_GROUPS VALUES ('$timestamp', '$gid', ". "'".$group_data['RESULT_BOUND']."')"; $this->db->execute($sql); foreach($group_data['COMPONENTS'] as $component) { $sql = "INSERT INTO MIX_COMPONENTS VALUES ('$timestamp', '". $gid."', '".$component['CRAWL_TIMESTAMP']."', '". $component['WEIGHT']."', '" . $component['KEYWORDS']."')"; $this->db->execute($sql); } $gid++; } } /** * Stores in DB the supplied crawl mix object * * @param array $mix an associative array repreenting the crawl mix object */ function deleteCrawlMix($timestamp) { $this->db->selectDB(DB_NAME); $sql = "DELETE FROM CRAWL_MIXES WHERE MIX_TIMESTAMP='$timestamp'"; $this->db->execute($sql); $sql = "DELETE FROM MIX_GROUPS WHERE MIX_TIMESTAMP='$timestamp'"; $this->db->execute($sql); $sql = "DELETE FROM MIX_COMPONENTS WHERE MIX_TIMESTAMP='$timestamp'"; $this->db->execute($sql); } /** * Returns the initial sites that a new crawl will start with along with * crawl parameters such as crawl order, allowed and disallowed crawl sites * @param bool $use_default whether or not to use the Yioop! default * crawl.ini file rather than the one created by the user. * @return array the first sites to crawl during the next crawl * restrict_by_url, allowed, disallowed_sites */ function getSeedInfo($use_default = false) { if(file_exists(WORK_DIRECTORY."/crawl.ini") && !$use_default) { $info = parse_ini_file (WORK_DIRECTORY."/crawl.ini", true); } else { $info = parse_ini_file (BASE_DIR."/configs/default_crawl.ini", true); } return $info; } /** * Writes a crawl.ini file with the provided data to the user's * WORK_DIRECTORY * * @param array $info an array containing information about the crawl * such as crawl_order, whether restricted_by_url, seed_sites, * allowed_sites and disallowed_sites */ function setSeedInfo($info) { if(!isset($info['general']['crawl_index'])) { $info['general']['crawl_index']='12345678'; } $n = array(); $n[] = <<<EOT ; ***** BEGIN LICENSE BLOCK ***** ; SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer ; Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org ; ; This program is free software: you can redistribute it and/or modify ; it under the terms of the GNU General Public License as published by ; the Free Software Foundation, either version 3 of the License, or ; (at your option) any later version. ; ; This program is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; GNU General Public License for more details. ; ; You should have received a copy of the GNU General Public License ; along with this program. If not, see <http://www.gnu.org/licenses/>. 
    /**
     * Writes a crawl.ini file with the provided data to the user's
     * WORK_DIRECTORY
     *
     * @param array $info an array containing information about the crawl
     *      such as crawl_order, whether restricted_by_url, seed_sites,
     *      allowed_sites and disallowed_sites
     */
    function setSeedInfo($info)
    {
        if(!isset($info['general']['crawl_index'])) {
            $info['general']['crawl_index'] = '12345678';
        }
        $n = array();
        $n[] = <<<EOT
; ***** BEGIN LICENSE BLOCK *****
; SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
; Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
; ***** END LICENSE BLOCK *****
;
; crawl.ini
;
; crawl configuration file
;
EOT;
        if(!isset($info['general']['page_range_request'])) {
            $info['general']['page_range_request'] = PAGE_RANGE_REQUEST;
        }
        if(!isset($info['general']['page_recrawl_frequency'])) {
            $info['general']['page_recrawl_frequency'] =
                PAGE_RECRAWL_FREQUENCY;
        }
        $n[] = '[general]';
        $n[] = "crawl_order = '".$info['general']['crawl_order']."';";
        $n[] = "crawl_type = '".$info['general']['crawl_type']."';";
        $n[] = "crawl_index = '".$info['general']['crawl_index']."';";
        $n[] = "page_recrawl_frequency = '".
            $info['general']['page_recrawl_frequency']."';";
        $n[] = "page_range_request = '".
            $info['general']['page_range_request']."';";
        $bool_string =
            ($info['general']['restrict_sites_by_url']) ? "true" : "false";
        $n[] = "restrict_sites_by_url = $bool_string;";
        $n[] = "";
        $n[] = "[indexed_file_types]";
        if(isset($info["indexed_file_types"]['extensions'])) {
            foreach($info["indexed_file_types"]['extensions'] as $extension) {
                $n[] = "extensions[] = '$extension';";
            }
        }
        $n[] = "";
        $site_types = array('allowed_sites', 'disallowed_sites',
            'seed_sites');
        foreach($site_types as $type) {
            $n[] = "[$type]";
            foreach($info[$type]['url'] as $url) {
                $n[] = "url[] = '$url';";
            }
            $n[] = "";
        }
        $n[] = "[meta_words]";
        if(isset($info["meta_words"])) {
            foreach($info["meta_words"] as $word_pattern => $url_pattern) {
                $n[] = "$word_pattern = '$url_pattern';";
            }
            $n[] = "";
        }
        $n[] = "[indexing_plugins]";
        if(isset($info["indexing_plugins"]['plugins'])) {
            foreach($info["indexing_plugins"]['plugins'] as $plugin) {
                $n[] = "plugins[] = '$plugin';";
            }
        }
        $out = implode("\n", $n);
        file_put_contents(WORK_DIRECTORY."/crawl.ini", $out);
    }
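    /* Example of the kind of crawl.ini body setSeedInfo() writes (values
     * hypothetical; the license-block header emitted above is omitted):
     *
     *   [general]
     *   crawl_order = 'PageImportance';
     *   crawl_type = 'WebCrawl';
     *   restrict_sites_by_url = false;
     *
     *   [seed_sites]
     *   url[] = 'http://www.example.com/';
     */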
    /**
     * Returns the crawl parameters that were used during a given crawl
     *
     * @param string $timestamp timestamp of the crawl to load the crawl
     *      parameters of
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return array the first sites to crawl during the next crawl,
     *      restrict_by_url, allowed, disallowed_sites
     */
    function getCrawlSeedInfo($timestamp, $machine_urls = NULL)
    {
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            /* seed info should be the same amongst all queue_servers that
               have it -- only the start schedule differs. However, not all
               queue_servers necessarily have the same crawls. Thus, we still
               query all machines in case only one has it.
             */
            $a_list = $this->execMachines("getCrawlSeedInfo",
                $machine_urls, serialize($timestamp));
            if(is_array($a_list)) {
                foreach($a_list as $elt) {
                    $seed_info = unserialize(webdecode(
                        $elt[self::PAGE]));
                    if(isset($seed_info['general'])) {
                        break;
                    }
                }
            }
            return $seed_info;
        }
        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
        $seed_info = NULL;
        if(file_exists($dir)) {
            $info = IndexArchiveBundle::getArchiveInfo($dir);
            $index_info = unserialize($info['DESCRIPTION']);
            $seed_info['general']["restrict_sites_by_url"] =
                $index_info[self::RESTRICT_SITES_BY_URL];
            $seed_info['general']["crawl_type"] =
                (isset($index_info[self::CRAWL_TYPE])) ?
                $index_info[self::CRAWL_TYPE] : self::WEB_CRAWL;
            $seed_info['general']["crawl_index"] =
                (isset($index_info[self::CRAWL_INDEX])) ?
                $index_info[self::CRAWL_INDEX] : '';
            $seed_info['general']["crawl_order"] =
                $index_info[self::CRAWL_ORDER];
            $site_types = array(
                "allowed_sites" => self::ALLOWED_SITES,
                "disallowed_sites" => self::DISALLOWED_SITES,
                "seed_sites" => self::TO_CRAWL
            );
            foreach($site_types as $type => $code) {
                if(isset($index_info[$code])) {
                    $tmp = & $index_info[$code];
                } else {
                    $tmp = array();
                }
                $seed_info[$type]['url'] = $tmp;
            }
            $seed_info['meta_words'] = array();
            if(isset($index_info[self::META_WORDS])) {
                $seed_info['meta_words'] = $index_info[self::META_WORDS];
            }
            if(isset($index_info[self::INDEXING_PLUGINS])) {
                $seed_info['indexing_plugins']['plugins'] =
                    $index_info[self::INDEXING_PLUGINS];
            }
        }
        return $seed_info;
    }

    /**
     * Changes the crawl parameters of an existing crawl
     *
     * @param string $timestamp timestamp of the crawl to change
     * @param array $new_info the new parameters
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    function setCrawlSeedInfo($timestamp, $new_info, $machine_urls = NULL)
    {
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $params = array($timestamp, $new_info);
            $this->execMachines("setCrawlSeedInfo",
                $machine_urls, serialize($params));
        }
        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
        if(file_exists($dir)) {
            $info = IndexArchiveBundle::getArchiveInfo($dir);
            $index_info = unserialize($info['DESCRIPTION']);
            if(isset($new_info['general']["restrict_sites_by_url"])) {
                $index_info[self::RESTRICT_SITES_BY_URL] =
                    $new_info['general']["restrict_sites_by_url"];
            }
            $updatable_site_info = array(
                "allowed_sites" => self::ALLOWED_SITES,
                "disallowed_sites" => self::DISALLOWED_SITES
            );
            foreach($updatable_site_info as $type => $code) {
                if(isset($new_info[$type])) {
                    $index_info[$code] = $new_info[$type]['url'];
                }
            }
            if(isset($new_info['meta_words'])) {
                $index_info[self::META_WORDS] = $new_info['meta_words'];
            }
            if(isset($new_info['indexing_plugins'])) {
                $index_info[self::INDEXING_PLUGINS] =
                    $new_info['indexing_plugins']['plugins'];
            }
            $info['DESCRIPTION'] = serialize($index_info);
            IndexArchiveBundle::setArchiveInfo($dir, $info);
        }
    }
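    /* Usage sketch (hypothetical site and timestamp): tightening the
     * disallowed list of an existing crawl without touching its other
     * parameters:
     *
     *   $new_info = array('disallowed_sites' =>
     *       array('url' => array('http://spam.example.com/')));
     *   $crawl_model->setCrawlSeedInfo("1325135267", $new_info);
     */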
" MIX_TIMESTAMP='$timestamp'"; $result = $this->db->execute($sql); $mix = $this->db->fetchArray($result); $info['TIMESTAMP'] = $timestamp; $info['DESCRIPTION'] = $mix['MIX_NAME']; $info['IS_MIX'] = true; } else { if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) { $cache_file = CRAWL_DIR."/cache/Network".$timestamp.".txt"; if(file_exists($cache_file) && filemtime($cache_file) + 300 > time() ) { return unserialize(file_get_contents($cache_file)); } $info_lists = $this->execMachines("getInfoTimestamp", $machine_urls, serialize($timestamp)); $info = array(); $info['DESCRIPTION'] = ""; $info["COUNT"] = 0; $info['VISITED_URLS_COUNT'] = 0; foreach($info_lists as $info_list) { $a_info = unserialize(webdecode( $info_list[self::PAGE])); if(isset($a_info['DESCRIPTION'])) { $info['DESCRIPTION'] = $a_info['DESCRIPTION']; } if(isset($a_info['VISITED_URLS_COUNT'])) { $info['VISITED_URLS_COUNT'] += $a_info['VISITED_URLS_COUNT']; } if(isset($a_info['COUNT'])) { $info['COUNT'] += $a_info['COUNT']; } } file_put_contents($cache_file, serialize($info)); return $info; } $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp; if(file_exists($dir)) { $info = IndexArchiveBundle::getArchiveInfo($dir); $tmp = unserialize($info['DESCRIPTION']); $info['DESCRIPTION'] = $tmp['DESCRIPTION']; } } return $info; } /** * Deletes the crawl with the supplied timestamp if it exists. Also * deletes any crawl mixes making use of this crawl * * @param string $timestamp a Unix timestamp * @param array $machine_urls an array of urls of yioop queue servers */ function deleteCrawl($timestamp, $machine_urls) { if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) { //get rid of cache info on Name machine $mask = CRAWL_DIR."/cache/NetworkCrawlList*.txt"; array_map( "unlink", glob( $mask ) ); @unlink(CRAWL_DIR."/cache/Network$timestamp.txt"); //now get rid of files on queue_servers $this->execMachines("deleteCrawl", $machine_urls, serialize($timestamp)); return; } $this->db->unlinkRecursive( CRAWL_DIR.'/cache/'.self::index_data_base_name . $timestamp, true); $this->db->unlinkRecursive( CRAWL_DIR.'/schedules/'.self::index_data_base_name . $timestamp, true); $this->db->unlinkRecursive( CRAWL_DIR.'/schedules/' . self::schedule_data_base_name.$timestamp, true); $this->db->unlinkRecursive( CRAWL_DIR.'/schedules/'.self::robot_data_base_name. $timestamp, true); $this->db->selectDB(DB_NAME); $sql = "SELECT DISTINCT MIX_TIMESTAMP FROM MIX_COMPONENTS WHERE ". 
" CRAWL_TIMESTAMP='$timestamp'"; $result = $this->db->execute($sql); $rows = array(); while($rows[] = $this->db->fetchArray($result)) ; foreach($rows as $row) { $this->deleteCrawlMix($row['MIX_TIMESTAMP']); } $current_timestamp = $this->getCurrentIndexDatabaseName(); if($current_timestamp == $timestamp) { $this->db->execute("DELETE FROM CURRENT_WEB_INDEX"); } } /** * Used to send a message to the queue_servers to start a crawl * * @param array $machine_urls an array of urls of yioop queue servers */ function sendStartCrawlMessage($crawl_params, $seed_info = NULL, $machine_urls = NULL) { if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) { $params = array($crawl_params, $seed_info); $this->execMachines("sendStartCrawlMessage", $machine_urls, serialize($params)); return; } $info_string = serialize($crawl_params); file_put_contents( CRAWL_DIR."/schedules/queue_server_messages.txt", $info_string); chmod(CRAWL_DIR."/schedules/queue_server_messages.txt", 0777); if($seed_info != NULL) { $scheduler_info[self::HASH_SEEN_URLS] = array(); foreach ($seed_info['seed_sites']['url'] as $site) { $scheduler_info[self::TO_CRAWL][] = array($site, 1.0); } $scheduler_string = "\n".webencode( gzcompress(serialize($scheduler_info))); file_put_contents( CRAWL_DIR."/schedules/".self::schedule_start_name, $scheduler_string); } } /** * Used to send a message to the queue_servers to stop a crawl * @param array $machine_urls an array of urls of yioop queue servers */ function sendStopCrawlMessage($machine_urls = NULL) { if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) { $this->execMachines("sendStopCrawlMessage", $machine_urls); return; } $info = array(); $info[self::STATUS] = "STOP_CRAWL"; $info_string = serialize($info); file_put_contents( CRAWL_DIR."/schedules/queue_server_messages.txt", $info_string); } /** * Gets a list of all index archives of crawls that have been conducted * * @param bool $return_arc_bundles whether index bundles used for indexing * arc or other archive bundles should be included in the lsit * @param bool $return_recrawls whether index archive bundles generated as * a result of recrawling should be included in the result * @param array $machine_urls an array of urls of yioop queue servers * @param bool $cache whether to try to get/set the data to a cache file * * @return array Available IndexArchiveBundle directories and * their meta information this meta information includes the time of * the crawl, its description, the number of pages downloaded, and the * number of partitions used in storing the inverted index */ function getCrawlList($return_arc_bundles = false, $return_recrawls = false, $machine_urls = NULL, $cache = false) { if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) { $pre_arg = ($return_arc_bundles && $return_recrawls) ? 3 : ($return_recrawls) ? 2 : ($return_arc_bundles) ? 
    /**
     * Gets a list of all index archives of crawls that have been conducted
     *
     * @param bool $return_arc_bundles whether index bundles used for
     *      indexing arc or other archive bundles should be included in the
     *      list
     * @param bool $return_recrawls whether index archive bundles generated
     *      as a result of recrawling should be included in the result
     * @param array $machine_urls an array of urls of yioop queue servers
     * @param bool $cache whether to try to get/set the data to a cache file
     * @return array available IndexArchiveBundle directories and their meta
     *      information; this meta information includes the time of the
     *      crawl, its description, the number of pages downloaded, and the
     *      number of partitions used in storing the inverted index
     */
    function getCrawlList($return_arc_bundles = false,
        $return_recrawls = false, $machine_urls = NULL, $cache = false)
    {
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $pre_arg = ($return_arc_bundles && $return_recrawls) ? 3 :
                (($return_recrawls) ? 2 : (($return_arc_bundles) ? 1 : 0));
            $cache_file = CRAWL_DIR."/cache/NetworkCrawlList$pre_arg.txt";
            if($cache && file_exists($cache_file) &&
                filemtime($cache_file) + 300 > time()) {
                return unserialize(file_get_contents($cache_file));
            }
            $arg = "arg=$pre_arg";
            $list_strings = $this->execMachines("getCrawlList",
                $machine_urls, $arg);
            $list = $this->aggregateCrawlList($list_strings);
            if($cache) {
                file_put_contents($cache_file, serialize($list));
            }
            return $list;
        }
        $list = array();
        $dirs = glob(CRAWL_DIR.'/cache/*', GLOB_ONLYDIR);
        foreach($dirs as $dir) {
            if(strlen($pre_timestamp = strstr($dir,
                self::index_data_base_name)) > 0) {
                $crawl = array();
                $crawl['CRAWL_TIME'] = substr($pre_timestamp,
                    strlen(self::index_data_base_name));
                $info = IndexArchiveBundle::getArchiveInfo($dir);
                $index_info = unserialize($info['DESCRIPTION']);
                $crawl['DESCRIPTION'] = "";
                if(!$return_arc_bundles && isset($index_info['ARCFILE'])) {
                    continue;
                } else if ($return_arc_bundles &&
                    isset($index_info['ARCFILE'])) {
                    $crawl['DESCRIPTION'] = "ARCFILE::";
                }
                if(!$return_recrawls &&
                    isset($index_info[self::CRAWL_TYPE]) &&
                    $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
                    continue;
                } else if($return_recrawls &&
                    isset($index_info[self::CRAWL_TYPE]) &&
                    $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
                    $crawl['DESCRIPTION'] = "RECRAWL::";
                }
                $schedules = glob(CRAWL_DIR.'/schedules/'.
                    self::schedule_data_base_name.$crawl['CRAWL_TIME'].
                    '/*/At*.txt');
                $crawl['RESUMABLE'] = (count($schedules) > 0) ? true : false;
                $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
                $crawl['VISITED_URLS_COUNT'] =
                    isset($info['VISITED_URLS_COUNT']) ?
                    $info['VISITED_URLS_COUNT'] : 0;
                $crawl['COUNT'] = $info['COUNT'];
                $crawl['NUM_DOCS_PER_PARTITION'] =
                    $info['NUM_DOCS_PER_PARTITION'];
                $crawl['WRITE_PARTITION'] = $info['WRITE_PARTITION'];
                $list[] = $crawl;
            }
        }
        return $list;
    }

    /**
     * When @see getCrawlList() is used in a multi-queue_server setting this
     * method is used to integrate the crawl lists received from the
     * different machines
     *
     * @param array $list_strings serialized crawl list data from the
     *      different queue_servers
     * @param string $data_field field of $list_strings to use for data
     * @return array list of crawls and their meta data
     */
    function aggregateCrawlList($list_strings, $data_field = NULL)
    {
        $pre_list = array();
        foreach($list_strings as $list_string) {
            $a_list = unserialize(webdecode(
                $list_string[self::PAGE]));
            if($data_field != NULL) {
                $a_list = $a_list[$data_field];
            }
            if(is_array($a_list)) {
                foreach($a_list as $elt) {
                    $timestamp = $elt['CRAWL_TIME'];
                    if(!isset($pre_list[$timestamp])) {
                        $pre_list[$timestamp] = $elt;
                    } else {
                        $pre_list[$timestamp]["VISITED_URLS_COUNT"] +=
                            $elt["VISITED_URLS_COUNT"];
                        $pre_list[$timestamp]["COUNT"] += $elt["COUNT"];
                        $pre_list[$timestamp]['RESUMABLE'] |=
                            $elt['RESUMABLE'];
                    }
                }
            }
        }
        $list = array_values($pre_list);
        return $list;
    }
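    /* Shape sketch of one entry in the list getCrawlList() returns (keys
     * taken from the code above, values hypothetical):
     *
     *   array('CRAWL_TIME' => '1325135267', 'DESCRIPTION' => 'test crawl',
     *       'RESUMABLE' => true, 'VISITED_URLS_COUNT' => 12000,
     *       'COUNT' => 50000, 'NUM_DOCS_PER_PARTITION' => 50000,
     *       'WRITE_PARTITION' => 2)
     */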
    /**
     * Determines if the length of time since any of the fetchers has spoken
     * with any of the queue_servers has exceeded CRAWL_TIME_OUT. If so,
     * typically the caller of this method would do something such as
     * officially stop the crawl.
     *
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return bool whether the current crawl is stalled or not
     */
    function crawlStalled($machine_urls = NULL)
    {
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $outputs = $this->execMachines("crawlStalled", $machine_urls);
            return $this->aggregateStalled($outputs);
        }
        if(file_exists(CRAWL_DIR."/schedules/crawl_status.txt")) {
            //assume if the status hasn't been updated for CRAWL_TIME_OUT
            //the crawl is not active (check both scheduler and indexer)
            if(filemtime(CRAWL_DIR."/schedules/crawl_status.txt") +
                CRAWL_TIME_OUT < time()) {
                return true;
            }
            $schedule_status_exists =
                file_exists(CRAWL_DIR."/schedules/schedule_status.txt");
            if($schedule_status_exists &&
                filemtime(CRAWL_DIR."/schedules/schedule_status.txt") +
                CRAWL_TIME_OUT < time()) {
                return true;
            }
        }
        return false;
    }

    /**
     * When @see crawlStalled() is used in a multi-queue_server setting this
     * method is used to integrate the stalled information received from the
     * different machines
     *
     * @param array $stall_statuses contains web encoded serialized data,
     *      one field of which has the boolean data concerning stalled
     *      status
     * @param string $data_field field of $stall_statuses to use for data;
     *      if NULL then each element of $stall_statuses is a web encoded
     *      serialized boolean
     * @return bool true if at least one queue_server has not heard from one
     *      of its fetchers within the time out period
     */
    function aggregateStalled($stall_statuses, $data_field = NULL)
    {
        foreach($stall_statuses as $status) {
            $stall_status = unserialize(webdecode($status[self::PAGE]));
            if($data_field != NULL) {
                $stall_status = $stall_status[$data_field];
            }
            if($stall_status === true) {
                return true;
            }
        }
        return false;
    }
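    /* Usage sketch (hypothetical glue code, such as an admin monitor might
     * run): poll for a stalled crawl and officially stop it, as the
     * docblock above suggests:
     *
     *   if($crawl_model->crawlStalled($machine_urls)) {
     *       $crawl_model->sendStopCrawlMessage($machine_urls);
     *   }
     */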
    /**
     * Returns data about the current crawl such as DESCRIPTION, TIMESTAMP,
     * peak memory of various processes, most recent fetcher, most recent
     * urls, urls seen, urls visited, etc.
     *
     * @param array $machine_urls an array of urls of yioop queue servers
     *      on which the crawl is being conducted
     * @return array associative array of the said data
     */
    function crawlStatus($machine_urls = NULL)
    {
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $status_strings = $this->execMachines("crawlStatus",
                $machine_urls);
            return $this->aggregateStatuses($status_strings);
        }
        $data = array();
        $crawl_status_exists =
            file_exists(CRAWL_DIR."/schedules/crawl_status.txt");
        if($crawl_status_exists) {
            $crawl_status = @unserialize(file_get_contents(
                CRAWL_DIR."/schedules/crawl_status.txt"));
        }
        $schedule_status_exists =
            file_exists(CRAWL_DIR."/schedules/schedule_status.txt");
        if($schedule_status_exists) {
            $schedule_status = @unserialize(file_get_contents(
                CRAWL_DIR."/schedules/schedule_status.txt"));
            if(isset($schedule_status[self::TYPE]) &&
                $schedule_status[self::TYPE] == self::SCHEDULER) {
                $data['SCHEDULER_PEAK_MEMORY'] =
                    isset($schedule_status[self::MEMORY_USAGE]) ?
                    $schedule_status[self::MEMORY_USAGE] : 0;
            }
        }
        $data = (isset($crawl_status) && is_array($crawl_status)) ?
            array_merge($data, $crawl_status) : $data;
        if(isset($data['VISITED_COUNT_HISTORY']) &&
            count($data['VISITED_COUNT_HISTORY']) > 1) {
            $recent = array_shift($data['VISITED_COUNT_HISTORY']);
            $data["MOST_RECENT_TIMESTAMP"] = $recent[0];
            $oldest = array_pop($data['VISITED_COUNT_HISTORY']);
            unset($data['VISITED_COUNT_HISTORY']);
            $change_in_time_hours = floatval(time() - $oldest[0])/3600.;
            $change_in_urls = $recent[1] - $oldest[1];
            $data['VISITED_URLS_COUNT_PER_HOUR'] =
                ($change_in_time_hours > 0) ?
                $change_in_urls/$change_in_time_hours : 0;
        } else {
            $data['VISITED_URLS_COUNT_PER_HOUR'] = 0;
        }
        return $data;
    }

    /**
     * When @see crawlStatus() is used in a multi-queue_server setting this
     * method is used to integrate the status information received from the
     * different machines
     *
     * @param array $status_strings web encoded serialized status data from
     *      the different queue_servers
     * @param string $data_field field of $status_strings to use for data
     * @return array associative array of DESCRIPTION, TIMESTAMP,
     *      peak memory of various processes, most recent fetcher, most
     *      recent urls, urls seen, urls visited, etc.
     */
    function aggregateStatuses($status_strings, $data_field = NULL)
    {
        $status = array();
        $status['WEBAPP_PEAK_MEMORY'] = 0;
        $status['FETCHER_PEAK_MEMORY'] = 0;
        $status['QUEUE_PEAK_MEMORY'] = 0;
        $status["SCHEDULER_PEAK_MEMORY"] = 0;
        $status["COUNT"] = 0;
        $status["VISITED_URLS_COUNT"] = 0;
        $status["VISITED_URLS_COUNT_PER_HOUR"] = 0;
        $status["MOST_RECENT_TIMESTAMP"] = 0;
        $status["DESCRIPTION"] = "";
        $status['MOST_RECENT_FETCHER'] = "";
        $status['MOST_RECENT_URLS_SEEN'] = array();
        $status['CRAWL_TIME'] = 0;
        foreach($status_strings as $status_string) {
            $a_status = unserialize(webdecode(
                $status_string[self::PAGE]));
            if($data_field != NULL) {
                $a_status = $a_status[$data_field];
            }
            $count_fields = array("COUNT", "VISITED_URLS_COUNT_PER_HOUR",
                "VISITED_URLS_COUNT");
            foreach($count_fields as $field) {
                if(isset($a_status[$field])) {
                    $status[$field] += $a_status[$field];
                }
            }
            if(isset($a_status["CRAWL_TIME"]) &&
                $a_status["CRAWL_TIME"] >= $status['CRAWL_TIME']) {
                $status['CRAWL_TIME'] = $a_status["CRAWL_TIME"];
                $text_fields = array("DESCRIPTION", "MOST_RECENT_FETCHER");
                foreach($text_fields as $field) {
                    if(isset($a_status[$field])) {
                        if($status[$field] == "" ||
                            in_array($status[$field],
                            array("BEGIN_CRAWL", "RESUME_CRAWL"))) {
                            $status[$field] = $a_status[$field];
                        }
                    }
                }
            }
            if(isset($a_status["MOST_RECENT_TIMESTAMP"]) &&
                $status["MOST_RECENT_TIMESTAMP"] <=
                $a_status["MOST_RECENT_TIMESTAMP"]) {
                $status["MOST_RECENT_TIMESTAMP"] =
                    $a_status["MOST_RECENT_TIMESTAMP"];
                if(isset($a_status['MOST_RECENT_URLS_SEEN'])) {
                    $status['MOST_RECENT_URLS_SEEN'] =
                        $a_status['MOST_RECENT_URLS_SEEN'];
                }
            }
            $memory_fields = array("WEBAPP_PEAK_MEMORY",
                "FETCHER_PEAK_MEMORY", "QUEUE_PEAK_MEMORY",
                "SCHEDULER_PEAK_MEMORY");
            foreach($memory_fields as $field) {
                $status[$field] = (!isset($a_status[$field])) ?
                    0 : max($status[$field], $a_status[$field]);
            }
        }
        return $status;
    }
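    /* Worked example of the rate computed in crawlStatus() above
     * (hypothetical history): if the newest VISITED_COUNT_HISTORY entry is
     * array($t, 5000) and the oldest is array($t - 7200, 3000), then
     * $change_in_time_hours = 2.0, $change_in_urls = 2000, and
     * VISITED_URLS_COUNT_PER_HOUR = 1000.
     */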
    /**
     * This method is used to reduce the number of network requests needed
     * by the crawlStatus method of admin_controller. It returns an array
     * containing the results of the @see crawlStalled, @see crawlStatus
     * and @see getCrawlList methods
     *
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return array containing three components, one for each of the three
     *      kinds of results listed above
     */
    function combinedCrawlInfo($machine_urls = NULL)
    {
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $combined_strings =
                $this->execMachines("combinedCrawlInfo", $machine_urls);
            $combined = array();
            $combined[] = $this->aggregateStalled($combined_strings, 0);
            $combined[] = $this->aggregateStatuses($combined_strings, 1);
            $combined[] = $this->aggregateCrawlList($combined_strings, 2);
            return $combined;
        }
        $combined = array();
        $combined[] = $this->crawlStalled();
        $combined[] = $this->crawlStatus();
        $combined[] = $this->getCrawlList(false, true);
        return $combined;
    }

    /**
     * Adds the provided urls to the schedule directory of URLs that will
     * be crawled
     *
     * @param string $timestamp Unix timestamp of the crawl to add to the
     *      schedule of
     * @param array $inject_urls urls to be added to the schedule of
     *      the active crawl
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    function injectUrlsCurrentCrawl($timestamp, $inject_urls,
        $machine_urls = NULL)
    {
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $this->execMachines("injectUrlsCurrentCrawl", $machine_urls,
                serialize(array($timestamp, $inject_urls)));
            return;
        }
        $dir = CRAWL_DIR."/schedules/".
            self::schedule_data_base_name . $timestamp;
        if(!file_exists($dir)) {
            mkdir($dir);
            chmod($dir, 0777);
        }
        $day = floor($timestamp/86400) - 1;
            //want before all other schedules, so it will be executed next
        $dir .= "/$day";
        if(!file_exists($dir)) {
            mkdir($dir);
            chmod($dir, 0777);
        }
        $count = count($inject_urls);
        if($count > 0) {
            $now = time();
            $schedule_data = array();
            $schedule_data[self::SCHEDULE_TIME] = $timestamp;
            $schedule_data[self::TO_CRAWL] = array();
            for($i = 0; $i < $count; $i++) {
                $url = $inject_urls[$i];
                $hash = crawlHash($now.$url);
                $schedule_data[self::TO_CRAWL][] = array($url, 1, $hash);
            }
            $data_string = webencode(
                gzcompress(serialize($schedule_data)));
            $data_hash = crawlHash($data_string);
            file_put_contents($dir."/At1From127-0-0-1".
                "WithHash$data_hash.txt", $data_string);
            return true;
        }
        return false;
    }
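    /* Usage sketch (hypothetical url and timestamp): pushing an extra seed
     * url into a running crawl; on a single-machine setup this writes a
     * schedule file named At1From127-0-0-1WithHash<hash>.txt as in the
     * method above:
     *
     *   $crawl_model->injectUrlsCurrentCrawl("1325135267",
     *       array("http://www.example.com/new-page.html"));
     */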
"&num=$num_machines"; if($arg != NULL) { $arg = webencode($arg); $query .= "&arg=$arg"; } $sites = array(); $post_data = array(); $i = 0; foreach($machine_urls as $machine_url) { $sites[$i][CrawlConstants::URL] = $machine_url; $post_data[$i] = $query."&i=$i"; $i++; } $outputs = array(); if(count($sites) > 0) { $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true, $post_data); } return $outputs; } } ?>