diff --git a/controllers/FetchController.php b/controllers/FetchController.php
index ebed0a701..1dee71449 100755
--- a/controllers/FetchController.php
+++ b/controllers/FetchController.php
@@ -182,14 +182,9 @@ class FetchController extends Controller implements CrawlConstants
             $result_timestamp = $crawl_time;
             $result_dir = WORK_DIRECTORY.
                 "/schedules/".self::name_archive_iterator.$crawl_time;
-
             $arctype = $info[self::ARC_TYPE];
             $iterator_name = NS_ARCHIVE . $arctype."Iterator";
-
-            if (!class_exists($iterator_name)) {
-                $info['ARCHIVE_BUNDLE_ERROR'] =
-                    "Invalid bundle iterator: '{$iterator_name}'";
-            } else {
+            try {
                 if ($info[self::ARC_DIR] == "MIX") {
                     //recrawl of crawl mix case
                     $archive_iterator = new $iterator_name(
@@ -200,11 +195,15 @@ class FetchController extends Controller implements CrawlConstants
                         $iterate_timestamp, $info[self::ARC_DIR],
                         $result_timestamp, $result_dir);
                 }
+            } catch (\Exception $e) {
+                $info['ARCHIVE_BUNDLE_ERROR'] =
+                    "Invalid bundle iterator: '{$iterator_name}' \n".
+                    $e->getMessage();
             }
         }
         $pages = false;
         if ($archive_iterator && !$archive_iterator->end_of_iterator) {
-            if (generalIsA($archive_iterator,
+            if (L\generalIsA($archive_iterator,
                 "TextArchiveBundleIterator")) {
                 $pages = $archive_iterator->nextChunk();
                 $chunk = true;
diff --git a/controllers/ResourceController.php b/controllers/ResourceController.php
index 862b2c54e..af2ead08a 100644
--- a/controllers/ResourceController.php
+++ b/controllers/ResourceController.php
@@ -178,7 +178,7 @@ class ResourceController extends Controller implements CrawlConstants,
         $group = $group_model->getGroupById($group_id, $user_id);
         if (!$group) { return false; }
         $hash_word = (isset($_REQUEST['t'])) ? 'thumb' : 'group';
-        $subfolder = crawlHash(
+        $subfolder = L\crawlHash(
             $hash_word . $group_id. $page_id . AUTH_KEY);
         $prefix_folder = substr($subfolder, 0, 3);
         $add_to_path = true;
diff --git a/executables/Fetcher.php b/executables/Fetcher.php
index 9c9d1601e..16d4a840e 100755
--- a/executables/Fetcher.php
+++ b/executables/Fetcher.php
@@ -1174,7 +1174,7 @@ class Fetcher implements CrawlConstants
             "&check_crawl_time=".$this->check_crawl_time;
         L\crawlLog($request);
         $response_string = FetchUrl::getPage($request, null, true);
-
+L\crawlLog($response_string);
         if ($response_string === false) {
             L\crawlLog("The following request failed:");
             L\crawlLog($request);
diff --git a/executables/QueueServer.php b/executables/QueueServer.php
index eba23e29a..c78811acd 100755
--- a/executables/QueueServer.php
+++ b/executables/QueueServer.php
@@ -417,7 +417,7 @@ class QueueServer implements CrawlConstants, Join
         }
         L\crawlLog("{$this->server_name} active crawl is ".
             "{$this->crawl_time}.");
-        if ($this->isAScheduler()) {
+        if ($this->isAScheduler() && $this->crawl_type == self::WEB_CRAWL) {
             L\crawlLog("Current queue size is:".
                 $this->web_queue->to_crawl_queue->count);
         }
diff --git a/library/Bzip2BlockIterator.php b/library/Bzip2BlockIterator.php
index f64df578b..e478af69b 100644
--- a/library/Bzip2BlockIterator.php
+++ b/library/Bzip2BlockIterator.php
@@ -132,11 +132,11 @@ class BZip2BlockIterator
         $this->fd = fopen($this->path, 'rb');
         $this->header = fread($this->fd, 4);
         if (substr($this->header, 0, 3) != self::MAGIC) {
-            throw new Exception('Bad bz2 magic number. Not a bz2 file?');
+            throw new \Exception('Bad bz2 magic number. Not a bz2 file?');
         }
         $this->block = fread($this->fd, 6);
         if ($this->block != self::BLOCK_HEADER) {
-            throw new Exception('Bad bz2 block header');
+            throw new \Exception('Bad bz2 block header');
         }
         $this->file_offset = 10;
     }
@@ -176,11 +176,8 @@ class BZip2BlockIterator
             $next_chunk = fread($this->fd, self::BLOCK_SIZE);
             $this->file_offset += strlen($next_chunk);
             $this->buffer .= $next_chunk;
-            $match = preg_match(
-                self::BLOCK_LEADER_RE,
-                $this->buffer,
-                $matches,
-                PREG_OFFSET_CAPTURE);
+            $match = preg_match( self::BLOCK_LEADER_RE, $this->buffer,
+                $matches, PREG_OFFSET_CAPTURE);
             if ($match) {
                 /*
                    $pos is the position of the SECOND byte of the magic number
diff --git a/library/archive_bundle_iterators/MediawikiBundleIterator.php b/library/archive_bundle_iterators/MediawikiBundleIterator.php
deleted file mode 100644
index cc022242e..000000000
--- a/library/archive_bundle_iterators/MediawikiBundleIterator.php
+++ /dev/null
@@ -1,291 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- * Copyright (C) 2009 - 2015 Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * END LICENSE
- *
- * @author Chris Pollett chris@pollett.org
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009 - 2015
- * @filesource
- */
-namespace seekquarry\yioop\library\archive_bundle_iterators;
-
-use seekquarry\yioop\library\Bzip2BlockIterator;
-use seekquarry\yioop\library\WikiParser;
-
-if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-/**
- * Used to define the styles we put on cache wiki pages
- */
-define('WIKI_PAGE_STYLES', <<<EOD
-<style type="text/css">
-table.wikitable
-{
-    background:white;
-    border:1px #aaa solid;
-    border-collapse: scollapse
-    margin:1em 0;
-}
-table.wikitable > tr > th,table.wikitable > tr > td,
-table.wikitable > * > tr > th,table.wikitable > * > tr > td
-{
-    border:1px #aaa solid;
-    padding:0.2em;
-}
-table.wikitable > tr > th,
-table.wikitable > * > tr > th
-{
-    text-align:center;
-    background:white;
-    font-weight:bold
-}
-table.wikitable > caption
-{
-    font-weight:bold;
-}
-</style>
-EOD
-);
-/**
- * Used to iterate through a collection of .xml.bz2 media wiki files
- * stored in a WebArchiveBundle folder. Here these media wiki files contain the
- * kinds of documents used by wikipedia. Iteration would be
- * for the purpose making an index of these records
- *
- * @author Chris Pollett
- * @package seek_quarry\library\archive_bundle_iterator
- * @see WebArchiveBundle
- */
-class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
-    implements CrawlConstants
-{
-    /**
-     * Used to hold a WikiParser object that will be used for parsing
-     * @var object
-     */
-    public $parser;
-    /**
-     * Creates a media wiki archive iterator with the given parameters.
-     *
-     * @param string $iterate_timestamp timestamp of the arc archive bundle to
-     *     iterate over the pages of
-     * @param string $iterate_dir folder of files to iterate over
-     * @param string $result_timestamp timestamp of the arc archive bundle
-     *     results are being stored in
-     * @param string $result_dir where to write last position checkpoints to
-     */
-    public function __construct($iterate_timestamp, $iterate_dir,
-        $result_timestamp, $result_dir)
-    {
-        $ini = [ 'compression' => 'bzip2',
-            'file_extension' => 'bz2',
-            'encoding' => 'UTF-8',
-            'start_delimiter' => '@page@'];
-        parent::__construct($iterate_timestamp, $iterate_dir,
-            $result_timestamp, $result_dir, $ini);
-        $this->switch_partition_callback_name = "readMediaWikiHeader";
-    }
-    /**
-     * Estimates the important of the site according to the weighting of
-     * the particular archive iterator
-     * @param $site an associative array containing info about a web page
-     * @return int a 4-bit number based on the log_2 size - 10 of the wiki
-     *     entry (@see nextPage).
-     */
-    public function weight(&$site)
-    {
-        return min($site[self::WEIGHT], 15);
-    }
-    /**
-     * Reads the siteinfo tag of the mediawiki xml file and extract data that
-     * will be used in constructing page summaries.
-     */
-    public function readMediaWikiHeader()
-    {
-        $this->header = [];
-        $site_info = $this->getNextTagData("siteinfo");
-        $found_lang =
-            preg_match('/lang\=\"(.*)\"/', $this->remainder, $matches);
-        if ($found_lang) {
-            $this->header['lang'] = $matches[1];
-        }
-        if ($site_info === false) {
-            $this->bz2_iterator = null;
-            return false;
-        }
-        $dom = new \DOMDocument();
-        @$dom->loadXML($site_info);
-        $this->header['sitename'] = $this->getTextContent($dom,
-            "/siteinfo/sitename");
-        $pre_host_name =
-            $this->getTextContent($dom, "/siteinfo/base");
-        $this->header['base_address'] = substr($pre_host_name, 0,
-            strrpos($pre_host_name, "/") + 1);
-        $url_parts = @parse_url($this->header['base_address']);
-        $this->header['ip_address'] = gethostbyname($url_parts['host']);
-        return true;
-    }
-    /**
-     * Used to initialize the arrays of match/replacements used to format
-     * wikimedia syntax into HTML (not perfectly since we are only doing
-     * regexes)
-     *
-     * @param string $base_address base url for link substitutions
-     */
-    public function initializeSubstitutions($base_address)
-    {
-        $add_substitutions = [
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ["/\[\[Image:(.+?)(right\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
-            ["/\[\[Image:(.+?)(left\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
-            ["/\[\[Image:(.+?)(\|left)\]\]/s", "[[Image:$1]]"],
-            ["/\[\[Image:(.+?)(\|right)\]\]/s", "[[Image:$1]]"],
-            ["/\[\[Image:(.+?)(thumb\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
-            ["/\[\[Image:(.+?)(live\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
-            ["/\[\[Image:(.+?)(\s*\d*\s*(px|in|cm|".
-                "pt|ex|em)\s*\|)(.+?)\]\]/s","[[Image:$1$4]]"],
-            ["/\[\[Image:([^\|]+?)\]\]/s",
-                "(<a href=\"{$base_address}File:$1\" >Image:$1</a>)"],
-            ["/\[\[Image:(.+?)\|(.+?)\]\]/s",
-                "(<a href=\"{$base_address}File:$1\">Image:$2</a>)"],
-            ["/\[\[File:(.+?)\|(right\|)?thumb(.+?)\]\]/s",
-                "(<a href=\"{$base_address}File:$1\">Image:$1</a>)"],
-            ["/{{Redirect2?\|([^{}\|]+)\|([^{}\|]+)\|([^{}\|]+)}}/i",
-                "<div class='indent'>\"$1\". ($2 →<a href=\"".
-                $base_address."$3\">$3</a>)</div>"],
-            ["/{{Redirect\|([^{}\|]+)}}/i",
-                "<div class='indent'>\"$1\". (<a href=\"".
-                $base_address. "$1_(disambiguation)\">$1???</a>)</div>"],
-            ["/#REDIRECT:\s+\[\[(.+?)\]\]/",
-                "<a href='{$base_address}$1'>$1</a>"],
-            ["/{{pp-(.+?)}}/s", ""],
-            ["/{{bot(.*?)}}/si", ""],
-            ['/{{Infobox.*?\n}}/si', ""],
-            ['/{{Clear.*?\n}}/si', ""],
-            ["/{{clarify\|(.+?)}}/si", ""],
-            ['/{{[^}]*}}/s', ""],
-        ];
-        $this->parser = new WikiParser($base_address, $add_substitutions);
-    }
-    /**
-     * Restores the internal state from the file iterate_status.txt in the
-     * result dir such that the next call to nextPages will pick up from just
-     * after the last checkpoint. We also reset up our regex substitutions
-     *
-     * @return array the data serialized when saveCheckpoint was called
-     */
-    public function restoreCheckPoint()
-    {
-        $info = parent::restoreCheckPoint();
-        if (!$this->iterate_dir) { // do on client not name server
-            $this->initializeSubstitutions($this->header['base_address']);
-        }
-        return $info;
-    }
-    /**
-     * Gets the text content of the first dom node satisfying the
-     * xpath expression $path in the dom document $dom
-     *
-     * @param object $dom DOMDocument to get the text from
-     * @param $path xpath expression to find node with text
-     *
-     * @return string text content of the given node if it exists
-     */
-    public function getTextContent($dom, $path)
-    {
-        $xpath = new \DOMXPath($dom);
-        $objects = $xpath->evaluate($path);
-        if ($objects && is_object($objects) && $objects->item(0) != null ) {
-            return $objects->item(0)->textContent;
-        }
-        return "";
-    }
-    /**
-     * Gets the next doc from the iterator
-     * @param bool $no_process do not do any processing on page data
-     * @return array associative array for doc or string if no_process true
-     */
-    public function nextPage($no_process = false)
-    {
-        static $minimal_regexes = false;
-        static $first_call = true;
-        if ($first_call) {
-            if (!isset($this->header['base_address'])) {
-                $this->header['base_address'] = "";
-            }
-            $this->initializeSubstitutions($this->header['base_address']);
-        }
-        $page_info = $this->getNextTagData("page");
-        if ($no_process) { return $page_info; }
-        $dom = new \DOMDocument();
-        @$dom->loadXML($page_info);
-        $site = [];
-        $pre_url = $this->getTextContent($dom, "/page/title");
-        $pre_url = str_replace(" ", "_", $pre_url);
-        $site[self::URL] = $this->header['base_address'].$pre_url;
-        $site[self::IP_ADDRESSES] = [$this->header['ip_address']];
-        $pre_timestamp = $this->getTextContent($dom,
-            "/page/revision/timestamp");
-        $site[self::MODIFIED] = date("U", strtotime($pre_timestamp));
-        $site[self::TIMESTAMP] = time();
-        $site[self::TYPE] = "text/html";
-        $site[self::HEADER] = "mediawiki_bundle_iterator extractor";
-        $site[self::HTTP_CODE] = 200;
-        $site[self::ENCODING] = "UTF-8";
-        $site[self::SERVER] = "unknown";
-        $site[self::SERVER_VERSION] = "unknown";
-        $site[self::OPERATING_SYSTEM] = "unknown";
-        $site[self::PAGE] = "<html lang='".$this->header['lang']."' >\n".
-            "<head><title>$pre_url</title>\n".
-            WIKI_PAGE_STYLES . "\n</head>\n".
-            "<body><h1>$pre_url</h1>\n";
-        $pre_page = $this->getTextContent($dom, "/page/revision/text");
-        $current_hash = crawlHash($pre_page);
-        if ($first_call) {
-            $this->saveCheckPoint(); //ensure we remember to advance one on fail
-            $first_call = false;
-        }
-        $pre_page = $this->parser->parse($pre_page, false, true);
-        $pre_page = preg_replace("/{{Other uses}}/i",
-            "<div class='indent'>\"$1\". (<a href='".
-            $site[self::URL]. "_(disambiguation)'>$pre_url</a>)</div>",
-            $pre_page);
-        $site[self::PAGE] .= $pre_page;
-        $site[self::PAGE] .= "\n</body>\n</html>";
-        $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
-        $site[self::WEIGHT] = ceil(max(
-            log(strlen($site[self::PAGE]) + 1, 2) - 10, 1));
-        return $site;
-    }
-}
diff --git a/library/archive_bundle_iterators/TextArchiveBundleIterator.php b/library/archive_bundle_iterators/TextArchiveBundleIterator.php
index b9aacc61b..705564c29 100644
--- a/library/archive_bundle_iterators/TextArchiveBundleIterator.php
+++ b/library/archive_bundle_iterators/TextArchiveBundleIterator.php
@@ -31,6 +31,7 @@ namespace seekquarry\yioop\library\archive_bundle_iterators;
 
 use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\FetchUrl;
 use seekquarry\yioop\library\Bzip2BlockIterator;
 
@@ -232,12 +233,13 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
             $this->compression = "plain";
         }
         if (isset($ini['start_delimiter'])) {
-            $this->start_delimiter =addRegexDelimiters($ini['start_delimiter']);
+            $this->start_delimiter = L\addRegexDelimiters(
+                $ini['start_delimiter']);
         } else {
             $this->start_delimiter = "";
         }
         if (isset($ini['end_delimiter'])) {
-            $this->end_delimiter = addRegexDelimiters($ini['end_delimiter']);
+            $this->end_delimiter = L\addRegexDelimiters($ini['end_delimiter']);
         } else {
             $this->end_delimiter = "";
         }
@@ -350,8 +352,8 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
         $pages = [];
         $page_count = 0;
         for ($i = 0; $i < $num; $i++) {
-            crawlTimeoutLog("..Still getting pages from archive iterator. At %s"
-                ." of %s", $i, $num);
+            L\crawlTimeoutLog("..Still getting pages from archive iterator. ".
+                "At %s of %s", $i, $num);
             $page = $this->nextPage($no_process);
             if (!$page) {
                 if ($this->checkFileHandle()) {
@@ -396,7 +398,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
         $matches = [];
         while((preg_match($this->delimiter, $this->buffer, $matches,
             PREG_OFFSET_CAPTURE)) != 1) {
-            crawlTimeoutLog("..still looking for a page in local buffer");
+            L\crawlTimeoutLog("..still looking for a page in local buffer");
             $block = $this->getFileBlock();
             if (!$block || !$this->checkFileHandle() ||
                 $this->checkEof()) {
@@ -766,7 +768,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
            blocks.
         */
         while(!is_string($block = $this->getFileBlock())) {
-            crawlTimeoutLog("..still getting next tags data..");
+            L\crawlTimeoutLog("..still getting next tags data..");
            if ($this->checkEof()) return false;
         }
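Note (not part of the above commit): the FetchController hunk replaces an up-front class_exists() check with a try/catch around constructing the archive iterator from its class-name string, recording any failure in $info['ARCHIVE_BUNDLE_ERROR']. A minimal, self-contained PHP sketch of that pattern follows; BrokenIterator, the timestamp, and the path are hypothetical stand-ins for illustration only, not Yioop code.

<?php
// Hypothetical stand-in for an archive bundle iterator whose constructor
// throws when the archive folder it is asked to iterate over is missing.
class BrokenIterator
{
    public function __construct($iterate_timestamp, $iterate_dir)
    {
        if (!is_dir($iterate_dir)) {
            throw new \Exception("No archive directory at '$iterate_dir'");
        }
    }
}
$iterator_name = "BrokenIterator";
$info = [];
try {
    // instantiate by class-name string, as FetchController does
    $archive_iterator = new $iterator_name("1425000000", "/tmp/does_not_exist");
} catch (\Exception $e) {
    // as in the diff: record the problem rather than letting it escape
    $info['ARCHIVE_BUNDLE_ERROR'] = "Invalid bundle iterator: " .
        "'{$iterator_name}' \n" . $e->getMessage();
}
echo $info['ARCHIVE_BUNDLE_ERROR'] . "\n";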