viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Towards getting mediawiki working after changes, also works on bug in ResourceController, a=chris

Chris Pollett [2015-07-02]
Towards getting mediawiki working after changes, also works on bug in ResourceController, a=chris
Filename
controllers/FetchController.php
controllers/ResourceController.php
executables/Fetcher.php
executables/QueueServer.php
library/Bzip2BlockIterator.php
library/archive_bundle_iterators/MediawikiBundleIterator.php
library/archive_bundle_iterators/TextArchiveBundleIterator.php
diff --git a/controllers/FetchController.php b/controllers/FetchController.php
index ebed0a701..1dee71449 100755
--- a/controllers/FetchController.php
+++ b/controllers/FetchController.php
@@ -182,14 +182,9 @@ class FetchController extends Controller implements CrawlConstants
                 $result_timestamp = $crawl_time;
                 $result_dir = WORK_DIRECTORY.
                     "/schedules/".self::name_archive_iterator.$crawl_time;
-
                 $arctype = $info[self::ARC_TYPE];
                 $iterator_name = NS_ARCHIVE . $arctype."Iterator";
-
-                if (!class_exists($iterator_name)) {
-                    $info['ARCHIVE_BUNDLE_ERROR'] =
-                        "Invalid bundle iterator: '{$iterator_name}'";
-                } else {
+                try {
                     if ($info[self::ARC_DIR] == "MIX") {
                         //recrawl of crawl mix case
                         $archive_iterator = new $iterator_name(
@@ -200,11 +195,15 @@ class FetchController extends Controller implements CrawlConstants
                             $iterate_timestamp, $info[self::ARC_DIR],
                             $result_timestamp, $result_dir);
                     }
+                } catch (\Exception $e) {
+                    $info['ARCHIVE_BUNDLE_ERROR'] =
+                        "Invalid bundle iterator: '{$iterator_name}' \n".
+                        $e->getMessage();
                 }
             }
             $pages = false;
             if ($archive_iterator && !$archive_iterator->end_of_iterator) {
-                if (generalIsA($archive_iterator,
+                if (L\generalIsA($archive_iterator,
                     "TextArchiveBundleIterator")) {
                     $pages = $archive_iterator->nextChunk();
                     $chunk = true;
diff --git a/controllers/ResourceController.php b/controllers/ResourceController.php
index 862b2c54e..af2ead08a 100644
--- a/controllers/ResourceController.php
+++ b/controllers/ResourceController.php
@@ -178,7 +178,7 @@ class ResourceController extends Controller implements CrawlConstants,
             $group = $group_model->getGroupById($group_id, $user_id);
             if (!$group) { return false; }
             $hash_word = (isset($_REQUEST['t'])) ? 'thumb' : 'group';
-            $subfolder = crawlHash(
+            $subfolder = L\crawlHash(
                 $hash_word . $group_id. $page_id . AUTH_KEY);
             $prefix_folder = substr($subfolder, 0, 3);
             $add_to_path = true;
diff --git a/executables/Fetcher.php b/executables/Fetcher.php
index 9c9d1601e..16d4a840e 100755
--- a/executables/Fetcher.php
+++ b/executables/Fetcher.php
@@ -1174,7 +1174,7 @@ class Fetcher implements CrawlConstants
             "&check_crawl_time=".$this->check_crawl_time;
         L\crawlLog($request);
         $response_string = FetchUrl::getPage($request, null, true);
-
+L\crawlLog($response_string);
         if ($response_string === false) {
             L\crawlLog("The following request failed:");
             L\crawlLog($request);
diff --git a/executables/QueueServer.php b/executables/QueueServer.php
index eba23e29a..c78811acd 100755
--- a/executables/QueueServer.php
+++ b/executables/QueueServer.php
@@ -417,7 +417,7 @@ class QueueServer implements CrawlConstants, Join
             }
             L\crawlLog("{$this->server_name} active crawl is ".
                 "{$this->crawl_time}.");
-            if ($this->isAScheduler()) {
+            if ($this->isAScheduler() && $this->crawl_type == self::WEB_CRAWL) {
                 L\crawlLog("Current queue size is:".
                     $this->web_queue->to_crawl_queue->count);
             }
diff --git a/library/Bzip2BlockIterator.php b/library/Bzip2BlockIterator.php
index f64df578b..e478af69b 100644
--- a/library/Bzip2BlockIterator.php
+++ b/library/Bzip2BlockIterator.php
@@ -132,11 +132,11 @@ class BZip2BlockIterator
         $this->fd = fopen($this->path, 'rb');
         $this->header = fread($this->fd, 4);
         if (substr($this->header, 0, 3) != self::MAGIC) {
-            throw new Exception('Bad bz2 magic number. Not a bz2 file?');
+            throw new \Exception('Bad bz2 magic number. Not a bz2 file?');
         }
         $this->block = fread($this->fd, 6);
         if ($this->block != self::BLOCK_HEADER) {
-            throw new Exception('Bad bz2 block header');
+            throw new \Exception('Bad bz2 block header');
         }
         $this->file_offset = 10;
     }
@@ -176,11 +176,8 @@ class BZip2BlockIterator
             $next_chunk = fread($this->fd, self::BLOCK_SIZE);
             $this->file_offset += strlen($next_chunk);
             $this->buffer .= $next_chunk;
-            $match = preg_match(
-                self::BLOCK_LEADER_RE,
-                $this->buffer,
-                $matches,
-                PREG_OFFSET_CAPTURE);
+            $match = preg_match( self::BLOCK_LEADER_RE, $this->buffer,
+                $matches, PREG_OFFSET_CAPTURE);
             if ($match) {
                 /*
                     $pos is the position of the SECOND byte of the magic number
diff --git a/library/archive_bundle_iterators/MediawikiBundleIterator.php b/library/archive_bundle_iterators/MediawikiBundleIterator.php
deleted file mode 100644
index cc022242e..000000000
--- a/library/archive_bundle_iterators/MediawikiBundleIterator.php
+++ /dev/null
@@ -1,291 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * END LICENSE
- *
- * @author Chris Pollett chris@pollett.org
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009 - 2015
- * @filesource
- */
-namespace seekquarry\yioop\library\archive_bundle_iterators;
-
-use seekquarry\yioop\library\Bzip2BlockIterator;
-use seekquarry\yioop\library\WikiParser;
-
-if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-/**
- * Used to define the styles we put on cache wiki pages
- */
-define('WIKI_PAGE_STYLES', <<<EOD
-<style type="text/css">
-table.wikitable
-{
-    background:white;
-    border:1px #aaa solid;
-    border-collapse: scollapse
-    margin:1em 0;
-}
-table.wikitable > tr > th,table.wikitable > tr > td,
-table.wikitable > * > tr > th,table.wikitable > * > tr > td
-{
-    border:1px #aaa solid;
-    padding:0.2em;
-}
-table.wikitable > tr > th,
-table.wikitable > * > tr > th
-{
-    text-align:center;
-    background:white;
-    font-weight:bold
-}
-table.wikitable > caption
-{
-    font-weight:bold;
-}
-</style>
-EOD
-);
-/**
- * Used to iterate through a collection of .xml.bz2  media wiki files
- * stored in a WebArchiveBundle folder. Here these media wiki files contain the
- * kinds of documents used by wikipedia. Iteration would be
- * for the purpose making an index of these records
- *
- * @author Chris Pollett
- * @package seek_quarry\library\archive_bundle_iterator
- * @see WebArchiveBundle
- */
-class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
-    implements CrawlConstants
-{
-    /**
-     * Used to hold a WikiParser object that will be used for parsing
-     * @var object
-     */
-    public $parser;
-    /**
-     * Creates a media wiki archive iterator with the given parameters.
-     *
-     * @param string $iterate_timestamp timestamp of the arc archive bundle to
-     *     iterate  over the pages of
-     * @param string $iterate_dir folder of files to iterate over
-     * @param string $result_timestamp timestamp of the arc archive bundle
-     *     results are being stored in
-     * @param string $result_dir where to write last position checkpoints to
-     */
-    public function __construct($iterate_timestamp, $iterate_dir,
-            $result_timestamp, $result_dir)
-    {
-        $ini = [ 'compression' => 'bzip2',
-            'file_extension' => 'bz2',
-            'encoding' => 'UTF-8',
-            'start_delimiter' => '@page@'];
-        parent::__construct($iterate_timestamp, $iterate_dir,
-            $result_timestamp, $result_dir, $ini);
-        $this->switch_partition_callback_name = "readMediaWikiHeader";
-    }
-    /**
-     * Estimates the important of the site according to the weighting of
-     * the particular archive iterator
-     * @param $site an associative array containing info about a web page
-     * @return int a 4-bit number based on the log_2 size - 10 of the wiki
-     *     entry (@see nextPage).
-     */
-    public function weight(&$site)
-    {
-        return min($site[self::WEIGHT], 15);
-    }
-    /**
-     * Reads the siteinfo tag of the mediawiki xml file and extract data that
-     * will be used in constructing page summaries.
-     */
-    public function readMediaWikiHeader()
-    {
-        $this->header = [];
-        $site_info = $this->getNextTagData("siteinfo");
-        $found_lang =
-            preg_match('/lang\=\"(.*)\"/', $this->remainder, $matches);
-        if ($found_lang) {
-            $this->header['lang'] = $matches[1];
-        }
-        if ($site_info === false) {
-            $this->bz2_iterator = null;
-            return false;
-        }
-        $dom = new \DOMDocument();
-        @$dom->loadXML($site_info);
-        $this->header['sitename'] = $this->getTextContent($dom,
-            "/siteinfo/sitename");
-        $pre_host_name =
-            $this->getTextContent($dom, "/siteinfo/base");
-        $this->header['base_address'] = substr($pre_host_name, 0,
-            strrpos($pre_host_name, "/") + 1);
-        $url_parts = @parse_url($this->header['base_address']);
-        $this->header['ip_address'] = gethostbyname($url_parts['host']);
-        return true;
-    }
-    /**
-     * Used to initialize the arrays of match/replacements used to format
-     * wikimedia syntax into HTML (not perfectly since we are only doing
-     * regexes)
-     *
-     * @param string $base_address base url for link substitutions
-     */
-    public function initializeSubstitutions($base_address)
-    {
-        $add_substitutions = [
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
-            ["/\[\[Image:(.+?)(right\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
-            ["/\[\[Image:(.+?)(left\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
-            ["/\[\[Image:(.+?)(\|left)\]\]/s", "[[Image:$1]]"],
-            ["/\[\[Image:(.+?)(\|right)\]\]/s", "[[Image:$1]]"],
-            ["/\[\[Image:(.+?)(thumb\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
-            ["/\[\[Image:(.+?)(live\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
-            ["/\[\[Image:(.+?)(\s*\d*\s*(px|in|cm|".
-                "pt|ex|em)\s*\|)(.+?)\]\]/s","[[Image:$1$4]]"],
-            ["/\[\[Image:([^\|]+?)\]\]/s",
-                "(<a href=\"{$base_address}File:$1\" >Image:$1</a>)"],
-            ["/\[\[Image:(.+?)\|(.+?)\]\]/s",
-                "(<a href=\"{$base_address}File:$1\">Image:$2</a>)"],
-            ["/\[\[File:(.+?)\|(right\|)?thumb(.+?)\]\]/s",
-                "(<a href=\"{$base_address}File:$1\">Image:$1</a>)"],
-            ["/{{Redirect2?\|([^{}\|]+)\|([^{}\|]+)\|([^{}\|]+)}}/i",
-                "<div class='indent'>\"$1\". ($2 &rarr;<a href=\"".
-                $base_address."$3\">$3</a>)</div>"],
-            ["/{{Redirect\|([^{}\|]+)}}/i",
-                "<div class='indent'>\"$1\". (<a href=\"".
-                $base_address. "$1_(disambiguation)\">$1???</a>)</div>"],
-            ["/#REDIRECT:\s+\[\[(.+?)\]\]/",
-                "<a href='{$base_address}$1'>$1</a>"],
-            ["/{{pp-(.+?)}}/s", ""],
-            ["/{{bot(.*?)}}/si", ""],
-            ['/{{Infobox.*?\n}}/si', ""],
-            ['/{{Clear.*?\n}}/si', ""],
-            ["/{{clarify\|(.+?)}}/si", ""],
-            ['/{{[^}]*}}/s', ""],
-        ];
-        $this->parser = new WikiParser($base_address, $add_substitutions);
-    }
-    /**
-     * Restores the internal state from the file iterate_status.txt in the
-     * result dir such that the next call to nextPages will pick up from just
-     * after the last checkpoint. We also reset up our regex substitutions
-     *
-     * @return array the data serialized when saveCheckpoint was called
-     */
-    public function restoreCheckPoint()
-    {
-        $info = parent::restoreCheckPoint();
-        if (!$this->iterate_dir) { // do on client not name server
-            $this->initializeSubstitutions($this->header['base_address']);
-        }
-        return $info;
-    }
-    /**
-     * Gets the text content of the first dom node satisfying the
-     * xpath expression $path in the dom document $dom
-     *
-     * @param object $dom DOMDocument to get the text from
-     * @param $path xpath expression to find node with text
-     *
-     * @return string text content of the given node if it exists
-     */
-    public function getTextContent($dom, $path)
-    {
-        $xpath = new \DOMXPath($dom);
-        $objects = $xpath->evaluate($path);
-        if ($objects  && is_object($objects) && $objects->item(0) != null ) {
-            return $objects->item(0)->textContent;
-        }
-        return "";
-    }
-    /**
-     * Gets the next doc from the iterator
-     * @param bool $no_process do not do any processing on page data
-     * @return array associative array for doc or string if no_process true
-     */
-    public function nextPage($no_process = false)
-    {
-        static $minimal_regexes = false;
-        static $first_call = true;
-        if ($first_call) {
-            if (!isset($this->header['base_address'])) {
-                $this->header['base_address'] = "";
-            }
-            $this->initializeSubstitutions($this->header['base_address']);
-        }
-        $page_info = $this->getNextTagData("page");
-        if ($no_process) { return $page_info; }
-        $dom = new \DOMDocument();
-        @$dom->loadXML($page_info);
-        $site = [];
-        $pre_url = $this->getTextContent($dom, "/page/title");
-        $pre_url = str_replace(" ", "_", $pre_url);
-        $site[self::URL] = $this->header['base_address'].$pre_url;
-        $site[self::IP_ADDRESSES] = [$this->header['ip_address']];
-        $pre_timestamp = $this->getTextContent($dom,
-            "/page/revision/timestamp");
-        $site[self::MODIFIED] = date("U", strtotime($pre_timestamp));
-        $site[self::TIMESTAMP] = time();
-        $site[self::TYPE] = "text/html";
-        $site[self::HEADER] = "mediawiki_bundle_iterator extractor";
-        $site[self::HTTP_CODE] = 200;
-        $site[self::ENCODING] = "UTF-8";
-        $site[self::SERVER] = "unknown";
-        $site[self::SERVER_VERSION] = "unknown";
-        $site[self::OPERATING_SYSTEM] = "unknown";
-        $site[self::PAGE] = "<html lang='".$this->header['lang']."' >\n".
-            "<head><title>$pre_url</title>\n".
-            WIKI_PAGE_STYLES . "\n</head>\n".
-            "<body><h1>$pre_url</h1>\n";
-        $pre_page = $this->getTextContent($dom, "/page/revision/text");
-        $current_hash = crawlHash($pre_page);
-        if ($first_call) {
-            $this->saveCheckPoint(); //ensure we remember to advance one on fail
-            $first_call = false;
-        }
-        $pre_page = $this->parser->parse($pre_page, false, true);
-        $pre_page = preg_replace("/{{Other uses}}/i",
-                "<div class='indent'>\"$1\". (<a href='".
-                $site[self::URL]. "_(disambiguation)'>$pre_url</a>)</div>",
-                $pre_page);
-        $site[self::PAGE] .= $pre_page;
-        $site[self::PAGE] .= "\n</body>\n</html>";
-        $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
-        $site[self::WEIGHT] = ceil(max(
-            log(strlen($site[self::PAGE]) + 1, 2) - 10, 1));
-        return $site;
-    }
-}
diff --git a/library/archive_bundle_iterators/TextArchiveBundleIterator.php b/library/archive_bundle_iterators/TextArchiveBundleIterator.php
index b9aacc61b..705564c29 100644
--- a/library/archive_bundle_iterators/TextArchiveBundleIterator.php
+++ b/library/archive_bundle_iterators/TextArchiveBundleIterator.php
@@ -31,6 +31,7 @@
 namespace seekquarry\yioop\library\archive_bundle_iterators;

 use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\FetchUrl;
 use seekquarry\yioop\library\Bzip2BlockIterator;

@@ -232,12 +233,13 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
             $this->compression = "plain";
         }
         if (isset($ini['start_delimiter'])) {
-            $this->start_delimiter =addRegexDelimiters($ini['start_delimiter']);
+            $this->start_delimiter = L\addRegexDelimiters(
+                $ini['start_delimiter']);
         } else {
             $this->start_delimiter = "";
         }
         if (isset($ini['end_delimiter'])) {
-            $this->end_delimiter = addRegexDelimiters($ini['end_delimiter']);
+            $this->end_delimiter = L\addRegexDelimiters($ini['end_delimiter']);
         } else {
             $this->end_delimiter = "";
         }
@@ -350,8 +352,8 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
         $pages = [];
         $page_count = 0;
         for ($i = 0; $i < $num; $i++) {
-            crawlTimeoutLog("..Still getting pages from archive iterator. At %s"
-                ." of %s", $i, $num);
+            L\crawlTimeoutLog("..Still getting pages from archive iterator. ".
+                "At %s of %s", $i, $num);
             $page = $this->nextPage($no_process);
             if (!$page) {
                 if ($this->checkFileHandle()) {
@@ -396,7 +398,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
         $matches = [];
         while((preg_match($this->delimiter, $this->buffer, $matches,
             PREG_OFFSET_CAPTURE)) != 1) {
-            crawlTimeoutLog("..still looking for a page in local buffer");
+            L\crawlTimeoutLog("..still looking for a page in local buffer");
             $block = $this->getFileBlock();
             if (!$block ||
                 !$this->checkFileHandle() || $this->checkEof()) {
@@ -766,7 +768,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
                blocks.
             */
             while(!is_string($block = $this->getFileBlock())) {
-                crawlTimeoutLog("..still getting next tags data..");
+                L\crawlTimeoutLog("..still getting next tags data..");
                 if ($this->checkEof())
                     return false;
             }
ViewGit