Get word filter plugin working after namespace changes, a=chris

Chris Pollett [2015-07-03]
Filename
index.php
library/archive_bundle_iterators/MediaWikiArchiveBundleIterator.php
library/indexing_plugins/IndexingPlugin.php
library/indexing_plugins/WordfilterPlugin.php
diff --git a/index.php b/index.php
index 79dfe0c76..f03813ea3 100755
--- a/index.php
+++ b/index.php
@@ -37,11 +37,12 @@ namespace seekquarry\yioop;

 use seekquarry\yioop\library as L;

+
+$total_time = microtime();
+$pathinfo = pathinfo($_SERVER['SCRIPT_FILENAME']);
 /** Calculate base directory of script
  * @ignore
  */
-$total_time = microtime();
-$pathinfo = pathinfo($_SERVER['SCRIPT_FILENAME']);
 define("BASE_DIR", $pathinfo["dirname"].'/');
 $pathinfo = pathinfo($_SERVER['SCRIPT_NAME']);
 $http = (isset($_SERVER['HTTPS']) && $_SERVER['HTTPS']) ?
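
The index.php hunk above just moves the $total_time and $pathinfo lines so the
"Calculate base directory" docblock sits directly on the define() it documents.
The "use seekquarry\yioop\library as L;" alias it preserves is what the rest of
this commit leans on. A minimal, self-contained sketch of that aliasing pattern
(the demo\* namespaces are hypothetical, chosen only for illustration):

<?php
namespace demo\library;

// Stand-in for the library's crawlLog(); the real one writes to a log file.
function crawlLog($msg)
{
    echo $msg, "\n";
}

namespace demo\indexing_plugins;

use demo\library as L;

// After the namespace changes a bare crawlLog() call would resolve to
// demo\indexing_plugins\crawlLog(), which does not exist; qualifying the
// call through the alias is exactly the WordfilterPlugin fix further below.
L\crawlLog("  Word filter plugin examining page..");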
diff --git a/library/archive_bundle_iterators/MediaWikiArchiveBundleIterator.php b/library/archive_bundle_iterators/MediaWikiArchiveBundleIterator.php
new file mode 100644
index 000000000..cb2665636
--- /dev/null
+++ b/library/archive_bundle_iterators/MediaWikiArchiveBundleIterator.php
@@ -0,0 +1,294 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2015
+ * @filesource
+ */
+namespace seekquarry\yioop\library\archive_bundle_iterators;
+
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\BZip2BlockIterator;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\FetchUrl;
+use seekquarry\yioop\library\WikiParser;
+
+if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+/**
+ * Used to define the styles we put on cache wiki pages
+ */
+define('WIKI_PAGE_STYLES', <<<EOD
+<style type="text/css">
+table.wikitable
+{
+    background:white;
+    border:1px #aaa solid;
+    border-collapse: collapse;
+    margin:1em 0;
+}
+table.wikitable > tr > th,table.wikitable > tr > td,
+table.wikitable > * > tr > th,table.wikitable > * > tr > td
+{
+    border:1px #aaa solid;
+    padding:0.2em;
+}
+table.wikitable > tr > th,
+table.wikitable > * > tr > th
+{
+    text-align:center;
+    background:white;
+    font-weight:bold
+}
+table.wikitable > caption
+{
+    font-weight:bold;
+}
+</style>
+EOD
+);
+/**
+ * Used to iterate through a collection of .xml.bz2 media wiki files
+ * stored in a WebArchiveBundle folder. These media wiki files contain the
+ * kinds of documents used by Wikipedia. Iteration is done for the purpose
+ * of making an index of these records.
+ *
+ * @author Chris Pollett
+ * @package seek_quarry\library\archive_bundle_iterator
+ * @see WebArchiveBundle
+ */
+class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
+    implements CrawlConstants
+{
+    /**
+     * Used to hold a WikiParser object that will be used for parsing
+     * @var object
+     */
+    public $parser;
+    /**
+     * Creates a media wiki archive iterator with the given parameters.
+     *
+     * @param string $iterate_timestamp timestamp of the arc archive bundle to
+     *     iterate over the pages of
+     * @param string $iterate_dir folder of files to iterate over
+     * @param string $result_timestamp timestamp of the arc archive bundle
+     *     results are being stored in
+     * @param string $result_dir where to write last position checkpoints to
+     */
+    public function __construct($iterate_timestamp, $iterate_dir,
+            $result_timestamp, $result_dir)
+    {
+        $ini = [ 'compression' => 'bzip2',
+            'file_extension' => 'bz2',
+            'encoding' => 'UTF-8',
+            'start_delimiter' => '@page@'];
+        parent::__construct($iterate_timestamp, $iterate_dir,
+            $result_timestamp, $result_dir, $ini);
+        $this->switch_partition_callback_name = "readMediaWikiHeader";
+    }
+    /**
+     * Estimates the importance of the site according to the weighting of
+     * the particular archive iterator
+     * @param array $site an associative array containing info about a web page
+     * @return int a 4-bit number based on the log_2 size - 10 of the wiki
+     *     entry (@see nextPage).
+     */
+    public function weight(&$site)
+    {
+        return min($site[self::WEIGHT], 15);
+    }
+    /**
+     * Reads the siteinfo tag of the mediawiki xml file and extracts data that
+     * will be used in constructing page summaries.
+     */
+    public function readMediaWikiHeader()
+    {
+        $this->header = [];
+        $site_info = $this->getNextTagData("siteinfo");
+        $found_lang =
+            preg_match('/lang\=\"(.*)\"/', $this->remainder, $matches);
+        if ($found_lang) {
+            $this->header['lang'] = $matches[1];
+        }
+        if ($site_info === false) {
+            $this->bz2_iterator = null;
+            return false;
+        }
+        $dom = new \DOMDocument();
+        @$dom->loadXML($site_info);
+        $this->header['sitename'] = $this->getTextContent($dom,
+            "/siteinfo/sitename");
+        $pre_host_name =
+            $this->getTextContent($dom, "/siteinfo/base");
+        $this->header['base_address'] = substr($pre_host_name, 0,
+            strrpos($pre_host_name, "/") + 1);
+        $url_parts = @parse_url($this->header['base_address']);
+        $this->header['ip_address'] = gethostbyname($url_parts['host']);
+        return true;
+    }
+    /**
+     * Used to initialize the arrays of match/replacement pairs used to format
+     * wikimedia syntax into HTML (not perfectly, since we are only doing
+     * regexes)
+     *
+     * @param string $base_address base url for link substitutions
+     */
+    public function initializeSubstitutions($base_address)
+    {
+        $add_substitutions = [
+            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
+            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
+            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
+            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
+            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
+            ['/{{([^}]*)({{([^{}]*)}})/', '{{$1$3' ],
+            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
+            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
+            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
+            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
+            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
+            ['/\[\[([^\]]*)(\[\[([^\[\]]*)\]\])/', "[[$1$3"],
+            ["/\[\[Image:(.+?)(right\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
+            ["/\[\[Image:(.+?)(left\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
+            ["/\[\[Image:(.+?)(\|left)\]\]/s", "[[Image:$1]]"],
+            ["/\[\[Image:(.+?)(\|right)\]\]/s", "[[Image:$1]]"],
+            ["/\[\[Image:(.+?)(thumb\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
+            ["/\[\[Image:(.+?)(live\|)(.+?)\]\]/s", "[[Image:$1$3]]"],
+            ["/\[\[Image:(.+?)(\s*\d*\s*(px|in|cm|".
+                "pt|ex|em)\s*\|)(.+?)\]\]/s","[[Image:$1$4]]"],
+            ["/\[\[Image:([^\|]+?)\]\]/s",
+                "(<a href=\"{$base_address}File:$1\" >Image:$1</a>)"],
+            ["/\[\[Image:(.+?)\|(.+?)\]\]/s",
+                "(<a href=\"{$base_address}File:$1\">Image:$2</a>)"],
+            ["/\[\[File:(.+?)\|(right\|)?thumb(.+?)\]\]/s",
+                "(<a href=\"{$base_address}File:$1\">Image:$1</a>)"],
+            ["/{{Redirect2?\|([^{}\|]+)\|([^{}\|]+)\|([^{}\|]+)}}/i",
+                "<div class='indent'>\"$1\". ($2 &rarr;<a href=\"".
+                $base_address."$3\">$3</a>)</div>"],
+            ["/{{Redirect\|([^{}\|]+)}}/i",
+                "<div class='indent'>\"$1\". (<a href=\"".
+                $base_address. "$1_(disambiguation)\">$1???</a>)</div>"],
+            ["/#REDIRECT:\s+\[\[(.+?)\]\]/",
+                "<a href='{$base_address}$1'>$1</a>"],
+            ["/{{pp-(.+?)}}/s", ""],
+            ["/{{bot(.*?)}}/si", ""],
+            ['/{{Infobox.*?\n}}/si', ""],
+            ['/{{Clear.*?\n}}/si', ""],
+            ["/{{clarify\|(.+?)}}/si", ""],
+            ['/{{[^}]*}}/s', ""],
+        ];
+        $this->parser = new WikiParser($base_address, $add_substitutions);
+    }
+    /**
+     * Restores the internal state from the file iterate_status.txt in the
+     * result dir such that the next call to nextPages will pick up from just
+     * after the last checkpoint. We also set up our regex substitutions again.
+     *
+     * @return array the data serialized when saveCheckpoint was called
+     */
+    public function restoreCheckPoint()
+    {
+        $info = parent::restoreCheckPoint();
+        if (!$this->iterate_dir) { // do on client not name server
+            $this->initializeSubstitutions($this->header['base_address']);
+        }
+        return $info;
+    }
+    /**
+     * Gets the text content of the first dom node satisfying the
+     * xpath expression $path in the dom document $dom
+     *
+     * @param object $dom DOMDocument to get the text from
+     * @param string $path xpath expression to find node with text
+     *
+     * @return string text content of the given node if it exists
+     */
+    public function getTextContent($dom, $path)
+    {
+        $xpath = new \DOMXPath($dom);
+        $objects = $xpath->evaluate($path);
+        if ($objects && is_object($objects) && $objects->item(0) != null) {
+            return $objects->item(0)->textContent;
+        }
+        return "";
+    }
+    /**
+     * Gets the next doc from the iterator
+     * @param bool $no_process do not do any processing on page data
+     * @return array associative array for doc or string if no_process true
+     */
+    public function nextPage($no_process = false)
+    {
+        static $minimal_regexes = false;
+        static $first_call = true;
+        if ($first_call) {
+            if (!isset($this->header['base_address'])) {
+                $this->header['base_address'] = "";
+            }
+            $this->initializeSubstitutions($this->header['base_address']);
+        }
+        $page_info = $this->getNextTagData("page");
+        if ($no_process) { return $page_info; }
+        $dom = new \DOMDocument();
+        @$dom->loadXML($page_info);
+        $site = [];
+        $pre_url = $this->getTextContent($dom, "/page/title");
+        $pre_url = str_replace(" ", "_", $pre_url);
+        $site[self::URL] = $this->header['base_address'].$pre_url;
+        $site[self::IP_ADDRESSES] = [$this->header['ip_address']];
+        $pre_timestamp = $this->getTextContent($dom,
+            "/page/revision/timestamp");
+        $site[self::MODIFIED] = date("U", strtotime($pre_timestamp));
+        $site[self::TIMESTAMP] = time();
+        $site[self::TYPE] = "text/html";
+        $site[self::HEADER] = "MediawikiBundleIterator extractor";
+        $site[self::HTTP_CODE] = 200;
+        $site[self::ENCODING] = "UTF-8";
+        $site[self::SERVER] = "unknown";
+        $site[self::SERVER_VERSION] = "unknown";
+        $site[self::OPERATING_SYSTEM] = "unknown";
+        $site[self::PAGE] = "<html lang='".$this->header['lang']."' >\n".
+            "<head><title>$pre_url</title>\n".
+            WIKI_PAGE_STYLES . "\n</head>\n".
+            "<body><h1>$pre_url</h1>\n";
+        $pre_page = $this->getTextContent($dom, "/page/revision/text");
+        $current_hash = L\crawlHash($pre_page);
+        if ($first_call) {
+            $this->saveCheckPoint(); //ensure we remember to advance one on fail
+            $first_call = false;
+        }
+        $pre_page = $this->parser->parse($pre_page, false, true);
+        $pre_page = preg_replace("/{{Other uses}}/i",
+                "<div class='indent'>\"$1\". (<a href='".
+                $site[self::URL]. "_(disambiguation)'>$pre_url</a>)</div>",
+                $pre_page);
+        $site[self::PAGE] .= $pre_page;
+        $site[self::PAGE] .= "\n</body>\n</html>";
+        $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
+        $site[self::WEIGHT] = ceil(max(
+            log(strlen($site[self::PAGE]) + 1, 2) - 10, 1));
+        return $site;
+    }
+}
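
One idiom worth noting in the new iterator: the substitution table repeats the
pattern '/{{([^}]*)({{([^{}]*)}})/' six times (and its [[...]] analogue six
times) because each application splices one level of nested wiki template into
its parent, so k copies flatten up to k levels of nesting before the final
'/{{[^}]*}}/s' rule deletes whatever template text remains. A self-contained
sketch of the effect:

<?php
// Each pass merges an innermost {{...}} into its enclosing template.
$wiki = "{{outer|a={{inner|b={{innermost}}}}}}";
for ($i = 0; $i < 6; $i++) {
    $wiki = preg_replace('/{{([^}]*)({{([^{}]*)}})/', '{{$1$3', $wiki);
}
echo $wiki, "\n"; // {{outer|a=inner|b=innermost}}
// The final rule, '/{{[^}]*}}/s', would then delete this flattened remnant.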
diff --git a/library/indexing_plugins/IndexingPlugin.php b/library/indexing_plugins/IndexingPlugin.php
index 41ee47c73..5cad80e17 100644
--- a/library/indexing_plugins/IndexingPlugin.php
+++ b/library/indexing_plugins/IndexingPlugin.php
@@ -38,6 +38,20 @@ if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  * models)
  */
 define("POST_PROCESSING", true);
+/** import the tl function into the indexing_plugins namespace */
+function tl()
+{
+    return call_user_func_array(NS_LIB . "tl", func_get_args());
+}
+/**
+ * shorthand for echo
+ *
+ * @param string $text string to send to the current output
+ */
+function e($text)
+{
+    echo $text;
+}
 /**
  * Base indexing plugin Class. An indexing plugin allows a developer
  * to do additional processing on web pages during a crawl, then after
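
The tl() shim added above assumes NS_LIB holds the library namespace prefix as
a string, so call_user_func_array() can forward every argument unchanged to
the namespaced translation function of the same name. A self-contained sketch
of the forwarding pattern (all demo names are hypothetical):

<?php
namespace demo\library;

// Stand-in for the library's tl() translation function.
function tl($string_id)
{
    return "translated:" . $string_id;
}

namespace demo\indexing_plugins;

// Plays the role of NS_LIB: the library namespace prefix as a string.
const DEMO_NS_LIB = "demo\\library\\";

function tl()
{
    // Hand every argument through to the namespaced implementation.
    return call_user_func_array(DEMO_NS_LIB . "tl", func_get_args());
}

echo tl("example_string_id"), "\n"; // translated:example_string_id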
diff --git a/library/indexing_plugins/WordfilterPlugin.php b/library/indexing_plugins/WordfilterPlugin.php
index f88a6311d..f60b55c11 100644
--- a/library/indexing_plugins/WordfilterPlugin.php
+++ b/library/indexing_plugins/WordfilterPlugin.php
@@ -222,6 +222,7 @@ EOD;
      */
     public function pageSummaryProcessing(&$summary, $url)
     {
+        L\crawlLog("  Word filter plugin examining page..");
         $sites = array_keys($this->filter_rules);
         $filter_rules = $this->filter_rules;
         $rules = ($filter_rules['default'])?$filter_rules['default'] : [];
@@ -247,7 +248,7 @@ EOD;
                 $summary[self::TITLE], $summary[self::DESCRIPTION]);
             if ($filter_flag) {
                 if (in_array("NOPROCESS", $actions)) {
-                    crawlLog("  Word filter plugin removed page.");
+                    L\crawlLog("  Word filter plugin removed page.");
                     $summary = false;
                     break;
                 } else {
@@ -390,8 +391,8 @@ EOD;
      */
     public function parseRules()
     {
-        $rule_blocks_regex = "/\n*\s*\[(.*)\]\s*\n+/";
-        $blocks = preg_split($rule_blocks_regex, $this->rules_string, -1,
+        $rule_blocks_regex = "/\n\s*\[(.*)\]\s*\n+/";
+        $blocks = preg_split($rule_blocks_regex, "\n".$this->rules_string, -1,
             PREG_SPLIT_DELIM_CAPTURE);
         $num_blocks = count($blocks);
         $block_name = "default";
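
The parseRules() change above tightens block detection: the old pattern
"/\n*\s*\[(.*)\]\s*\n+/" could match zero newlines and so treat a bracketed
term in the middle of a rule line as a section header, while the new pattern
requires a literal newline before "[", and prepending "\n" to the rules string
keeps a header on the very first line matching. A small sketch of the
resulting split (the example rules are hypothetical):

<?php
$rules_string = "[default]\nbadword;NOPROCESS\n" .
    "[http://example.com/]\nspam;NOINDEX\n";
$rule_blocks_regex = "/\n\s*\[(.*)\]\s*\n+/";
$blocks = preg_split($rule_blocks_regex, "\n" . $rules_string, -1,
    PREG_SPLIT_DELIM_CAPTURE);
print_r($blocks);
/* Produces ["", "default", "badword;NOPROCESS",
   "http://example.com/", "spam;NOINDEX\n"]:
   captured block names interleaved with their rule bodies. */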