viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Gets MediaWiki crawls working after switch to namespaces, a=chris

Chris Pollett [2015-07-03]
Gets MediaWiki crawls working after switch to namespaces, a=chris
Filename
controllers/FetchController.php
controllers/SearchController.php
executables/Fetcher.php
library/WikiParser.php
library/archive_bundle_iterators/DatabaseBundleIterator.php
library/archive_bundle_iterators/TextArchiveBundleIterator.php
library/archive_bundle_iterators/WebArchiveBundleIterator.php
diff --git a/controllers/FetchController.php b/controllers/FetchController.php
index 1dee71449..7d1c6da3a 100755
--- a/controllers/FetchController.php
+++ b/controllers/FetchController.php
@@ -204,7 +204,7 @@ class FetchController extends Controller implements CrawlConstants
             $pages = false;
             if ($archive_iterator && !$archive_iterator->end_of_iterator) {
                 if (L\generalIsA($archive_iterator,
-                    "TextArchiveBundleIterator")) {
+                    NS_ARCHIVE . "TextArchiveBundleIterator")) {
                     $pages = $archive_iterator->nextChunk();
                     $chunk = true;
                 } else {
diff --git a/controllers/SearchController.php b/controllers/SearchController.php
index 34444e86d..fb1664166 100755
--- a/controllers/SearchController.php
+++ b/controllers/SearchController.php
@@ -34,6 +34,7 @@ use seekquarry\yioop\library as L;
 use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\FetchUrl;
+use seekquarry\yioop\library\UrlParser;

 if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 /** Get the crawlHash function */
diff --git a/executables/Fetcher.php b/executables/Fetcher.php
index 16d4a840e..6a874c380 100755
--- a/executables/Fetcher.php
+++ b/executables/Fetcher.php
@@ -1172,9 +1172,7 @@ class Fetcher implements CrawlConstants
             "&session=$session&robot_instance=".$prefix.ROBOT_INSTANCE.
             "&machine_uri=".WEB_URI."&crawl_time=".$this->crawl_time.
             "&check_crawl_time=".$this->check_crawl_time;
-        L\crawlLog($request);
         $response_string = FetchUrl::getPage($request, null, true);
-L\crawlLog($response_string);
         if ($response_string === false) {
             L\crawlLog("The following request failed:");
             L\crawlLog($request);
diff --git a/library/WikiParser.php b/library/WikiParser.php
index 1bc2f7f50..5ca0b70e4 100644
--- a/library/WikiParser.php
+++ b/library/WikiParser.php
@@ -70,7 +70,7 @@ class WikiParser implements CrawlConstants
         $minimal = false)
     {
         $esc = $this->esc;
-        $not_braces = '(?:[^\}]|\}[^\}])*';
+        $not_braces = '[^\}\{]*';
         $not_paragraph = '(?:\A|[^\n]|[^\n]\n)';
         $class_or_id = '0-9a-zA-Z\_\-\s';
         $this->minimal = $minimal;
@@ -94,11 +94,11 @@ class WikiParser implements CrawlConstants
             ["/\n*?{{\s*Related articles\s*\|(.+?)\|(.+?)}}/si",
                 "$esc<div class='indent'>\n\n(<a href=\"".
                 $base_address . "$1\">$1?</a>)\n\n$esc</div>"],
-            ['/{{Hatnote\|'.$not_braces.'}}/', "($1)"],
+            ['/{{Hatnote\|('.$not_braces.')}}/', "($1)"],
             ["/{{lang[\||\-](.+?)\|(.+?)}}/si", "$1 &rarr; $2"],
             ["/{{convert\|(.+?)\|(.+?)\|(.+?)}}/si", "$1$2"],
             ["/{{IPA-(.+?)\|(.+?)}}/si", "(IPA $2)"],
-            ["/{{dablink\|'.$not_braces.'}}/", "($1)"],
+            ['/{{dablink\|('.$not_braces.')}}/', "($1)"],
         ];
         $minimal_substitutions = [
             ['/\[\[(http[^\s\|\]]+)\|([^\[\]]+?)\]\]/s',
@@ -344,7 +344,7 @@ class WikiParser implements CrawlConstants
             $document = $this->cleanLinksAndParagraphs($document);
         } else {
             if (strlen($document) > 0.9 * MAX_GROUP_PAGE_LEN) {
-                $document = substr($document, 0, 0.9*MAX_GROUP_PAGE_LEN);
+                $document = substr($document, 0, 0.9 * MAX_GROUP_PAGE_LEN);
             }
             $document = $this->processRegexes($document);
             $document = $this->cleanLinksAndParagraphs($document);
@@ -400,8 +400,7 @@ class WikiParser implements CrawlConstants
     public function processProvidedRegexes($matches, $replaces, $document)
     {
         if (strlen($document) < MAX_GROUP_PAGE_LEN) {
-            $document = preg_replace($matches,
-                $replaces, $document);
+            $document = preg_replace($matches, $replaces, $document);
         } else {
             $num_matches = count($matches);
             for ($i = 0; $i < $num_matches; $i++) {
@@ -569,7 +568,10 @@ class WikiParser implements CrawlConstants
                                 $ref_data["author$i"].
                                 "\">".$ref_data["author$i"]."</a>";
                         }
-                        $ref_data["author"] .= " and " . $ref_data["author$i"];
+                        if(isset($ref_data["author"])) {
+                            $ref_data["author"] .=
+                                " and " . $ref_data["author$i"];
+                        }
                     }
                     if (!isset($ref_data['title']) && isset($ref_data['url'])) {
                         $ref_data['title'] = $ref_data['url'];
diff --git a/library/archive_bundle_iterators/DatabaseBundleIterator.php b/library/archive_bundle_iterators/DatabaseBundleIterator.php
index 814696e9d..46ec6b947 100644
--- a/library/archive_bundle_iterators/DatabaseBundleIterator.php
+++ b/library/archive_bundle_iterators/DatabaseBundleIterator.php
@@ -192,8 +192,8 @@ class DatabaseBundleIterator extends ArchiveBundleIterator
         $result = $db->execute($query);
         $i = 0;
         while($row = $db->fetchArray($result)) {
-            crawlTimeoutLog("..Still getting pages from archive iterator. At %s"
-                ." of %s", $i, $num);
+            L\crawlTimeoutLog("..Still getting pages from archive iterator. ".
+                "At %s of %s", $i, $num);
             $page = "";
             foreach ($row as $key => $value) {
                 $page .= "$key{$this->field_value_separator}".
diff --git a/library/archive_bundle_iterators/TextArchiveBundleIterator.php b/library/archive_bundle_iterators/TextArchiveBundleIterator.php
index 705564c29..d6ef2c88e 100644
--- a/library/archive_bundle_iterators/TextArchiveBundleIterator.php
+++ b/library/archive_bundle_iterators/TextArchiveBundleIterator.php
@@ -188,7 +188,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
             mkdir($result_dir);
         }
         $this->partitions = [];
-        if ($this->iterate_dir != false) { // false =network/fetcher iterator
+        if ($this->iterate_dir != false) { // false = network/fetcher iterator
             if ($ini == []) {
                 $ini = L\parse_ini_with_fallback(
                     "{$this->iterate_dir}/arc_description.ini");
@@ -198,7 +198,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
         $this->setIniInfo($ini);
         if ($this->start_delimiter == "" && $this->end_delimiter == "" &&
             $this->iterate_dir != false) {
-            crawlLog("At least one of start or end delimiter must be set!!");
+            L\crawlLog("At least one of start or end delimiter must be set!!");
             exit();
         }
         if ($this->iterate_dir != false) {
@@ -394,7 +394,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
      */
     public function nextPage($no_process = false)
     {
-        if (!$this->checkFileHandle()) return null;
+        if (!$this->checkFileHandle()) { return null; }
         $matches = [];
         while((preg_match($this->delimiter, $this->buffer, $matches,
             PREG_OFFSET_CAPTURE)) != 1) {
@@ -426,7 +426,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
         }
         if ($no_process == true) {return $page; }
         $site = [];
-        $site[self::HEADER] = "text_archive_bundle_iterator extractor";
+        $site[self::HEADER] = "TextArchiveBundleIterator extractor";
         $site[self::IP_ADDRESSES] = ["0.0.0.0"];
         $site[self::TIMESTAMP] = date("U", time());
         $site[self::TYPE] = "text/plain";
diff --git a/library/archive_bundle_iterators/WebArchiveBundleIterator.php b/library/archive_bundle_iterators/WebArchiveBundleIterator.php
index f55d107bb..4730665a8 100644
--- a/library/archive_bundle_iterators/WebArchiveBundleIterator.php
+++ b/library/archive_bundle_iterators/WebArchiveBundleIterator.php
@@ -186,8 +186,8 @@ class WebArchiveBundleIterator extends ArchiveBundleIterator
         $num_to_get = 1;
         $objects = [];
         for ($i = 0; $i < $num; $i += $num_to_get) {
-            crawlTimeoutLog("..Still getting pages from archive iterator. At %s"
-                ." of %s", $i, $num);
+            L\crawlTimeoutLog("..Still getting pages from archive iterator. ".
+                "At %s of %s", $i, $num);
             $num_to_get = min($num, $this->partition->count -
                 $this->partition_index);
             $pre_new_objects = $this->partition->nextObjects($num_to_get);
@@ -206,7 +206,6 @@ class WebArchiveBundleIterator extends ArchiveBundleIterator
         }
         $this->end_of_iterator = ($this->overall_index >= $this->count ) ?
             true : false;
-
         $this->saveCheckpoint();
         return $objects;
     }
ViewGit