viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/controllers/FetchController.php b/controllers/FetchController.php index 1dee71449..7d1c6da3a 100755 --- a/controllers/FetchController.php +++ b/controllers/FetchController.php @@ -204,7 +204,7 @@ class FetchController extends Controller implements CrawlConstants $pages = false; if ($archive_iterator && !$archive_iterator->end_of_iterator) { if (L\generalIsA($archive_iterator, - "TextArchiveBundleIterator")) { + NS_ARCHIVE . "TextArchiveBundleIterator")) { $pages = $archive_iterator->nextChunk(); $chunk = true; } else { diff --git a/controllers/SearchController.php b/controllers/SearchController.php index 34444e86d..fb1664166 100755 --- a/controllers/SearchController.php +++ b/controllers/SearchController.php @@ -34,6 +34,7 @@ use seekquarry\yioop\library as L; use seekquarry\yioop\library\CrawlConstants; use seekquarry\yioop\library\PhraseParser; use seekquarry\yioop\library\FetchUrl; +use seekquarry\yioop\library\UrlParser; if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** Get the crawlHash function */ diff --git a/executables/Fetcher.php b/executables/Fetcher.php index 16d4a840e..6a874c380 100755 --- a/executables/Fetcher.php +++ b/executables/Fetcher.php @@ -1172,9 +1172,7 @@ class Fetcher implements CrawlConstants "&session=$session&robot_instance=".$prefix.ROBOT_INSTANCE. "&machine_uri=".WEB_URI."&crawl_time=".$this->crawl_time. "&check_crawl_time=".$this->check_crawl_time; - L\crawlLog($request); $response_string = FetchUrl::getPage($request, null, true); -L\crawlLog($response_string); if ($response_string === false) { L\crawlLog("The following request failed:"); L\crawlLog($request); diff --git a/library/WikiParser.php b/library/WikiParser.php index 1bc2f7f50..5ca0b70e4 100644 --- a/library/WikiParser.php +++ b/library/WikiParser.php @@ -70,7 +70,7 @@ class WikiParser implements CrawlConstants $minimal = false) { $esc = $this->esc; - $not_braces = '(?:[^\}]|\}[^\}])*'; + $not_braces = '[^\}\{]*'; $not_paragraph = '(?:\A|[^\n]|[^\n]\n)'; $class_or_id = '0-9a-zA-Z\_\-\s'; $this->minimal = $minimal; @@ -94,11 +94,11 @@ class WikiParser implements CrawlConstants ["/\n*?{{\s*Related articles\s*\|(.+?)\|(.+?)}}/si", "$esc<div class='indent'>\n\n(<a href=\"". $base_address . "$1\">$1?</a>)\n\n$esc</div>"], - ['/{{Hatnote\|'.$not_braces.'}}/', "($1)"], + ['/{{Hatnote\|('.$not_braces.')}}/', "($1)"], ["/{{lang[\||\-](.+?)\|(.+?)}}/si", "$1 → $2"], ["/{{convert\|(.+?)\|(.+?)\|(.+?)}}/si", "$1$2"], ["/{{IPA-(.+?)\|(.+?)}}/si", "(IPA $2)"], - ["/{{dablink\|'.$not_braces.'}}/", "($1)"], + ["/{{dablink\|('.$not_braces.')}}/", "($1)"], ]; $minimal_substitutions = [ ['/\[\[(http[^\s\|\]]+)\|([^\[\]]+?)\]\]/s', @@ -344,7 +344,7 @@ class WikiParser implements CrawlConstants $document = $this->cleanLinksAndParagraphs($document); } else { if (strlen($document) > 0.9 * MAX_GROUP_PAGE_LEN) { - $document = substr($document, 0, 0.9*MAX_GROUP_PAGE_LEN); + $document = substr($document, 0, 0.9 * MAX_GROUP_PAGE_LEN); } $document = $this->processRegexes($document); $document = $this->cleanLinksAndParagraphs($document); @@ -400,8 +400,7 @@ class WikiParser implements CrawlConstants public function processProvidedRegexes($matches, $replaces, $document) { if (strlen($document) < MAX_GROUP_PAGE_LEN) { - $document = preg_replace($matches, - $replaces, $document); + $document = preg_replace($matches, $replaces, $document); } else { $num_matches = count($matches); for ($i = 0; $i < $num_matches; $i++) { @@ -569,7 +568,10 @@ class WikiParser implements CrawlConstants $ref_data["author$i"]. "\">".$ref_data["author$i"]."</a>"; } - $ref_data["author"] .= " and " . $ref_data["author$i"]; + if(isset($ref_data["author"])) { + $ref_data["author"] .= + " and " . $ref_data["author$i"]; + } } if (!isset($ref_data['title']) && isset($ref_data['url'])) { $ref_data['title'] = $ref_data['url']; diff --git a/library/archive_bundle_iterators/DatabaseBundleIterator.php b/library/archive_bundle_iterators/DatabaseBundleIterator.php index 814696e9d..46ec6b947 100644 --- a/library/archive_bundle_iterators/DatabaseBundleIterator.php +++ b/library/archive_bundle_iterators/DatabaseBundleIterator.php @@ -192,8 +192,8 @@ class DatabaseBundleIterator extends ArchiveBundleIterator $result = $db->execute($query); $i = 0; while($row = $db->fetchArray($result)) { - crawlTimeoutLog("..Still getting pages from archive iterator. At %s" - ." of %s", $i, $num); + L\crawlTimeoutLog("..Still getting pages from archive iterator. ". + "At %s of %s", $i, $num); $page = ""; foreach ($row as $key => $value) { $page .= "$key{$this->field_value_separator}". diff --git a/library/archive_bundle_iterators/TextArchiveBundleIterator.php b/library/archive_bundle_iterators/TextArchiveBundleIterator.php index 705564c29..d6ef2c88e 100644 --- a/library/archive_bundle_iterators/TextArchiveBundleIterator.php +++ b/library/archive_bundle_iterators/TextArchiveBundleIterator.php @@ -188,7 +188,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator mkdir($result_dir); } $this->partitions = []; - if ($this->iterate_dir != false) { // false =network/fetcher iterator + if ($this->iterate_dir != false) { // false = network/fetcher iterator if ($ini == []) { $ini = L\parse_ini_with_fallback( "{$this->iterate_dir}/arc_description.ini"); @@ -198,7 +198,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator $this->setIniInfo($ini); if ($this->start_delimiter == "" && $this->end_delimiter == "" && $this->iterate_dir != false) { - crawlLog("At least one of start or end delimiter must be set!!"); + L\crawlLog("At least one of start or end delimiter must be set!!"); exit(); } if ($this->iterate_dir != false) { @@ -394,7 +394,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator */ public function nextPage($no_process = false) { - if (!$this->checkFileHandle()) return null; + if (!$this->checkFileHandle()) { return null; } $matches = []; while((preg_match($this->delimiter, $this->buffer, $matches, PREG_OFFSET_CAPTURE)) != 1) { @@ -426,7 +426,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator } if ($no_process == true) {return $page; } $site = []; - $site[self::HEADER] = "text_archive_bundle_iterator extractor"; + $site[self::HEADER] = "TextArchiveBundleIterator extractor"; $site[self::IP_ADDRESSES] = ["0.0.0.0"]; $site[self::TIMESTAMP] = date("U", time()); $site[self::TYPE] = "text/plain"; diff --git a/library/archive_bundle_iterators/WebArchiveBundleIterator.php b/library/archive_bundle_iterators/WebArchiveBundleIterator.php index f55d107bb..4730665a8 100644 --- a/library/archive_bundle_iterators/WebArchiveBundleIterator.php +++ b/library/archive_bundle_iterators/WebArchiveBundleIterator.php @@ -186,8 +186,8 @@ class WebArchiveBundleIterator extends ArchiveBundleIterator $num_to_get = 1; $objects = []; for ($i = 0; $i < $num; $i += $num_to_get) { - crawlTimeoutLog("..Still getting pages from archive iterator. At %s" - ." of %s", $i, $num); + L\crawlTimeoutLog("..Still getting pages from archive iterator. ". + "At %s of %s", $i, $num); $num_to_get = min($num, $this->partition->count - $this->partition_index); $pre_new_objects = $this->partition->nextObjects($num_to_get); @@ -206,7 +206,6 @@ class WebArchiveBundleIterator extends ArchiveBundleIterator } $this->end_of_iterator = ($this->overall_index >= $this->count ) ? true : false; - $this->saveCheckpoint(); return $objects; }