viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php index f0262d046..1e8704284 100644 --- a/src/configs/TokenTool.php +++ b/src/configs/TokenTool.php @@ -461,7 +461,7 @@ function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file, -strlen("</title>")); $query = mb_strtolower(trim($title)); $wiki_title = trim(str_replace(" ", "_", $title)); - $underscore_title = mb_strtolower(trim(str_replace(" ", "_", $title))); + $underscore_title = mb_strtolower($wiki_title); if (isset($title_counts[$underscore_title])) { $text_offset = getTagOffsetPage($page, "text", 0); if (!is_array($text_offset) || diff --git a/src/controllers/AdminController.php b/src/controllers/AdminController.php index 003dcfb64..8d5051a8d 100755 --- a/src/controllers/AdminController.php +++ b/src/controllers/AdminController.php @@ -506,8 +506,8 @@ class AdminController extends Controller implements CrawlConstants $script_array = ['SIDE_ADSCRIPT', 'TOP_ADSCRIPT', 'GLOBAL_ADSCRIPT']; foreach ($script_array as $value) { if (isset($_REQUEST[$value])) { - $_REQUEST[$value] = str_replace("(","(",$_REQUEST[$value]); - $_REQUEST[$value] = str_replace(")",")",$_REQUEST[$value]); + $_REQUEST[$value] = strtr($_REQUEST[$value], ["(" => "(", + ")" => ")"]); } } $color_fields = ['BACKGROUND_COLOR', 'FOREGROUND_COLOR', diff --git a/src/controllers/Controller.php b/src/controllers/Controller.php index 826d0b7bb..1aabd21ce 100755 --- a/src/controllers/Controller.php +++ b/src/controllers/Controller.php @@ -773,11 +773,8 @@ abstract class Controller break; case "file_name": if (isset($value)) { - $value = str_replace("&", "&", $value); - $value = str_replace("\\", "/", $value); - $value = str_replace("*", "-", $value); - $clean_value = str_replace(":", "-", $value); - $clean_value = str_replace("..", "-", $value); + $clean_value = strtr($value, ["&" => "&", "\\" => "/", + "*" => "-", ":" => "-", ".." => "-"]); } else { $clean_value = $default; } diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php index ea9cc910f..8084142fb 100755 --- a/src/controllers/FetchController.php +++ b/src/controllers/FetchController.php @@ -497,8 +497,7 @@ class FetchController extends Controller implements CrawlConstants unlink($filename); } $logging = "... Data upload complete\n"; - $address = str_replace(".", "-", L\remoteAddress()); - $address = str_replace(":", "_", $address); + $address = strtr(L\remoteAddress(), ["." => "-", ":" => "_"]); $time = time(); $day = floor($time/C\ONE_DAY); $byte_counts = []; @@ -556,8 +555,7 @@ class FetchController extends Controller implements CrawlConstants $crawl_time = substr($this->clean($_REQUEST['crawl_time'], "int"), 0, C\TIMESTAMP_LEN); $dir = C\CRAWL_DIR . "/schedules/" . $schedule_name . $crawl_time; - $address = str_replace(".", "-", L\remoteAddress()); - $address = str_replace(":", "_", $address); + $address = strtr(L\remoteAddress(), ["." => "-", ":" => "_"]); $time = time(); $day = floor($time/C\ONE_DAY); if (!file_exists($dir)) { diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index 9ecee9dd2..91660ebff 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -2051,11 +2051,8 @@ EOD; $pattern = "/(\b)($meta_word(\S)+)/"; $terms = preg_replace($pattern, "", $terms); } - $terms = str_replace("'", " ", $terms); - $terms = str_replace('"', " ", $terms); - $terms = str_replace('\\', " ", $terms); - $terms = str_replace('|', " ", $terms); - $terms = $this->clean($terms, "string"); + $terms = $this->clean(strtr($terms, ["'" => " ", '"' => " ", + '\\' => " ", '|' => " "]), "string"); $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $terms); $words = mb_split(" ", $phrase_string); if (!in_array("highlight", $ui_flags)) { diff --git a/src/controllers/components/SocialComponent.php b/src/controllers/components/SocialComponent.php index c236cfc88..2bb059282 100644 --- a/src/controllers/components/SocialComponent.php +++ b/src/controllers/components/SocialComponent.php @@ -2050,11 +2050,11 @@ class SocialComponent extends Component implements CrawlConstants $recent_found = false; $time = time(); $j = 0; - $parser = new WikiParser("", [], true, false); + $parser = new WikiParser("", [], true); $locale_tag = L\getLocaleTag(); $page = false; $math = false; - $csrf_token = C\CSRF_TOKEN . "=" .$this->parent->generateCSRFToken( + $csrf_token = C\CSRF_TOKEN . "=" . $this->parent->generateCSRFToken( $user_id); foreach ($group_items as $item) { $page = $item; diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 26f1f957e..514efa1f5 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -662,6 +662,11 @@ class IndexShard extends PersistentStructure implements CrawlConstants } /** * + * @param int $start_offset + * @param int &$next_offset + * @param int $last_offset + * @param int $len + * @return array */ public function postingsSliceAscending($start_offset, &$next_offset, $last_offset, $len) @@ -701,7 +706,11 @@ class IndexShard extends PersistentStructure implements CrawlConstants return $results; } /** - * + * @param int $start_offset + * @param int &$next_offset + * @param int $last_offset + * @param int $len + * @return array */ public function postingsSliceDescending($start_offset, &$next_offset, $last_offset, $len) diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index a3bd507b0..667c8d514 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -560,7 +560,7 @@ class UrlParser return ""; } $last_path = $pre_path_parts[$count - 1]; - $path_parts = preg_split("/(_|-|\ |\+|\.)/", $last_path); + $path_parts = preg_split("/(_|-|\ |\+|\.)/", urldecode($last_path)); foreach ($path_parts as $part) { if (strlen($part) > 0 ) { $words[] = $part; diff --git a/src/library/Utility.php b/src/library/Utility.php index bc3796db9..d12ae90cc 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -1410,11 +1410,7 @@ function compareWordHashes($id1, $id2) */ function base64Hash($string) { - $hash = rtrim(base64_encode($string), "="); - $hash = str_replace("/", "_", $hash); - $hash = str_replace("+", "-" , $hash); - - return $hash; + return strtr(rtrim(base64_encode($string), "="), ["/" => "_", "+" => "-"]); } /** * Decodes a crawl hash number from base64 to raw ASCII @@ -1425,12 +1421,7 @@ function base64Hash($string) function unbase64Hash($base64) { //get rid of out modified base64 encoding - $hash = str_replace("_", "/", $base64); - $hash = str_replace("-", "+" , $hash); - $hash .= "="; - $raw = base64_decode($hash); - - return $raw; + return base64_decode(strtr($base64, ["_" => "/", "-" => "+"]) . "="); } /** * Encodes a string in a format suitable for post data @@ -1441,11 +1432,7 @@ function unbase64Hash($base64) */ function webencode($str) { - $str = base64_encode($str); - $str = str_replace("/", "_", $str); - $str = str_replace("=", "~", $str); - $str = str_replace("+", ".", $str); - return $str; + return strtr(base64_encode($str), ["/" => "_", "=" => "~", "+" => "."]); } /** * Decodes a string encoded by webencode @@ -1455,10 +1442,7 @@ function webencode($str) */ function webdecode($str) { - $str = str_replace(".", "+", $str); - $str = str_replace("~", "=", $str); - $str = str_replace("_", "/", $str); - return base64_decode($str); + return base64_decode(strtr($str, ["." => "+", "~" => "=", "_" => "/"])); } /** * The crawlHash function is used to encrypt passwords stored in the database. diff --git a/src/library/WikiParser.php b/src/library/WikiParser.php index 9801ae4f0..e5d619ced 100644 --- a/src/library/WikiParser.php +++ b/src/library/WikiParser.php @@ -401,10 +401,9 @@ class WikiParser implements CrawlConstants $this->replaces, $document); $i = 1; while(strpos($document, "@@~$i") !== false) { - $document = str_replace("@@~$i", "{{", $document); - $document = str_replace("~$i@@", "}}", $document); $document = $this->processProvidedRegexes($this->braces_matches, - $this->braces_replaces, $document); + $this->braces_replaces, strtr($document, ["@@~$i" => "{{", + "~$i@@" => "}}"] )); $i++; } return $document; @@ -749,10 +748,8 @@ class WikiParser implements CrawlConstants */ function makeTableCallback($matches) { - $table = str_replace("\n!","\n|#",$matches[2]); - $table = str_replace("!!","||#",$table); - $table = str_replace("||","\n|",$table); - $row_data = explode("|", $table); + $row_data = explode("|", strtr($matches[2], ["\n!" => "\n|#", "!!" => "||#", + "||" => "\n|"])); $first = true; $out = $matches[1]; $state = ""; @@ -766,8 +763,7 @@ function makeTableCallback($matches) crawlTimeoutLog("..Making Wiki Tables.."); } if ($first) { - $item = trim(str_replace("\n", " ", $item)); - $item = str_replace(""", "\"", $item); + $item = trim(strtr($item, ["\n" => " ", """ => "\""])); $item = stripAttributes($item, ['id', 'class', 'style']); $out .= "<table $item >\n<tr>"; $first = false; @@ -796,8 +792,7 @@ function makeTableCallback($matches) $type = "td"; } $trim_item = trim($item); - $attribute_trim = str_replace("\n", " ", $trim_item); - $attribute_trim = str_replace(""", "\"", $attribute_trim); + $attribute_trim = strtr($trim_item, ["\n" => " ", """ => "\""]); if (!$skip && $state = trim( stripAttributes($attribute_trim, $table_cell_attributes))) { $old_type = $type; diff --git a/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php b/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php index 8c61cacdd..3ea056f0a 100644 --- a/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php +++ b/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php @@ -166,11 +166,12 @@ class OdpRdfArchiveBundleIterator extends TextArchiveBundleIterator return false; } list($page_info, $tag) = $tag_data; - if ($no_process) { return $page_info; } - $page_info = str_replace("r:id","id", $page_info); - $page_info = str_replace("r:resource","resource", $page_info); - $page_info = str_replace("d:Title","Title", $page_info); - $page_info = str_replace("d:Description","Description", $page_info); + if ($no_process) { + return $page_info; + } + $page_info = strtr($page_info, ["r:id" => "id", + "r:resource" => "resource", "d:Title" => "Title", + "d:Description" => "Description"] ); $dom = new \DOMDocument(); @$dom->loadXML($page_info); $processMethod = "process".$tag; diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index d6012b2e0..51799ea7c 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -200,7 +200,7 @@ class IntersectIterator extends IndexBundleIterator } else { $i_docs = $this->index_bundle_iterators[ - $this->word_iterator_map[$i] + $this->word_iterator_map[$i] ]->currentDocsWithWord(); if (isset($i_docs[$key][self::POSITION_LIST]) && ($ct = count($i_docs[$key][self::POSITION_LIST]) > 0 )) { diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 5dfda5f13..369ed0ef9 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -341,8 +341,8 @@ class WordIterator extends IndexBundleIterator } $this->count_block = count($results); if ($this->generation_pointer == $this->num_generations - 1 && - $results == []) { - $results = null; + empty($pre_results)) { + $results = -1; } $this->pages = $results; return $results; diff --git a/src/library/media_jobs/WikiMediaJob.php b/src/library/media_jobs/WikiMediaJob.php index 15472b8d5..b20a47df9 100644 --- a/src/library/media_jobs/WikiMediaJob.php +++ b/src/library/media_jobs/WikiMediaJob.php @@ -841,16 +841,10 @@ class WikiMediaJob extends MediaJob private function makeFileNamePattern($file_name, $file_pattern, $title = "", $pubdate = null) { + $translates = ["'" => "", "\"" => "", "/" => "-", "|" => "-"]; $file_name_parts = explode("?", $file_name); - $file_name = basename($file_name_parts[0]); - $file_name = str_replace("'", "", $file_name); - $file_name = str_replace("\"", "", $file_name); - $file_name = str_replace("/", "-", $file_name); - $file_name = str_replace("|", "-", $file_name); - $title = str_replace("'", "", $title); - $title = str_replace("\"", "", $title); - $title = str_replace("/", "-", $title); - $title = str_replace("|", "-", $title); + $file_name = strtr(basename($file_name_parts[0]), $translates); + $title = strtr($title, $translates); if (empty($file_pattern)) { return $file_name; } diff --git a/src/library/processors/BmpProcessor.php b/src/library/processors/BmpProcessor.php index 0c3370b1b..e4191f442 100644 --- a/src/library/processors/BmpProcessor.php +++ b/src/library/processors/BmpProcessor.php @@ -96,7 +96,7 @@ class BmpProcessor extends ImageProcessor $this->addWidthHeightSummary($summary, $page); $summary[self::TITLE] = ""; $summary[self::DESCRIPTION] = - UrlParser::getDocumentFilename($url) . "\n"; + UrlParser::getWordsLastPathPartUrl($url) . "\n"; if (ComputerVision::ocrEnabled()) { set_error_handler(null); $temp_file = $this->saveTempFile($page, $url, "bmp"); diff --git a/src/library/processors/GifProcessor.php b/src/library/processors/GifProcessor.php index 7fb5f926b..28019b331 100755 --- a/src/library/processors/GifProcessor.php +++ b/src/library/processors/GifProcessor.php @@ -81,8 +81,8 @@ class GifProcessor extends ImageProcessor $summary = []; $this->addWidthHeightSummary($summary, $page); $summary[self::TITLE] = ""; - $summary[self::DESCRIPTION] = UrlParser::getDocumentFilename($url) - . "\n"; + $summary[self::DESCRIPTION] = + UrlParser::getWordsLastPathPartUrl($url) . "\n"; if (ComputerVision::ocrEnabled()) { set_error_handler(null); $temp_file = $this->saveTempFile($page, $url, "gif"); diff --git a/src/library/processors/JpgProcessor.php b/src/library/processors/JpgProcessor.php index 84d753d31..a765ef6e8 100755 --- a/src/library/processors/JpgProcessor.php +++ b/src/library/processors/JpgProcessor.php @@ -85,7 +85,7 @@ class JpgProcessor extends ImageProcessor $summary = []; $this->addWidthHeightSummary($summary, $page); $summary[self::TITLE] = ""; - $file_name = UrlParser::getDocumentFilename($url); + $file_name = UrlParser::getWordsLastPathPartUrl($url); $summary[self::DESCRIPTION] = $file_name . "\n"; if (ComputerVision::ocrEnabled()) { set_error_handler(null); diff --git a/src/library/processors/PngProcessor.php b/src/library/processors/PngProcessor.php index 264d04ce7..873dc5480 100755 --- a/src/library/processors/PngProcessor.php +++ b/src/library/processors/PngProcessor.php @@ -81,8 +81,8 @@ class PngProcessor extends ImageProcessor $summary = []; $this->addWidthHeightSummary($summary, $page); $summary[self::TITLE] = ""; - $summary[self::DESCRIPTION] = UrlParser::getDocumentFilename($url) - . "\n"; + $summary[self::DESCRIPTION] = + UrlParser::getWordsLastPathPartUrl($url) . "\n"; if (ComputerVision::ocrEnabled()) { set_error_handler(null); $temp_file = $this->saveTempFile($page, $url, "png"); diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index a7b6c44e9..16a6edfe7 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -1210,9 +1210,11 @@ class PhraseModel extends ParallelModel $pages = []; $retrieve_postings_time = microtime(true); if (is_object($query_iterator)) { - while ($num_retrieved < $to_retrieve && - is_array($next_docs = - $query_iterator->nextDocsWithWord())) { + while ($num_retrieved < $to_retrieve) { + $next_docs = $query_iterator->nextDocsWithWord(); + if (!is_array($next_docs)) { + break; + } $pages += $next_docs; $num_retrieved = count($pages); } diff --git a/tests/UtilityTest.php b/tests/UtilityTest.php index 1d42223b4..f0e25e683 100644 --- a/tests/UtilityTest.php +++ b/tests/UtilityTest.php @@ -54,7 +54,7 @@ class UtilityTest extends UnitTest { } /** - * Determines if the checkTimeInterval method can correctly determin + * Determines if the checkTimeInterval method can correctly determine * if a time of day is between the times of day of two timestamps */ public function checkTimeIntervalTestCase() @@ -170,4 +170,18 @@ class UtilityTest extends UnitTest $this->assertEqual([$weight3, $depth3], [15, 8], "Weight adjustment works correctly"); } + /** + * Checks webencode/webdecode to see inverses. Checks base64Hash/ + * unbase64Hash to see inverses + */ + public function encodeDecodeTestCase() + { + $expected = "=+~-@hi ya everyone!!@~+-="; + $encode_decoded = L\webdecode(L\webencode($expected)); + $this->assertEqual($expected, $encode_decoded, + "Webencode/Webdecode works correctly"); + $encode_decoded = L\unbase64Hash(L\base64Hash($expected)); + $this->assertEqual($expected, $encode_decoded, + "base64Hash/unbase64Hash works correctly"); + } } diff --git a/tests/WikiParserTest.php b/tests/WikiParserTest.php new file mode 100644 index 000000000..b19d5b684 --- /dev/null +++ b/tests/WikiParserTest.php @@ -0,0 +1,93 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2020 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @license https://www.gnu.org/licenses/ GPL3 + * @link https://www.seekquarry.com/ + * @copyright 2009 - 2020 + * @filesource + */ +namespace seekquarry\yioop\tests; + +use seekquarry\yioop\controllers\SearchController; +use seekquarry\yioop\library as L; +use seekquarry\yioop\library\UnitTest; + +/** + * Tests the functionaility of WikiParser used when processing Wikipedia dumps + * and used for Yioop's internal wiki infrastructure + * + * @author Chris Pollett + */ +class WikiParserTest extends UnitTest +{ + /** + * No set up being done for the time being + */ + public function setUp() + { + } + /** + * No tear down being done for the time being + */ + public function tearDown() + { + } + /** + * Checks that the basic WikiParser substitutions are done correctly + */ + public function checkBasicSubstitutionsTestCase() + { + $controller = new SearchController(); + $parser = new L\WikiParser(); + for ($i = 1; $i < 6; $i++) { + $heading = str_repeat("=", $i); + $parsed = $parser->parse("{$heading}Title{$heading}"); + $expected = "\n<div>\n\n<h$i id='Title'>Title</h$i>\n</div>\n"; + $this->assertEqual($parsed, $expected, + "Level $i heading parses as expected!"); + } + $this->assertEqual($parsed, $expected, + "Level $i Heading Parses as Expected!"); + $parsed = $parser->parse($controller->clean("'''Bold'''", "string")); + $expected = "\n<div>\n<b>Bold</b>\t\n</div>\n"; + $this->assertEqual($parsed, $expected, "Bold text parses as expected!"); + $parsed = $parser->parse($controller->clean("''Italics''", "string")); + $expected = "\n<div>\n<i>Italics</i>\t\n</div>\n"; + $this->assertEqual($parsed, $expected, + "Italics text parses as expected!"); + $parsed = $parser->parse($controller->clean("#item1\n#item2", + "string")); + $expected = "\n<div>\n\n<ol>\n<li>item1</li>\n<li>item2</li>\n". + "</ol>\n\n</div>\n"; + $this->assertEqual($parsed, $expected, + "Ordered list parses as expected!"); + $parsed = $parser->parse($controller->clean("*item1\n*item2", + "string")); + $expected = "\n<div>\n\n<ul>\n<li>item1</li>\n<li>item2</li>\n". + "</ul>\n\n</div>\n"; + $this->assertEqual($parsed, $expected, + "Unordered list parses as expected!"); + } +}