viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/configs/Config.php b/src/configs/Config.php index 922ae144d..6b287f0f3 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -667,22 +667,21 @@ nsconddefine('VERSION_0_TIMESTAMP', 1369754208); nsconddefine('VERSION_1_TIMESTAMP', 1528045371); /** What version format to use for default indexing **/ nsconddefine('DEFAULT_CRAWL_FORMAT', 2); -/** Max memory a QueueServer can use */ -nsconddefine('QUEUE_SERVER_MEMORY_LIMIT', "3000M"); -/** Max memory a Fetcher can use */ -nsconddefine('FETCHER_MEMORY_LIMIT', "1500M"); -defineMemoryProfile(); +/** 1 Gibibyte (GiB)*/ +nsdefine('ONE_GIB', 1073741824); /** * Code to determine how much memory current machine has */ function defineMemoryProfile() { - //assume have at least 4GB on a Mac(could use vm_stat) - $memory = 4000000000; + //assume have at least 4GiB + $memory = 4 * ONE_GIB; if (strstr(PHP_OS, "WIN")) { if (function_exists("exec")) { exec('wmic memorychip get capacity', $memory_array); - $memory = array_sum($memory_array); + if ($memory_array) { + $memory = array_sum($memory_array); + } } } else if (stristr(PHP_OS, "LINUX")) { set_error_handler(null); @@ -692,35 +691,40 @@ function defineMemoryProfile() $data = preg_split("/\s+/", $mem_data); $memory = 1024 * intval($data[1]); } + } else if (stristr(PHP_OS, "DARWIN")) { + exec('sysctl hw.memsize', $memory_array); + if (!empty($memory_array)) { + preg_match("/\d+/", $memory_array[0], $mem_matches); + $memory = $mem_matches[0]; + } } - /** - * Factor to multiply sizes of Yioop data structures with in low ram memory - * setting (2GB) - */ - nsdefine('MEMORY_LOW', 1); - /** - * Factor to multiply sizes of Yioop data structures with if have more than - * (2GB) - */ - nsdefine('MEMORY_STANDARD', 4); - if ($memory < 2200000000) { - /** - * Based on system memory, either the low or high memory factor - */ - nsdefine('MEMORY_PROFILE', MEMORY_LOW); - } else { - /** - * @ignore - */ - nsdefine('MEMORY_PROFILE', MEMORY_STANDARD); - } - /** - * Delay 
in microseconds between processing pages to try to avoid - * CPU overheating. On some systems, you can set this to 0. - */ - nsconddefine('FETCHER_PROCESS_DELAY', 10000); + $memory_factor = ceil($memory / (2 * ONE_GIB)); + nsdefine('MEMORY_PROFILE', min(4, $memory_factor)); + nsdefine('SYSTEM_RAM', $memory); } - +//Check system memory then set up limits for processes based on this +defineMemoryProfile(); +/** Max memory index.php can use */ +nsconddefine('INDEX_FILE_MEMORY_LIMIT', ceil(MEMORY_PROFILE/4) . "000M"); +/** Max memory a QueueServer can use */ +nsconddefine('QUEUE_SERVER_MEMORY_LIMIT', MEMORY_PROFILE . "000M"); +/** Max memory a Fetcher can use */ +nsconddefine('FETCHER_MEMORY_LIMIT', ceil(MEMORY_PROFILE/2) . "000M"); +/** Max memory a MediaUpdater can use */ +nsconddefine('MEDIA_UPDATER_MEMORY_LIMIT', ceil(MEMORY_PROFILE/2) . "000M"); +/** Max memory a Mirror can use */ +nsconddefine('MIRROR_MEMORY_LIMIT', ceil(MEMORY_PROFILE/4) ."000M"); +/** Max memory a ClassifierTrainer can use */ +nsconddefine('CLASSIFIER_TRAINER_LIMIT', ceil(MEMORY_PROFILE/4) ."000M"); +/** Max memory an ArcTool can use */ +nsconddefine('ARC_TOOL_MEMORY_LIMIT', (2 * MEMORY_PROFILE) . "000M"); +/** Max memory a TokenTool can use */ +nsconddefine('TOKEN_TOOL_MEMORY_LIMIT', ceil(MEMORY_PROFILE/2) . "000M"); +/** Used to control fraction of memory filled of current process + * (usually Fetcher or QueueServer) before action (such as switch shard) + * on current class (usually IndexArchiveBundle) is taken. + */ +nsconddefine('MEMORY_FILL_FACTOR', 0.65); /** * bloom filters are used to keep track of which urls are visited, * this parameter determines up to how many @@ -823,6 +827,11 @@ nsconddefine('PROCESS_TIMEOUT', 15 * ONE_MINUTE); * crawl is likely stalled */ nsconddefine("CRAWL_TIMEOUT", 2 * PROCESS_TIMEOUT); +/** + * Delay in microseconds between processing pages to try to avoid + * CPU overheating. On some systems, you can set this to 0. 
+ */ +nsconddefine('FETCHER_PROCESS_DELAY', 10000); /** * Number of error page 400 or greater seen from a host before crawl-delay * host and dump remainder from current schedule diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php index 7a6af80b7..a9266b22b 100644 --- a/src/configs/TokenTool.php +++ b/src/configs/TokenTool.php @@ -54,10 +54,9 @@ if (php_sapi_name() != 'cli' || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) { echo "BAD REQUEST"; exit(); } -ini_set("memory_limit","1500M"); /** Load in global configuration settings and crawlHash function */ require_once __DIR__ . "/../library/Utility.php"; - +ini_set("memory_limit", C\TOKEN_TOOL_MEMORY_LIMIT); /* The phrase "More at Wikipedia..." with a link concludes the knowledge wiki entries we generate from wikipedia data. @@ -707,6 +706,8 @@ function smartOpen($file_name) */ function translateLocale($locale_tag) { + global $public_pages; + global $help_pages; if (!C\nsdefined('TRANSLATE_API_KEY')) { echo "You need to get a Yandex translate API key to use this command"; return; diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index fe5f38c31..34ac42e5d 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -48,7 +48,6 @@ if (php_sapi_name() != 'cli' || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) { echo "BAD REQUEST"; exit(); } -ini_set("memory_limit","2500M"); /** This tool does not need logging*/ $_SERVER["LOG_TO_FILES"] = false; /** USE_CACHE false rules out file cache as well*/ @@ -60,6 +59,9 @@ if (!C\PROFILE) { "its web interface on localhost.\n"; exit(); } +ini_set("memory_limit", C\ARC_TOOL_MEMORY_LIMIT); /*reading in a whole + shard might take a fair bit of memory +*/ /* * We'll set up multi-byte string handling to use UTF-8 */ @@ -337,9 +339,6 @@ class ArcTool implements CrawlConstants */ public function outputShardInfo($archive_path, $generation) { - ini_set("memory_limit","8000M"); /*reading in a whole shard might take - 
a bit more memory - */ if (preg_match("/\-\d$/", $archive_path)) { $bundle_num = substr($archive_path, -1); $archive_path = substr($archive_path, 0, -2); diff --git a/src/executables/ClassifierTrainer.php b/src/executables/ClassifierTrainer.php index 1e0a59af4..547f82ef5 100755 --- a/src/executables/ClassifierTrainer.php +++ b/src/executables/ClassifierTrainer.php @@ -52,16 +52,16 @@ if (!C\PROFILE) { "its web interface on localhost.\n"; exit(); } +/* + If possible, set the memory limit high enough to fit all of the features and + training documents into memory. + */ +ini_set("memory_limit", C\CLASSIFIER_TRAINER_LIMIT); /* We'll set up multi-byte string handling to use UTF-8 */ mb_internal_encoding("UTF-8"); mb_regex_encoding("UTF-8"); -/* - If possible, set the memory limit high enough to fit all of the features and - training documents into memory. - */ -ini_set("memory_limit", "500M"); /** * This class is used to finalize a classifier via the web interface. * @@ -76,7 +76,7 @@ ini_set("memory_limit", "500M"); * second command-line argument. The following command would be used to run * this script directly from the command-line: * - * $ php bin/ClassifierTrainer.php terminal LABEL + * $ php ClassifierTrainer.php terminal LABEL * * @author Shawn Tice */ @@ -93,14 +93,19 @@ class ClassifierTrainer { global $argv; CrawlDaemon::init($argv, "ClassifierTrainer"); - $label = $argv[2]; - L\crawlLog("Initializing classifier trainer log..", - $label.'-ClassifierTrainer', true); - $classifier = Classifier::getClassifier($label); - $classifier->prepareToFinalize(); - $classifier->finalize(); - Classifier::setClassifier($classifier); - L\crawlLog("Training complete.\n"); + $label = $argv[2] ?? ""; + $classifier = null; + if (!empty($label)) { + L\crawlLog("Initializing classifier trainer log..", + $label . 
'-ClassifierTrainer', true); + $classifier = Classifier::getClassifier($label); + } + if (!empty($classifier)) { + $classifier->prepareToFinalize(); + $classifier->finalize(); + Classifier::setClassifier($classifier); + L\crawlLog("Training complete.\n"); + } CrawlDaemon::stop('ClassifierTrainer', $label); } } diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 34acbe75d..22485c591 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1381,7 +1381,7 @@ class Fetcher implements CrawlConstants public function exceedMemoryThreshold() { return memory_get_usage() > (L\metricToInt( - ini_get("memory_limit")) * 0.7); + ini_get("memory_limit")) * C\MEMORY_FILL_FACTOR); } /** * At least once, and while memory is low picks at server at random and send diff --git a/src/executables/MediaUpdater.php b/src/executables/MediaUpdater.php index 3b3eb5344..d09cd8990 100644 --- a/src/executables/MediaUpdater.php +++ b/src/executables/MediaUpdater.php @@ -43,7 +43,6 @@ if (php_sapi_name() != 'cli' || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) { echo "BAD REQUEST"; exit(); } -ini_set("memory_limit", "1300M"); /** We do want logging, but crawl model and others will try to turn off * if we don't set this */ @@ -55,6 +54,7 @@ if (!C\PROFILE) { "its web interface on localhost.\n"; exit(); } +ini_set("memory_limit", C\MEDIA_UPDATER_MEMORY_LIMIT); /* * We'll set up multi-byte string handling to use UTF-8 */ diff --git a/src/executables/Mirror.php b/src/executables/Mirror.php index a3db91b97..8c4cd34e7 100644 --- a/src/executables/Mirror.php +++ b/src/executables/Mirror.php @@ -40,8 +40,6 @@ if (php_sapi_name() != 'cli' || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) { echo "BAD REQUEST"; exit(); } -ini_set("memory_limit","850M"); //so have enough memory to crawl big pages - /** CRAWLING means don't try to use cache * @ignore */ @@ -53,6 +51,7 @@ if (!C\PROFILE) { "its web interface on localhost.\n"; exit(); } 
+ini_set("memory_limit", C\MIRROR_MEMORY_LIMIT); /* * We'll set up multi-byte string handling to use UTF-8 */ diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 3b2541a87..c09bf406e 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -1986,18 +1986,19 @@ class QueueServer implements CrawlConstants, Join } /** * Tries to prevent Indexer from crashing do to excessive memory use. - * If Indexer is using more that .7 of its allowed memory, tries to - * free memory by saving index bunlde to disk, freeing memory, then - * reloading. + * If Indexer is using more than C\MEMORY_FILL_FACTOR of its allowed memory, + * tries to free memory by saving index bundle to disk, freeing memory, + * then reloading. */ public function constrainIndexerMemoryUsage() { $memory_limit = L\metricToInt(ini_get("memory_limit")); $current_usage = memory_get_usage(); - if ((0.7 * $memory_limit) < $current_usage || + if ((C\MEMORY_FILL_FACTOR * $memory_limit) < $current_usage || in_array($this->debug, ['EXCEED_MEMORY', 'EXCEED_MEMORY_HARD'])) { L\crawlLog("Indexer memory usage threshold exceeded!!!"); - L\crawlLog("...Indexer Threshold is: " . (0.7 * $memory_limit)); + L\crawlLog("...Indexer Threshold is: " . (C\MEMORY_FILL_FACTOR * + $memory_limit)); L\crawlLog("...Indexer Current usage is: " . $current_usage); L\crawlLog("...Indexer trying to free memory by resetting " . "index bundle."); @@ -2014,7 +2015,7 @@ class QueueServer implements CrawlConstants, Join if ($this->debug == 'EXCEED_MEMORY') { $this->debug = ""; } - if ((0.7 * $memory_limit) < $current_usage || + if ((C\MEMORY_FILL_FACTOR * $memory_limit) < $current_usage || $this->debug == 'EXCEED_MEMORY_HARD') { $message_file = C\CRAWL_DIR . "/schedules/" . $this->process_name . 
"Messages.txt"; diff --git a/src/index.php b/src/index.php index e74896d97..3a797e438 100644 --- a/src/index.php +++ b/src/index.php @@ -78,7 +78,7 @@ function bootstrap($web_site = null, $start_new_session = true) * Load global functions related to localization */ require_once __DIR__ . "/library/LocaleFunctions.php"; - ini_set("memory_limit","1000M"); + ini_set("memory_limit", C\INDEX_FILE_MEMORY_LIMIT); if (!empty($web_site)) { if ((empty($_REQUEST['c']) || $_REQUEST['c'] != 'resource')) { $web_site->header("X-FRAME-OPTIONS: DENY"); //prevent click-jacking diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php index 8e01b1e26..e7ef26937 100755 --- a/src/library/FetchUrl.php +++ b/src/library/FetchUrl.php @@ -249,7 +249,8 @@ class FetchUrl implements CrawlConstants $start = time(); //Wait for responses $running = null; - $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7; + $memory_limit = metricToInt(ini_get("memory_limit")) * + C\MEMORY_FILL_FACTOR; $mrc_check = CURLM_CALL_MULTI_PERFORM; set_error_handler(null); do { diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index a84eaf628..cef420567 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -123,7 +123,7 @@ class IndexArchiveBundle implements CrawlConstants /** * Threshold index shard beyond which we force the generation to advance */ - const FORCE_ADVANCE_SIZE = 150000000; + const FORCE_ADVANCE_SIZE = 120000000; /** * Makes or initializes an IndexArchiveBundle with the provided parameters * @@ -200,7 +200,7 @@ class IndexArchiveBundle implements CrawlConstants crawlLog("**ADD INDEX DIAGNOSTIC INFO..."); $start_time = microtime(true); $this->getActiveShard()->appendIndexShard($index_shard); - crawlLog("Append Index Shard: Memory usage:".memory_get_usage() . + crawlLog("Append Index Shard: Memory usage:" . memory_get_usage() . 
" Time: ".(changeInMicrotime($start_time))); } /** @@ -236,7 +236,8 @@ class IndexArchiveBundle implements CrawlConstants $this->num_docs_per_generation; $shard_size_too_big = (file_exists($active_file_name) && filesize($active_file_name) > self::FORCE_ADVANCE_SIZE); - $too_close_to_memory_limit = 1.2 * $before_usage > $memory_limit; + $too_close_to_memory_limit = $before_usage > + C\MEMORY_FILL_FACTOR * $memory_limit; if ($too_many_docs || $shard_size_too_big || $too_close_to_memory_limit) { if ($blocking == true) { diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 6ded427db..c16882e1b 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -739,12 +739,16 @@ class PhraseParser { $char_class = C\NS_LOCALE . $lang . "\\resources\\Tokenizer"; mb_internal_encoding("UTF-8"); - if ($pre_terms == []) { return [];} + if (empty($pre_terms)) { + return []; + } $terms = []; $tokenizer = PhraseParser::getTokenizer($lang); if (class_exists($char_class) && isset($char_class::$char_gram_len)) { foreach ($pre_terms as $pre_term) { - if ($pre_term == "") { continue; } + if (empty($pre_term)) { + continue; + } if (substr($pre_term, 0, 4) == 'http') { $terms[] = $pre_term; // don't chargram urls continue; @@ -755,7 +759,7 @@ class PhraseParser } } } else { - $terms = & $pre_terms; + $terms = $pre_terms; } return $terms; } diff --git a/src/models/SearchverticalsModel.php b/src/models/SearchverticalsModel.php index 51f2d0a99..413c14533 100644 --- a/src/models/SearchverticalsModel.php +++ b/src/models/SearchverticalsModel.php @@ -246,13 +246,14 @@ class SearchverticalsModel extends GroupModel * when a given url appears in search results * @param int $id if the url has been edited previous then the id of the * group item with the edit. If this is 0/empty then a new group item - * for the edit is created + * for the edit is created. 
If -1 then deletes the entry * @param int $type either SEARCH_FILTER_GROUP_ITEM or * SEARCH_EDIT_GROUP_ITEM * @param string $url to change search result for * @param string $title new title for search result * @param string $description new snippet text for search result - * @return int id of edited/created result + * @return mixed integer id of edited/created result or if used + * to delete then false */ function updateUrlResult($id, $type, $url, $title, $description) { @@ -265,7 +266,7 @@ class SearchverticalsModel extends GroupModel if (empty($id)) { $id = $this->addGroupItem($parent_id, C\SEARCH_GROUP_ID, $user_id, $title, $description, $type, $this->last_change, $url); - } else if ($type == -1) { + } else { $item = $this->getEditedPageResult($url); $sql = "DELETE FROM GROUP_ITEM WHERE ID = ?"; $db->execute($sql, [$id]); @@ -275,13 +276,10 @@ class SearchverticalsModel extends GroupModel $sql = "DELETE FROM GROUP_ITEM WHERE URL = ?"; $db->execute($sql, [$url . "/"]); } - } else { - $sql = "UPDATE GROUP_ITEM SET TYPE = ?, URL = ?, TITLE = ?, ". - "DESCRIPTION = ?, EDIT_DATE = ? WHERE ID = ?"; - $db->execute($sql, [$type, $url, $title, $description, - $this->last_change, $id]); + $id = $this->addGroupItem($parent_id, C\SEARCH_GROUP_ID, $user_id, + $title, $description, $type, $this->last_change, $url); } - return $id; + return $id ?? false; } /** * Returns any edited search result associated with a url