viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 7f2d726d8..cf717c194 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -1290,9 +1290,23 @@ class QueueServer implements CrawlConstants, Join $this->updateDisallowedQuotaSites(); } $this->initializeWebQueue(); - if(empty($info[self::REPEAT_TYPE]) || $info[self::REPEAT_TYPE] < 0) { + $this->initializeIndexBundle($info); + $info[self::STATUS] = self::CONTINUE_STATE; + return $info; + } + /** + * Function used to set up an indexer's IndexArchiveBundle or + * DoubleIndexBundle according to the current crawl parameters or + * the values stored in an existing bundle. + * + * @param array $info if initializing a new crawl this should contain + * the crawl parameters + */ + public function initializeIndexBundle($info = []) + { + if(empty($this->repeat_type) || $this->repeat_type < 0) { $class_name = C\NS_LIB . "IndexArchiveBundle"; - $dir = C\CRAWL_DIR.'/cache/' . self::index_data_base_name . + $dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name . $this->crawl_time; } else { $class_name = C\NS_LIB . 
"DoubleIndexBundle"; @@ -1320,8 +1334,7 @@ class QueueServer implements CrawlConstants, Join (might take a while if merging dictionary) */ $this->writeCrawlStatus($sites); - } else if (!empty($info[self::REPEAT_TYPE]) && - $info[self::REPEAT_TYPE] >= 0) { + } else if (!empty($this->repeat_type) && $this->repeat_type >= 0) { $this->index_archive = new $class_name($dir, false, serialize($info), C\NUM_DOCS_PER_GENERATION, $info[self::REPEAT_TYPE]); @@ -1341,8 +1354,6 @@ class QueueServer implements CrawlConstants, Join //Get modified time of initial setting of crawl params $this->archive_modified_time = $class_name::getParamModifiedTime($dir); - $info[self::STATUS] = self::CONTINUE_STATE; - return $info; } /** * This is called whenever the crawl options are modified to parse @@ -1842,19 +1853,19 @@ class QueueServer implements CrawlConstants, Join unset($seen_sites); } L\crawlLog("C. Indexer init local shard, store ". - "Summaries memory usage:". memory_get_usage() . + "Summaries memory usage: ". memory_get_usage() . " time: " . L\changeInMicrotime($start_time)); $start_time = microtime(true); // added summary offset info to inverted index data $index_shard->changeDocumentOffsets($summary_offsets); - L\crawlLog("D. Indexer Update shard offsets. Memory usage:". + L\crawlLog("D. Indexer Update shard offsets. Memory usage: ". memory_get_usage() . " time: " . L\changeInMicrotime($start_time)); $start_time = microtime(true); $this->index_archive->addIndexData($index_shard); $this->index_dirty = true; } - L\crawlLog("E. Indexer Add index shard. Memory usage:". + L\crawlLog("E. Indexer Add index shard. Memory usage: ". memory_get_usage() . " time: " . L\changeInMicrotime($start_time)); L\crawlLog("Indexer Done Index Processing File: $file. Total time: ". 
@@ -1867,6 +1878,38 @@ class QueueServer implements CrawlConstants, Join //Haven't tracked down yet, but can try to delete twice giving warn unlink($file); } + $this->constrainIndexerMemoryUsage(); + } + /** + * Tries to prevent Indexer from crashing due to excessive memory use. + * If Indexer is using more than .7 of its allowed memory, tries to + * free memory by saving index bundle to disk, freeing memory, then + * reloading. + */ + public function constrainIndexerMemoryUsage() + { + $memory_limit = L\metricToInt(ini_get("memory_limit")); + $current_usage = memory_get_usage(); + if ((0.7 * $memory_limit) < $current_usage) { + L\crawlLog("Indexer memory usage threshold exceeded!!!"); + L\crawlLog("...Threshold is: " . (0.7 * $memory_limit)); + L\crawlLog("...Current usage is: " . $current_usage); + L\crawlLog("...Trying to free memory by resetting " . + "index bundle."); + $this->index_archive->forceSave(); + $this->index_archive = null; + $num_freed = L\garbageCollect(); + L\crawlLog("...Indexer force running garbage collector " . + "after reset. This freed " . $num_freed . " bytes."); + $this->initializeIndexBundle(); + $current_usage = memory_get_usage(); + L\crawlLog("Done index bundle reset, current memory usage is: ". + $current_usage); + if ((0.7 * $memory_limit) < $current_usage) { + L\crawlLog("!!!Usage still exceeds threshold, exiting"); + exit(); + } + } } /** * Checks how old the oldest robot data is and dumps if older then a diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index 28fd86628..a294522f4 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -223,8 +223,8 @@ class IndexArchiveBundle implements CrawlConstants $before_usage = memory_get_usage(); crawlLog("Indexer Memory limit is " . $memory_limit . ". Usage is ". 
$before_usage); - if ($current_num_docs + $add_num_docs > $this->num_docs_per_generation - || (0.65 * $memory_limit) < $before_usage ) { + if ($current_num_docs + $add_num_docs > + $this->num_docs_per_generation) { if ($blocking == true) { return -1; } @@ -233,16 +233,11 @@ class IndexArchiveBundle implements CrawlConstants // Save current shard dictionary to main dictionary $this->forceSave(); $this->addAdvanceGeneration($callback); - $num_freed= garbageCollect(); + $num_freed = garbageCollect(); crawlLog("Indexer force running garbage collector after generation". " advance. This freed " . $num_freed . " bytes."); $after_usage = memory_get_usage(); crawlLog("Indexer after switch memory usage: $after_usage"); - if ((0.65 * $memory_limit) < $after_usage) { - crawlLog("Index Shard Switching did not free sufficiently ". - "memory, exiting"); - exit(); - } crawlLog("Switch Index Shard time:". changeInMicrotime($switch_time)); } diff --git a/src/library/Utility.php b/src/library/Utility.php index 809df4bc2..56e95a2e5 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -2233,7 +2233,7 @@ function lineFilter($lines, $filters) */ function logLineTimestamp($line) { - preg_match("/^\s*\[(.*)\]/", $line, $matches); + preg_match("/^\s*\[\d+\s+(.*)\]/", $line, $matches); if (isset($matches[1])) { return @strtotime($matches[1]); }