viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index f5f8e20e4..8c83094b8 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -502,7 +502,7 @@ class QueueServer implements CrawlConstants, Join L\crawlLog("Checking if both processes still running ..."); $lines_to_check = C\LOG_LINES_TO_RESTART; //about 20-30 minutes of log data - $lines = L\tail(C\LOG_DIR."/" . $this->process_name . ".log", + $lines = L\tail(C\LOG_DIR . "/" . $this->process_name . ".log", $lines_to_check); L\crawlLog("...Got " . $this->process_name . ".log lines"); if (count($lines) < $lines_to_check) { @@ -522,10 +522,14 @@ class QueueServer implements CrawlConstants, Join L\logLineTimestamp($process_lines[$num_lines - 1]); L\crawlLog("...Timestamp of last processed line: ". $timestamp); - if ($timestamp > 1408934838) { //rules out false for timestamp + if (is_numeric($timestamp)) { + /* + Note if 0 then we have seen LOG_LINES_TO_RESTART lines + with no message from other process + */ $last_process_timestamp = $timestamp; L\crawlLog("...seems to be a valid timestep, so using."); - } else { + } else { //hopefully doesn't occur, maybe log file garbled L\crawlLog("...invalid timestep, so using current time."); } } @@ -542,6 +546,11 @@ class QueueServer implements CrawlConstants, Join foreach ($process_lines as $line) { $out_msg .= "!!!!$line\n"; } + if (empty($process_lines)) { + $out_msg .= "!!!!No messages seen this log file!\n"; + $out_msg .= "!!!!$process must have died before: " . + L\logLineTimestamp($lines[0]); + } $error_log = C\CRASH_LOG_NAME; if (!file_exists($error_log) || filesize($error_log)> C\MAX_LOG_FILE_SIZE) { @@ -689,7 +698,9 @@ class QueueServer implements CrawlConstants, Join } switch ($this->crawl_type) { case self::WEB_CRAWL: - if ($this->isOnlyIndexer()) { return; } + if ($this->isOnlyIndexer()) { + return; + } $this->processRobotUrls(); if (C\USE_ETAG_EXPIRES) { $this->processEtagExpires(); @@ -1222,7 +1233,7 @@ class QueueServer implements CrawlConstants, Join $this->index_dirty = false; // chmod so apache can also write to these directories $this->db->setWorldPermissionsRecursive( - C\CRAWL_DIR.'/cache/'. $base_name . $this->crawl_time); + C\CRAWL_DIR.'/cache/' . $base_name . $this->crawl_time); } } /** @@ -1611,9 +1622,9 @@ class QueueServer implements CrawlConstants, Join public function processDataFile($base_dir, $callback_method, $blocking = false) { - $dirs = glob($base_dir.'/*', GLOB_ONLYDIR); + $dirs = glob($base_dir . '/*', GLOB_ONLYDIR); foreach ($dirs as $dir) { - $files = glob($dir.'/*.txt'); + $files = glob($dir . '/*.txt'); if (isset($old_dir)) { L\crawlLog("Deleting $old_dir\n"); $this->db->unlinkRecursive($old_dir); diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index 6ef729cd2..680d8edd0 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -217,7 +217,8 @@ class IndexArchiveBundle implements CrawlConstants $blocking = false) { $current_num_docs = $this->getActiveShard()->num_docs; - crawlLog("Current index shard has ".$current_num_docs." documents."); + crawlLog("Current index shard has " . $current_num_docs . + " documents."); $memory_limit = metricToInt(ini_get("memory_limit")); crawlLog("Memory Indexer limit is ".$memory_limit.". Usage is ". memory_get_usage()); @@ -231,6 +232,10 @@ class IndexArchiveBundle implements CrawlConstants // Save current shard dictionary to main dictionary $this->forceSave(); $this->addAdvanceGeneration($callback); + $num_freed = gc_collect_cycles(); + crawlLog("Indexer force running garbage collector after generation". + "advance. This freed " . $num_freed . " bytes."); + crawlLog("Switch Index Shard time:". changeInMicrotime($switch_time)); } @@ -257,7 +262,7 @@ class IndexArchiveBundle implements CrawlConstants $this->generation_info['CURRENT'] = $this->generation_info['ACTIVE']; $current_index_shard_file = $this->dir_name. - "/posting_doc_shards/index". $this->generation_info['ACTIVE']; + "/posting_doc_shards/index" . $this->generation_info['ACTIVE']; $this->current_shard = new IndexShard( $current_index_shard_file, $this->generation_info['ACTIVE'], $this->num_docs_per_generation); @@ -436,11 +441,11 @@ class IndexArchiveBundle implements CrawlConstants */ public static function getArchiveInfo($dir_name) { - if (file_exists($dir_name."/arc_description.txt")) { + if (file_exists($dir_name . "/arc_description.txt")) { $crawl = []; $info = []; $crawl['DESCRIPTION'] = substr( - file_get_contents($dir_name."/arc_description.txt"), 0, 256); + file_get_contents($dir_name . "/arc_description.txt"), 0, 256); $crawl['ARCFILE'] = true; $info['VISITED_URLS_COUNT'] = 0; $info['COUNT'] = 0; diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php index 146aa08ab..f362ee882 100644 --- a/src/library/IndexDictionary.php +++ b/src/library/IndexDictionary.php @@ -293,7 +293,7 @@ class IndexDictionary implements CrawlConstants $tier++; if ($tier > $this->max_tier) { $this->max_tier = $tier; - file_put_contents($this->dir_name."/max_tier.txt", + file_put_contents($this->dir_name . "/max_tier.txt", serialize($this->max_tier)); } } diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 9b20967a6..0df6a9492 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -677,7 +677,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants $total_posting_len = 0; $num_postings_so_far = 0; do { - if ($next > $end) {break;} + if ($next > $end) { + break; + } $posting_start = $next; $posting = $this->getPostingAtOffset( $next, $posting_start, $posting_end); diff --git a/src/library/Utility.php b/src/library/Utility.php index 2e5f18156..1065829ca 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -332,7 +332,9 @@ function addDocIndexPostings(&$postings, $add_offset) !($tmp = unpack("N*", $post_string))) {continue; } $posting_list = call_user_func_array("array_merge", array_map(C\NS_LIB . "unpackListModified9", $tmp)); - if (!is_array($posting_list)) { continue; } + if (!is_array($posting_list)) { + continue; + } $doc_index = array_shift($posting_list); if (($doc_index & (2 << 26)) > 0) { $post0 = ($doc_index & ((2 << 9) - 1)); diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php index 0d0ce8f10..b1cfdcd20 100755 --- a/src/library/WebArchive.php +++ b/src/library/WebArchive.php @@ -181,7 +181,7 @@ class WebArchive $len = strlen($info_string) + $compressed_int_len; $offset = ftell($fh); ftruncate($fh, $offset); - $out = $info_string.$this->compressor->compressInt($len); + $out = $info_string . $this->compressor->compressInt($len); fwrite($fh, $out, $len); if ($open_flag) { fclose($fh); diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php index a70e5a7ff..4233d9200 100755 --- a/src/library/WebArchiveBundle.php +++ b/src/library/WebArchiveBundle.php @@ -191,8 +191,8 @@ class WebArchiveBundle return $this->write_partition; } /** - * Advances the index of the write partition by one and creates the - * corresponding web archive. + * Sets the write partition to the provided value and if this is not + * a read only archive stores, this value persistently to archive info * * @param int $i the number of the current write partition */ @@ -200,6 +200,10 @@ class WebArchiveBundle { $this->write_partition = $i; if (!$this->read_only_archive) { + /* clear the partition array just to avoid memory leak in + crawling setting + */ + $this->partition = []; $info = $this->getArchiveInfo($this->dir_name); $info['WRITE_PARTITION'] = $this->write_partition; $this->setArchiveInfo($this->dir_name, $info); @@ -246,7 +250,7 @@ class WebArchiveBundle $create_flag = false; $compressor = C\NS_LIB . "compressors\\" . $this->compressor; $compressor_obj = new $compressor(); - $archive_name = $this->dir_name."/web_archive_" . $index + $archive_name = $this->dir_name . "/web_archive_" . $index . $compressor_obj->fileExtension(); if (!file_exists($archive_name)) { $create_flag = true; diff --git a/src/locale/zh_CN/resources/segment_word_grams.ftr b/src/locale/zh_CN/resources/segment_word_grams.ftr old mode 100755 new mode 100644 index c39e16021..0a49a222a Binary files a/src/locale/zh_CN/resources/segment_word_grams.ftr and b/src/locale/zh_CN/resources/segment_word_grams.ftr differ