viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/executables/Fetcher.php | |
src/executables/QueueServer.php | |
src/library/IndexDictionary.php | |
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 031baece0..6b5c16d63 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2889,9 +2889,13 @@ class Fetcher implements CrawlConstants robots.txt. Sitemap will still be in TO_CRAWL, but that's done elsewhere */ - if (strlen($url) == 0 || is_numeric($url)) { continue; } + if (strlen($url) == 0 || is_numeric($url)) { + continue; + } $link_host = UrlParser::getHost($url); - if (strlen($link_host) == 0) continue; + if (strlen($link_host) == 0) { + continue; + } $part_num = L\calculatePartition($link_host, $num_queue_servers); $summary = []; diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 5adc90db4..0084776fd 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -1651,7 +1651,7 @@ class QueueServer implements CrawlConstants, Join $host = UrlParser::getHost($site_url); $hash = L\crawlHash($site_url, true). $site[self::HASH] . - "d". substr(L\crawlHash($host."/",true), 1); + "d". substr(L\crawlHash($host."/", true), 1); } else { $hash = $site[self::HASH_URL]; } diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php index 2760d11fa..1aa5b792f 100644 --- a/src/library/IndexDictionary.php +++ b/src/library/IndexDictionary.php @@ -719,8 +719,10 @@ class IndexDictionary implements CrawlConstants * @param int $threshold if greater than zero how many posting list * results in dictionary info returned before stopping looking for * more matches - * @param int $start_generation - * @param int $num_distinct_generations + * @param int $start_generation which index shard in inverted index to + * start search from + * @param int $num_distinct_generations how many shards to consider after + * $start_generation * @param bool $with_remaining_total * @return mixed an array of entries of the form * generation, first offset, last offset, count @@ -827,9 +829,9 @@ class IndexDictionary implements CrawlConstants docs and links. If an entry has more than max_entry_count we will assume entry somehow got corrupted and skip that generation for that word. Because we are including link have - set threshold to 5 * number of docs that could be in a shard + set threshold to 100 * number of docs that could be in a shard */ - $max_entry_count = 5 * C\NUM_DOCS_PER_GENERATION; + $max_entry_count = 100 * C\NUM_DOCS_PER_GENERATION; $total_count = 0; $prefix = ord($word_id[1]); $prefix_info = $this->getDictSubstring($file_num, @@ -915,7 +917,7 @@ class IndexDictionary implements CrawlConstants $this->addAuxInfoRecords($id ,$file_num, $num_aux_records, $total_count, $threshold, $info, $previous_generation, $num_generations, $start + - ($check_loc + 1)* $word_item_len, + ($check_loc + 1) * $word_item_len, $num_distinct_generations, $max_retained_generation, $id_info); $check_and_auxes += $num_aux_records;