viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Tweak to values in a dictionary sanity check

Chris Pollett [2018-06-19 21:Jun:th]
Tweak to values in a dictionary sanity check
Filename
src/executables/Fetcher.php
src/executables/QueueServer.php
src/library/IndexDictionary.php
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 031baece0..6b5c16d63 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2889,9 +2889,13 @@ class Fetcher implements CrawlConstants
                       robots.txt. Sitemap will still be in TO_CRAWL, but that's
                       done elsewhere
                      */
-                    if (strlen($url) == 0 || is_numeric($url)) { continue; }
+                    if (strlen($url) == 0 || is_numeric($url)) {
+                        continue;
+                    }
                     $link_host = UrlParser::getHost($url);
-                    if (strlen($link_host) == 0) continue;
+                    if (strlen($link_host) == 0) {
+                        continue;
+                    }
                     $part_num = L\calculatePartition($link_host,
                         $num_queue_servers);
                     $summary = [];
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 5adc90db4..0084776fd 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1651,7 +1651,7 @@ class QueueServer implements CrawlConstants, Join
                         $host = UrlParser::getHost($site_url);
                         $hash = L\crawlHash($site_url, true).
                             $site[self::HASH] .
-                            "d". substr(L\crawlHash($host."/",true), 1);
+                            "d". substr(L\crawlHash($host."/", true), 1);
                     } else {
                         $hash = $site[self::HASH_URL];
                     }
diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php
index 2760d11fa..1aa5b792f 100644
--- a/src/library/IndexDictionary.php
+++ b/src/library/IndexDictionary.php
@@ -719,8 +719,10 @@ class IndexDictionary implements CrawlConstants
      * @param int $threshold if greater than zero how many posting list
      *    results in dictionary info returned before stopping looking for
      *    more matches
-     * @param int $start_generation
-     * @param int $num_distinct_generations
+     * @param int $start_generation which index shard in inverted index to
+     *      start search from
+     * @param int $num_distinct_generations how many shards to consider after
+     *      $start_generation
      * @param bool $with_remaining_total
      * @return mixed an array of entries of the form
      *     generation, first offset, last offset, count
@@ -827,9 +829,9 @@ class IndexDictionary implements CrawlConstants
             docs and links. If an entry has more than max_entry_count
             we will assume entry somehow got corrupted and skip that
             generation for that word. Because we are including link have
-            set threshold to 5 * number of docs that could be in a shard
+            set threshold to 100 * number of docs that could be in a shard
          */
-        $max_entry_count = 5 * C\NUM_DOCS_PER_GENERATION;
+        $max_entry_count = 100 * C\NUM_DOCS_PER_GENERATION;
         $total_count = 0;
         $prefix = ord($word_id[1]);
         $prefix_info = $this->getDictSubstring($file_num,
@@ -915,7 +917,7 @@ class IndexDictionary implements CrawlConstants
                     $this->addAuxInfoRecords($id ,$file_num, $num_aux_records,
                         $total_count, $threshold, $info, $previous_generation,
                         $num_generations, $start +
-                        ($check_loc + 1)* $word_item_len,
+                        ($check_loc + 1) * $word_item_len,
                         $num_distinct_generations, $max_retained_generation,
                         $id_info);
                     $check_and_auxes += $num_aux_records;
ViewGit