
Tweaks to pruneLinks that should make it weigh random strings in link text less; also add a check to produceFetchBatch to see if a url is still allowed, a=chris

Chris Pollett [2019-06-21]
Filename
src/executables/QueueServer.php
src/library/UrlParser.php
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index cf717c194..0f86d5add 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1337,7 +1337,7 @@ class QueueServer implements CrawlConstants, Join
             } else if (!empty($this->repeat_type) && $this->repeat_type >= 0) {
                 $this->index_archive = new $class_name($dir, false,
                     serialize($info), C\NUM_DOCS_PER_GENERATION,
-                    $info[self::REPEAT_TYPE]);
+                    $this->repeat_type);
                 $this->last_index_save_time = time();
             } else {
                 $this->index_archive = new $class_name($dir, false,
@@ -2205,9 +2205,9 @@ class QueueServer implements CrawlConstants, Join
                 }
                 $scheme = UrlParser::getScheme($host_url);
                 if ($scheme == "gopher") {
-                    $host_with_robots = $host_url."/0/robots.txt";
+                    $host_with_robots = $host_url . "/0/robots.txt";
                 } else {
-                    $host_with_robots = $host_url."/robots.txt";
+                    $host_with_robots = $host_url . "/robots.txt";
                 }
                 $robots_in_queue =
                     $this->web_queue->containsUrlQueue($host_with_robots);
@@ -2358,8 +2358,8 @@ class QueueServer implements CrawlConstants, Join
         $crawl_status['QUEUE_PEAK_MEMORY'] = memory_get_peak_usage();
         file_put_contents($stat_file, serialize($crawl_status), LOCK_EX);
         chmod($stat_file, 0777);
-        L\crawlLog(
-            "End checking for new URLs data memory usage" . memory_get_usage());
+        L\crawlLog("End checking for new URLs data memory usage: " .
+            memory_get_usage());
         L\crawlLog("The current crawl description is: ".
                 $index_archive_info['DESCRIPTION']);
         L\crawlLog("Number of unique pages so far: ".
@@ -2531,6 +2531,14 @@ class QueueServer implements CrawlConstants, Join
                     } else {
                         $robots_okay = true;
                     }
+                    if (!$this->allowedToCrawlSite($url) ||
+                        $this->disallowedToCrawlSite($url)) {
+                        /* This is checked when added to queue,
+                           we check again here in case allowed and disallowed
+                           sites have changed since then
+                         */
+                        $robots_okay = false;
+                    }
                     if (!$robots_okay) {
                         $delete_urls[$i] = $url;
                         $this->web_queue->addSeenUrlFilter($url);
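
The block added above re-applies the crawl's allowed-sites and disallowed-sites filters when a fetch batch is produced, since those crawl options can change while URLs sit in the queue. Below is a minimal standalone PHP sketch of that idea; the real allowedToCrawlSite/disallowedToCrawlSite methods consult the current crawl options, so the prefix lists and matching helper here are illustrative assumptions only.

<?php
// Hypothetical stand-in for the site checks in QueueServer; the real
// methods read the crawl's allowed/disallowed site options.
function urlMatchesSiteList($url, $site_prefixes)
{
    foreach ($site_prefixes as $prefix) {
        if (strncmp($url, $prefix, strlen($prefix)) == 0) {
            return true;
        }
    }
    return false;
}
$allowed_sites = ["https://www.example.com/"];            // assumed crawl option
$disallowed_sites = ["https://www.example.com/private/"]; // assumed crawl option
$url = "https://www.example.com/private/report.html";
$robots_okay = true; // suppose robots.txt itself permitted this fetch
if (!urlMatchesSiteList($url, $allowed_sites) ||
    urlMatchesSiteList($url, $disallowed_sites)) {
    /* same shape as the added check: a robots-okay URL is still dropped
       when the allowed/disallowed lists changed after it was queued */
    $robots_okay = false;
}
echo $robots_okay ? "fetch $url\n" : "skip $url\n"; // prints "skip ..."
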
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index b2a8eb17a..60f66be37 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -888,7 +888,10 @@ class UrlParser
     /**
      * Prunes a list of url => text pairs down to max_link many pairs
      * by choosing those whose text has the most information. Information
-     * crudely measured by the length of the gzipped version of the text.
+     * crudely measured by the effective number of terms in the text.
+     * To compute this, we count the number of terms by splitting on white
+     * space. We then multiply this by the ratio of the compressed length
+     * of the text divided by its uncompressed length.
      *
      * @param array $links list of pairs $url=>$text
      * @param int $max_links maximum number of links from $links to return
@@ -903,11 +906,15 @@ class UrlParser
         $info_link = [];
         // choose the MAX_LINKS_PER_PAGE many pages with most info (crude)
         foreach ($links as $url => $info) {
+            $num_terms = count(preg_split("/\s+/", $info));
             $text = serialize($info);
+            $len_text = strlen($text) + 1;
+            $compressed_len = strlen(gzcompress($text)) + 1;
+            $effective_num_terms = $num_terms * ($compressed_len/$len_text);
             if (!isset($info_link[$url])) {
-                $info_link[$url] = strlen(gzcompress($text));
+                $info_link[$url] = $effective_num_terms;
             } else {
-                $info_link[$url] += strlen(gzcompress($text));
+                $info_link[$url] += $effective_num_terms;
             }
         }
         arsort($info_link);
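
To see why this weighs random strings in link text less, here is a minimal standalone sketch with made-up link texts; it mirrors the arithmetic in the loop above. A long random token contributes only one whitespace-separated term, so its score collapses under the new measure, even though its poorly compressible bytes let it score well under the old gzipped-length measure.

<?php
// Standalone illustration of the revised pruneLinks scoring (inputs assumed).
function effectiveNumTerms($info)
{
    $num_terms = count(preg_split("/\s+/", $info));
    $text = serialize($info);
    $len_text = strlen($text) + 1;
    $compressed_len = strlen(gzcompress($text)) + 1;
    return $num_terms * ($compressed_len / $len_text);
}
$anchor_text = "Faculty meeting minutes June 2019";
$random_token = "b2a8eb17a60f66be37cf717c1940f86d5addeadbeef0123456789abcdef";
// Old measure: gzipped length; the incompressible random token scores
// about as well as, or better than, the short anchor text.
echo strlen(gzcompress(serialize($anchor_text))), " vs ",
    strlen(gzcompress(serialize($random_token))), "\n";
// New measure: effective number of terms; the random token is worth
// roughly one term while the anchor text keeps its five.
echo round(effectiveNumTerms($anchor_text), 2), " vs ",
    round(effectiveNumTerms($random_token), 2), "\n";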