diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index cf717c194..0f86d5add 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1337,7 +1337,7 @@ class QueueServer implements CrawlConstants, Join
         } else if (!empty($this->repeat_type) && $this->repeat_type >= 0) {
             $this->index_archive = new $class_name($dir, false,
                 serialize($info), C\NUM_DOCS_PER_GENERATION,
-                $info[self::REPEAT_TYPE]);
+                $this->repeat_type);
             $this->last_index_save_time = time();
         } else {
             $this->index_archive = new $class_name($dir, false,
@@ -2205,9 +2205,9 @@
             }
             $scheme = UrlParser::getScheme($host_url);
             if ($scheme == "gopher") {
-                $host_with_robots = $host_url."/0/robots.txt";
+                $host_with_robots = $host_url . "/0/robots.txt";
             } else {
-                $host_with_robots = $host_url."/robots.txt";
+                $host_with_robots = $host_url . "/robots.txt";
             }
             $robots_in_queue =
                 $this->web_queue->containsUrlQueue($host_with_robots);
@@ -2358,8 +2358,8 @@
         $crawl_status['QUEUE_PEAK_MEMORY'] = memory_get_peak_usage();
         file_put_contents($stat_file, serialize($crawl_status), LOCK_EX);
         chmod($stat_file, 0777);
-        L\crawlLog(
-            "End checking for new URLs data memory usage" . memory_get_usage());
+        L\crawlLog("End checking for new URLs data memory usage: " .
+            memory_get_usage());
         L\crawlLog("The current crawl description is: ".
             $index_archive_info['DESCRIPTION']);
         L\crawlLog("Number of unique pages so far: ".
@@ -2531,6 +2531,14 @@
             } else {
                 $robots_okay = true;
             }
+            if (!$this->allowedToCrawlSite($url) ||
+                $this->disallowedToCrawlSite($url)) {
+                /* This is checked when added to queue,
+                   we check again here in case allowed and disallowed
+                   sites have changed since then
+                 */
+                $robots_okay = false;
+            }
             if (!$robots_okay) {
                 $delete_urls[$i] = $url;
                 $this->web_queue->addSeenUrlFilter($url);
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index b2a8eb17a..60f66be37 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -888,7 +888,10 @@ class UrlParser
     /**
      * Prunes a list of url => text pairs down to max_link many pairs
      * by choosing those whose text has the most information. Information
-     * crudely measured by the length of the gzipped version of the text.
+     * crudely measured by the effective number of terms in the text.
+     * To compute this, we count the number of terms by splitting on white
+     * space. We then multiply this by the ratio of the compressed length
+     * of the text divided by its uncompressed length.
      *
      * @param array $links list of pairs $url=>$text
      * @param int $max_links maximum number of links from $links to return
@@ -903,11 +906,15 @@
         $info_link = [];
         // choose the MAX_LINKS_PER_PAGE many pages with most info (crude)
         foreach ($links as $url => $info) {
+            $num_terms = count(preg_split("/\s+/", $info));
             $text = serialize($info);
+            $len_text = strlen($text) + 1;
+            $compressed_len = strlen(gzcompress($text)) + 1;
+            $effective_num_terms = $num_terms * ($compressed_len/$len_text);
             if (!isset($info_link[$url])) {
-                $info_link[$url] = strlen(gzcompress($text));
+                $info_link[$url] = $effective_num_terms;
             } else {
-                $info_link[$url] += strlen(gzcompress($text));
+                $info_link[$url] += $effective_num_terms;
             }
         }
         arsort($info_link);
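
A note on the UrlParser change above: a raw term count over-rewards long but repetitive anchor text, so the new score scales the term count by how well the text compresses. The snippet below is a minimal standalone sketch of that idea, not code from the commit; the function name effectiveNumTerms and the sample strings are made up for illustration, and it scores a plain string rather than serialize($info) as the patched method does.

<?php
// Illustrative sketch only (not part of the Yioop patch): the
// "effective number of terms" score is the term count scaled by
// how compressible the text is.
function effectiveNumTerms(string $text): float
{
    // number of whitespace-separated terms
    $num_terms = count(preg_split("/\s+/", trim($text)));
    // +1 on both lengths guards against division by zero on empty input
    $len_text = strlen($text) + 1;
    $compressed_len = strlen(gzcompress($text)) + 1;
    // repetitive text compresses well, shrinking the ratio and the score
    return $num_terms * ($compressed_len / $len_text);
}
// Varied text: 5 terms, compresses poorly, score stays near its term count.
echo effectiveNumTerms("Official PHP documentation for gzcompress"), "\n";
// Repetitive text: 40 terms, but it compresses to a fraction of its length,
// so its score drops well below its raw term count.
echo effectiveNumTerms(str_repeat("click here ", 20)), "\n";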