viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index f84b20285..e7af4871f 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -1930,6 +1930,10 @@ class CrawlComponent extends Component implements CrawlConstants $site[self::ENCODING] = L\guessEncodingHtmlXml($data['TESTPAGE']); } + if (substr($site[self::URL], -strlen("robots.txt")) + == "robots.txt") { + $site[self::TYPE] = 'text/robot'; + } L\convertUtf8IfNeeded($site, self::PAGE, self::ENCODING); $data['TESTPAGE'] = $site[self::PAGE]; if (empty(PageProcessor::$mime_processor[$site[self::TYPE]])) { diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index d9108ee76..bcd32c6eb 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2466,7 +2466,8 @@ class Fetcher implements CrawlConstants } $host = UrlParser::getHost($site[self::URL]); if (isset($site[self::ROBOT_PATHS])) { - if ($site[self::IP_ADDRESSES] == ["0.0.0.0"]) { + if ($site[self::IP_ADDRESSES] == ["0.0.0.0"] + && !str_contains($host, "localhost")) { /* probably couldn't find site so this will block from crawl */ diff --git a/src/library/CrawlQueueBundle.php b/src/library/CrawlQueueBundle.php index 8063def6d..5f690ce23 100644 --- a/src/library/CrawlQueueBundle.php +++ b/src/library/CrawlQueueBundle.php @@ -620,17 +620,20 @@ class CrawlQueueBundle too early in the crawl before all the seed sites are downloaded */ $exp_max_folder++; - $pre_max_folder = floor(log10($exp_max_folder)); /*$exp_max_folder ==2, + $pre_max_folder = floor(log($exp_max_folder, 4)); /*$exp_max_folder ==2, so $pre_max_folder ==1, second time $exp_max_folder ==3, so $pre_max_folder ==1, third time $exp_max_folder ==4, so $pre_max_folder ==2, etc. 
when == C\SITEMAP_TIER_PENALTY, all folders will be available below */ + $num_sub_dirs = count($sub_dirs); if ($pre_max_folder >= C\SITEMAP_TIER_PENALTY) { - $pre_max_folder = count($sub_dirs); + $pre_max_folder = $num_sub_dirs; } - $max_folder = min(count($sub_dirs), $pre_max_folder); - $last_folder = ($last_folder < $max_folder - 1) ? + $max_folder = min($num_sub_dirs, $pre_max_folder); + $last_folder = ($last_folder < $max_folder) ? $last_folder + 1 : 0; + crawlLog("Tier chosen $last_folder, Max Tier Choice $max_folder, ". + " Highest Tier $num_sub_dirs, Exp Counter $exp_max_folder"); return $sub_dirs[$last_folder]; } /** diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index e0b7e6e04..cafffc90c 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -1112,6 +1112,9 @@ class UrlParser */ public static function getCompanyLevelDomain($url) { + if (preg_match("/^https?\:\/\/localhost\//", $url)) { + return "localhost"; + } $subdomains = UrlParser::getHostSubdomains($url); if (!isset($subdomains[0]) || !isset($subdomains[2])) { return ""; diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php index bb59a63a3..20d7b6377 100755 --- a/src/library/index_bundle_iterators/DocIterator.php +++ b/src/library/index_bundle_iterators/DocIterator.php @@ -278,6 +278,7 @@ class DocIterator extends IndexBundleIterator } $pre_results = []; $num_docs_so_far = 0; + $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN; do { if (($is_ascending && $this->next_offset >= $this->last_offset) || (!$is_ascending && $this->next_offset < 0)) { @@ -290,7 +291,13 @@ class DocIterator extends IndexBundleIterator $this->direction); } else { $doc_id = $doc_keys[$this->next_offset]; - $doc_info = $doc_map_tools->unpack($doc_map[$doc_id]); + $map_entry = $doc_map[$doc_id]; + // skip term filter if present ('t' marker byte + filter bytes lead the entry) + $map_entry = (strlen($map_entry) >= ($termsfilter_len + 1) && + $map_entry[0] == 't') ? 
+ substr($map_entry, $termsfilter_len + 1) : + $map_entry; + $doc_info = $doc_map_tools->unpack($map_entry); $item = [self::GENERATION => $this->current_generation]; $item[self::DOC_RANK] = $this->computeDocRank($doc_id, $this->next_offset, $this->current_generation, diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 385e3fb0e..0b35c4a89 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -551,6 +551,7 @@ class WordIterator extends IndexBundleIterator continue; } $doc_key = substr($entry, 0, $docid_len); + $is_text = IndexDocumentBundle::isType($doc_key, "text"); /** * For backward compatibility: only check for the latest * crawled version of a page if $entry[24] == 't' @@ -562,7 +563,7 @@ class WordIterator extends IndexBundleIterator substr($entry, $docid_len + $termsfilter_len + 1) : substr($entry, $docid_len); if ($this->retrieve_latest && $entry[$docid_len] == 't' && - IndexDocumentBundle::isType($doc_key, "text")) { + $is_text) { $url_hash = substr($doc_key, 0, 8); $latest_version_info = IndexManager::lookupLatestVersionPage($url_hash,