viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Fix DocIterator to work with new termsfilter

Chris Pollett [2023-12-16, 16h]
Fix DocIterator to work with new termsfilter
Filename
src/controllers/components/CrawlComponent.php
src/executables/Fetcher.php
src/library/CrawlQueueBundle.php
src/library/UrlParser.php
src/library/index_bundle_iterators/DocIterator.php
src/library/index_bundle_iterators/WordIterator.php
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index f84b20285..e7af4871f 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1930,6 +1930,10 @@ class CrawlComponent extends Component implements CrawlConstants
                 $site[self::ENCODING] =
                     L\guessEncodingHtmlXml($data['TESTPAGE']);
             }
+            if (substr($site[self::URL], -strlen("robots.txt"))
+                == "robots.txt") {
+                $site[self::TYPE] = 'text/robot';
+            }
             L\convertUtf8IfNeeded($site, self::PAGE, self::ENCODING);
             $data['TESTPAGE'] = $site[self::PAGE];
             if (empty(PageProcessor::$mime_processor[$site[self::TYPE]])) {
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index d9108ee76..bcd32c6eb 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2466,7 +2466,8 @@ class Fetcher implements CrawlConstants
             }
             $host = UrlParser::getHost($site[self::URL]);
             if (isset($site[self::ROBOT_PATHS])) {
-                if ($site[self::IP_ADDRESSES] == ["0.0.0.0"]) {
+                if ($site[self::IP_ADDRESSES] == ["0.0.0.0"]
+                    && !str_contains($host, "localhost")) {
                     /* probably couldn't find site so this will block
                         from crawl
                     */
diff --git a/src/library/CrawlQueueBundle.php b/src/library/CrawlQueueBundle.php
index 8063def6d..5f690ce23 100644
--- a/src/library/CrawlQueueBundle.php
+++ b/src/library/CrawlQueueBundle.php
@@ -620,17 +620,20 @@ class CrawlQueueBundle
            too early in the crawl before all the seed sites are downloaded
          */
         $exp_max_folder++;
-        $pre_max_folder = floor(log10($exp_max_folder)); /*$exp_max_folder ==2,
+        $pre_max_folder = floor(log($exp_max_folder, 4)); /*$exp_max_folder ==2,
             so $pre_max_folder ==1, second time $exp_max_folder ==3,
             so $pre_max_folder ==1, third time $exp_max_folder ==4,
             so $pre_max_folder ==2, etc. when == C\SITEMAP_TIER_PENALTY,
             all folders will be available below */
+        $num_sub_dirs = count($sub_dirs);
         if ($pre_max_folder >=  C\SITEMAP_TIER_PENALTY) {
-            $pre_max_folder = count($sub_dirs);
+            $pre_max_folder = $num_sub_dirs;
         }
-        $max_folder = min(count($sub_dirs), $pre_max_folder);
-        $last_folder = ($last_folder < $max_folder - 1) ?
+        $max_folder = min($num_sub_dirs, $pre_max_folder);
+        $last_folder = ($last_folder < $max_folder) ?
             $last_folder + 1 : 0;
+        crawlLog("Tier chosen $last_folder, Max Tier Choice $max_folder, ".
+            " Highest Tier $num_sub_dirs, Exp Counter $exp_max_folder");
         return $sub_dirs[$last_folder];
     }
     /**
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index e0b7e6e04..cafffc90c 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -1112,6 +1112,9 @@ class UrlParser
      */
     public static function getCompanyLevelDomain($url)
     {
+        if (preg_match("/^https?\:\/\/localhost\//", $url)) {
+            return "localhost";
+        }
         $subdomains = UrlParser::getHostSubdomains($url);
         if (!isset($subdomains[0]) || !isset($subdomains[2])) {
             return "";
diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php
index bb59a63a3..20d7b6377 100755
--- a/src/library/index_bundle_iterators/DocIterator.php
+++ b/src/library/index_bundle_iterators/DocIterator.php
@@ -278,6 +278,7 @@ class DocIterator extends IndexBundleIterator
         }
         $pre_results = [];
         $num_docs_so_far = 0;
+        $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN;
         do {
             if (($is_ascending && $this->next_offset >= $this->last_offset)
                 || (!$is_ascending && $this->next_offset < 0)) {
@@ -290,7 +291,13 @@ class DocIterator extends IndexBundleIterator
                         $this->direction);
             } else {
                 $doc_id = $doc_keys[$this->next_offset];
-                $doc_info = $doc_map_tools->unpack($doc_map[$doc_id]);
+                $map_entry = $doc_map[$doc_id];
+                // skip term filter if present
+                $map_entry = ($map_entry >= ($termsfilter_len + 1) &&
+                    $map_entry[0] == 't') ?
+                    substr($map_entry, $termsfilter_len + 1) :
+                    $map_entry;
+                $doc_info = $doc_map_tools->unpack($map_entry);
                 $item = [self::GENERATION => $this->current_generation];
                 $item[self::DOC_RANK] = $this->computeDocRank($doc_id,
                     $this->next_offset, $this->current_generation,
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 385e3fb0e..0b35c4a89 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -551,6 +551,7 @@ class WordIterator extends IndexBundleIterator
                 continue;
             }
             $doc_key = substr($entry, 0, $docid_len);
+            $is_text = IndexDocumentBundle::isType($doc_key, "text");
             /**
              * For backward compatibility: only check for the latest
              * crawled version of a page if $entry[24] == 't'
@@ -562,7 +563,7 @@ class WordIterator extends IndexBundleIterator
                 substr($entry, $docid_len + $termsfilter_len + 1) :
                 substr($entry, $docid_len);
             if ($this->retrieve_latest && $entry[$docid_len] == 't' &&
-                IndexDocumentBundle::isType($doc_key, "text")) {
+                $is_text) {
                 $url_hash = substr($doc_key, 0, 8);
                 $latest_version_info =
                     IndexManager::lookupLatestVersionPage($url_hash,
ViewGit