viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Fixes bugs in ArcTool related to FeedArchiveBundles, a=chris

Chris Pollett [2020-01-02 17:Jan:nd]
Fixes bugs in ArcTool related to FeedArchiveBundles, a=chris
Filename
src/executables/ArcTool.php
src/library/IndexManager.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 685336012..a2ce8d6e8 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -200,8 +200,8 @@ class ArcTool implements CrawlConstants
                 $name = $this->getArchiveName($archive_path);
                 echo $name . " ";
                 $archive_type = $this->getArchiveKind($archive_path);
-                if (in_array($archive_type, ["IndexArchiveBundle",
-                    "DoubleIndexBundle"])) {
+                if (in_array($archive_type, ["FeedArchiveBundle",
+                    "DoubleIndexBundle", "IndexArchiveBundle",])) {
                     $bundle_class = C\NS_LIB . $archive_type;
                     $info = $bundle_class::getArchiveInfo($archive_path);
                     $info = unserialize($info["DESCRIPTION"]);
@@ -245,8 +245,8 @@ class ArcTool implements CrawlConstants
         if ($archive_type === false) {
             $this->badFormatMessageAndExit($archive_path);
         }
-        if (in_array($archive_type, ["IndexArchiveBundle", "WebArchiveBundle",
-            "DoubleIndexBundle"])){
+        if (in_array($archive_type, ["DoubleIndexBundle", "FeedArchiveBundle",
+            "IndexArchiveBundle", "WebArchiveBundle",])){
             $call = "outputInfo" . $archive_type;
             $archive_name = C\NS_LIB . $archive_type;
             $info = $archive_name::getArchiveInfo($archive_path);
@@ -274,13 +274,13 @@ class ArcTool implements CrawlConstants
         echo "\nBundle Name: $bundle_name\n";
         $archive_type = $this->getArchiveKind($archive_path);
         echo "Bundle Type: $archive_type\n";
-        if (!in_array($archive_type, ["IndexArchiveBundle",
-            "DoubleIndexBundle"])) {
+        if (!in_array($archive_type, ["FeedArchiveBundle",
+            "DoubleIndexBundle", "IndexArchiveBundle",])) {
             $this->badFormatMessageAndExit($archive_path, "index");
         }
         preg_match("/\d+$/", $archive_path, $matches);
         $index_timestamp = (isset($matches[0])) ? $matches[0] : 0;
-        if ($bundle_num >= 0) {
+        if (isset($bundle_num) && $bundle_num >= 0) {
             $index_timestamp .= "-$bundle_num";
         } else if ($bundle_name == "IndexDataFeed") {
             $index_timestamp = "feed";
@@ -348,8 +348,8 @@ class ArcTool implements CrawlConstants
         echo "\nBundle Name: $bundle_name\n";
         $archive_type = $this->getArchiveKind($archive_path);
         echo "Bundle Type: $archive_type\n";
-        if (!in_array($archive_type, ["IndexArchiveBundle",
-            "DoubleIndexBundle"])) {
+        if (!in_array($archive_type, ["FeedArchiveBundle",
+            "DoubleIndexBundle", "IndexArchiveBundle",])) {
             $this->badFormatMessageAndExit($archive_path, "index");
         }
         preg_match("/\d+$/", $archive_path, $matches);
@@ -365,8 +365,13 @@ class ArcTool implements CrawlConstants
         echo "Number of Generations: $num_generations\n";
         echo "\nShard Information for Generation $generation\n";
         echo "====================================\n";
+        $_SERVER["NO_LOGGING"] = true;
         $shard = $index->getCurrentShard(true);
-        echo "Number of Distinct Terms Indexed: ".count($shard->words)."\n";
+        if ($shard === null) {
+            echo "This shard's word info is already in bundles dictionary\n";
+            return;
+        }
+        echo "Number of Distinct Terms Indexed: " . count($shard->words)."\n";
         echo "Number of Docs in Shard: " . $shard->num_docs."\n";
         echo "Number of Link Items in Shard: ".$shard->num_link_docs."\n";
         echo "Total Links and Docs: ".($shard->num_docs +
@@ -405,17 +410,26 @@ class ArcTool implements CrawlConstants
         echo "\nBundle Name: $bundle_name\n";
         $archive_type = $this->getArchiveKind($archive_path);
         echo "Bundle Type: $archive_type\n";
-        if (!in_array($archive_type, ["IndexArchiveBundle",
-            "DoubleIndexBundle"])) {
+        if (!in_array($archive_type, ["FeedArchiveBundle",
+            "DoubleIndexBundle", "IndexArchiveBundle",])) {
             $this->badFormatMessageAndExit($archive_path, "index");
         }
         preg_match("/\d+$/", $archive_path, $matches);
         $index_timestamp = (isset($matches[0])) ? $matches[0] : 0;
-        if ($bundle_num >= 0) {
+        if (isset($bundle_num) && $bundle_num >= 0) {
             $index_timestamp .= "-$bundle_num";
+        } else if ($bundle_name == "IndexDataFeed") {
+            $index_timestamp = "feed";
         }
         $index = IndexManager::getIndex($index_timestamp);
-        $num_generations = $index->generation_info["ACTIVE"] + 1;
+        if (isset($index->generation_info["ACTIVE"])) {
+            $num_generations = $index->generation_info["ACTIVE"] + 1;
+        } else if (isset($index->generation_info["CURRENT"])) {
+            $num_generations = $index->generation_info["CURRENT"] + 1;
+        } else {
+            echo "Archive does not appear to have data yet";
+            exit();
+        }
         $count = 0;
         $visited_urls_count = 0;
         echo "Shard Counts\n===========\n";
@@ -468,15 +482,15 @@ class ArcTool implements CrawlConstants
         echo "Bundle Type: $archive_type\n";
         echo "Generation: $generation\n";
         echo "Offset: $offset\n";
-        if (!in_array($archive_type, ["IndexArchiveBundle",
-            "DoubleIndexBundle"])) {
+        if (!in_array($archive_type, ["FeedArchiveBundle",
+            "DoubleIndexBundle", "IndexArchiveBundle",])) {
             $this->badFormatMessageAndExit($archive_path, "index");
         }
         preg_match("/\d+$/", $archive_path, $matches);
         $index_timestamp = (isset($matches[0])) ? $matches[0] : 0;
-        if ($bundle_num >= 0) {
+        if (isset($bundle_num) && $bundle_num >= 0) {
             $index_timestamp .= "-$bundle_num";
-        } else if ($bundle_num = "IndexDataFeed") {
+        } else if ($bundle_name == "IndexDataFeed") {
             $index_timestamp = "feed";
         }
         $index = IndexManager::getIndex($index_timestamp);
@@ -656,6 +670,28 @@ class ArcTool implements CrawlConstants
                 "  does not contain posting shards so cannot be re-indexed\n\n";
         }
     }
+    /**
+     * Outputs to stdout header information for a FeedArchiveBundle
+     * bundle by delegating to outputInfoIndexArchiveBundle.
+     *
+     * @param array $info header info that has already been read from
+     *     the description.txt file
+     * @param string $archive_path file path of the folder containing the bundle
+     * @param string $alternate_description used as the text for description
+     *      rather than what's given in $info
+     * @param bool $only_storage_info output only info about storage statistics
+     *      don't output info about crawl parameters
+     * @param bool $only_crawl_params output only info about crawl parameters
+     *      not storage statistics
+     */
+    public function outputInfoFeedArchiveBundle($info, $archive_path,
+        $alternate_description = "", $only_storage_info = false,
+        $only_crawl_params = false)
+    {
+        $this->outputInfoIndexArchiveBundle($info, $archive_path,
+            $alternate_description, $only_storage_info,
+            $only_crawl_params);
+    }
     /**
      * Outputs to stdout header information for a IndexArchiveBundle
      * bundle.
@@ -709,30 +745,35 @@ class ArcTool implements CrawlConstants
             "Breadth First" : "Page Importance";
         echo "Crawl order was: $crawl_order\n";
         $channel = (isset($info[self::CHANNEL])) ? $info[self::CHANNEL] : 0;
-        echo "Crawl Channel was: $channel.";
-        echo "Seed sites:\n";
-        foreach ($info[self::TO_CRAWL] as $seed) {
-            echo "   $seed\n";
-        }
-        if ($info[self::RESTRICT_SITES_BY_URL]) {
-            echo "Sites allowed to crawl:\n";
-            foreach ($info[self::ALLOWED_SITES] as $site) {
-                echo "   $site\n";
+        echo "Crawl Channel was: $channel.\n";
+        if ($info['DESCRIPTION'] == 'feed') {
+            echo "Feed Bundle, look at SearchSsources in web interface to see";
+            echo "\n feed sources.\n";
+        } else {
+            echo "Seed sites:\n";
+            foreach ($info[self::TO_CRAWL] as $seed) {
+                echo "   $seed\n";
             }
-        }
-        echo "Sites not allowed to be crawled:\n";
-        if (is_array($info[self::DISALLOWED_SITES])) {
-            foreach ($info[self::DISALLOWED_SITES] as $site) {
-                echo "   $site\n";
+            if ($info[self::RESTRICT_SITES_BY_URL]) {
+                echo "Sites allowed to crawl:\n";
+                foreach ($info[self::ALLOWED_SITES] as $site) {
+                    echo "   $site\n";
+                }
             }
-        }
-        echo "Page Rules:\n";
-        if (isset($info[self::PAGE_RULES])) {
-            foreach ($info[self::PAGE_RULES] as $rule) {
-                echo "   $rule\n";
+            echo "Sites not allowed to be crawled:\n";
+            if (is_array($info[self::DISALLOWED_SITES])) {
+                foreach ($info[self::DISALLOWED_SITES] as $site) {
+                    echo "   $site\n";
+                }
             }
+            echo "Page Rules:\n";
+            if (isset($info[self::PAGE_RULES])) {
+                foreach ($info[self::PAGE_RULES] as $rule) {
+                    echo "   $rule\n";
+                }
+            }
+            echo "\n";
         }
-        echo "\n";
     }
     /**
      * Outputs to stdout header information for a DoubleIndexBundle
@@ -854,7 +895,7 @@ class ArcTool implements CrawlConstants
         } else if ($archive_type == "WebArchiveBundle") {
             $info = $archive_name::getArchiveInfo($archive_path);
             $num = min($num, $info["COUNT"] - $start);
-            $num_generations = $info["WRITE_PARTITION"]+1;
+            $num_generations = $info["WRITE_PARTITION"] + 1;
             $archive = new WebArchiveBundle($archive_path);
         } else {
             $nonyioop = true;
@@ -973,8 +1014,8 @@ class ArcTool implements CrawlConstants
         }
         $archive_type = $this->getArchiveKind($archive_path);
         $archive_name = C\NS_LIB . $archive_type ;
-        if (!in_array($archive_type, ["IndexArchiveBundle",
-            "DoubleIndexBundle"])) {
+        if (!in_array($archive_type, ["FeedArchiveBundle",
+            "DoubleIndexBundle", "IndexArchiveBundle",])) {
             $this->badFormatMessageAndExit($archive_path, "index");
         }
         preg_match("/\d+$/", $archive_path, $matches);
@@ -1165,6 +1206,9 @@ class ArcTool implements CrawlConstants
         if (file_exists("$archive_path/description.txt")) {
             return "WebArchiveBundle";
         }
+        if (file_exists("$archive_path/filter_a.ftr")) {
+            return "FeedArchiveBundle";
+        }
         if (file_exists("$archive_path/summaries/description.txt")) {
             return "IndexArchiveBundle";
         }
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index e10b4c200..53e022de3 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -207,8 +207,8 @@ class IndexManager implements CrawlConstants
             $active_generation = $index->generation_info['ACTIVE'];
             if ((empty($index->generation_info['LAST_DICTIONARY_SHARD']) ||
                 $index->generation_info['LAST_DICTIONARY_SHARD'] <
-                $active_generation) && $active_generation <
-                $last_desired_generation) {
+                $active_generation) && ($active_generation <
+                $last_desired_generation || $last_desired_generation < 0)) {
                 $active_shard_file = $index->dir_name .
                     "/posting_doc_shards/index" . $active_generation;
                 if (file_exists($active_shard_file)) {
ViewGit