viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 685336012..a2ce8d6e8 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -200,8 +200,8 @@ class ArcTool implements CrawlConstants $name = $this->getArchiveName($archive_path); echo $name . " "; $archive_type = $this->getArchiveKind($archive_path); - if (in_array($archive_type, ["IndexArchiveBundle", - "DoubleIndexBundle"])) { + if (in_array($archive_type, ["FeedArchiveBundle", + "DoubleIndexBundle", "IndexArchiveBundle",])) { $bundle_class = C\NS_LIB . $archive_type; $info = $bundle_class::getArchiveInfo($archive_path); $info = unserialize($info["DESCRIPTION"]); @@ -245,8 +245,8 @@ class ArcTool implements CrawlConstants if ($archive_type === false) { $this->badFormatMessageAndExit($archive_path); } - if (in_array($archive_type, ["IndexArchiveBundle", "WebArchiveBundle", - "DoubleIndexBundle"])){ + if (in_array($archive_type, ["DoubleIndexBundle", "FeedArchiveBundle", + "IndexArchiveBundle", "WebArchiveBundle",])){ $call = "outputInfo" . $archive_type; $archive_name = C\NS_LIB . $archive_type; $info = $archive_name::getArchiveInfo($archive_path); @@ -274,13 +274,13 @@ class ArcTool implements CrawlConstants echo "\nBundle Name: $bundle_name\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: $archive_type\n"; - if (!in_array($archive_type, ["IndexArchiveBundle", - "DoubleIndexBundle"])) { + if (!in_array($archive_type, ["FeedArchiveBundle", + "DoubleIndexBundle", "IndexArchiveBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; - if ($bundle_num >= 0) { + if (isset($bundle_num) && $bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; } else if ($bundle_name == "IndexDataFeed") { $index_timestamp = "feed"; @@ -348,8 +348,8 @@ class ArcTool implements CrawlConstants echo "\nBundle Name: $bundle_name\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: $archive_type\n"; - if (!in_array($archive_type, ["IndexArchiveBundle", - "DoubleIndexBundle"])) { + if (!in_array($archive_type, ["FeedArchiveBundle", + "DoubleIndexBundle", "IndexArchiveBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); @@ -365,8 +365,13 @@ class ArcTool implements CrawlConstants echo "Number of Generations: $num_generations\n"; echo "\nShard Information for Generation $generation\n"; echo "====================================\n"; + $_SERVER["NO_LOGGING"] = true; $shard = $index->getCurrentShard(true); - echo "Number of Distinct Terms Indexed: ".count($shard->words)."\n"; + if ($shard === null) { + echo "This shard's word info is already in bundles dictionary\n"; + return; + } + echo "Number of Distinct Terms Indexed: " . count($shard->words)."\n"; echo "Number of Docs in Shard: " . $shard->num_docs."\n"; echo "Number of Link Items in Shard: ".$shard->num_link_docs."\n"; echo "Total Links and Docs: ".($shard->num_docs + @@ -405,17 +410,26 @@ class ArcTool implements CrawlConstants echo "\nBundle Name: $bundle_name\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: $archive_type\n"; - if (!in_array($archive_type, ["IndexArchiveBundle", - "DoubleIndexBundle"])) { + if (!in_array($archive_type, ["FeedArchiveBundle", + "DoubleIndexBundle", "IndexArchiveBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; - if ($bundle_num >= 0) { + if (isset($bundle_num) && $bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; + } else if ($bundle_name == "IndexDataFeed") { + $index_timestamp = "feed"; } $index = IndexManager::getIndex($index_timestamp); - $num_generations = $index->generation_info["ACTIVE"] + 1; + if (isset($index->generation_info["ACTIVE"])) { + $num_generations = $index->generation_info["ACTIVE"] + 1; + } else if (isset($index->generation_info["CURRENT"])) { + $num_generations = $index->generation_info["CURRENT"] + 1; + } else { + echo "Archive does not appear to have data yet"; + exit(); + } $count = 0; $visited_urls_count = 0; echo "Shard Counts\n===========\n"; @@ -468,15 +482,15 @@ class ArcTool implements CrawlConstants echo "Bundle Type: $archive_type\n"; echo "Generation: $generation\n"; echo "Offset: $offset\n"; - if (!in_array($archive_type, ["IndexArchiveBundle", - "DoubleIndexBundle"])) { + if (!in_array($archive_type, ["FeedArchiveBundle", + "DoubleIndexBundle", "IndexArchiveBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; - if ($bundle_num >= 0) { + if (isset($bundle_num) && $bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; - } else if ($bundle_num = "IndexDataFeed") { + } else if ($bundle_name == "IndexDataFeed") { $index_timestamp = "feed"; } $index = IndexManager::getIndex($index_timestamp); @@ -656,6 +670,28 @@ class ArcTool implements CrawlConstants " does not contain posting shards so cannot be re-indexed\n\n"; } } + /** + * Outputs to stdout header information for a FeedArchiveBundle + * bundle. + * + * @param array $info header info that has already been read from + * the description.txt file + * @param string $archive_path file path of the folder containing the bundle + * @param string $alternate_description used as the text for description + * rather than what's given in $info + * @param bool $only_storage_info output only info about storage statistics + * don't output info about crawl parameters + * @param bool $only_crawl_params output only info about crawl parameters + * not storage statistics + */ + public function outputInfoFeedArchiveBundle($info, $archive_path, + $alternate_description = "", $only_storage_info = false, + $only_crawl_params = false) + { + $this->outputInfoIndexArchiveBundle($info, $archive_path, + $alternate_description, $only_storage_info, + $only_crawl_params); + } /** * Outputs to stdout header information for a IndexArchiveBundle * bundle. @@ -709,30 +745,35 @@ class ArcTool implements CrawlConstants "Breadth First" : "Page Importance"; echo "Crawl order was: $crawl_order\n"; $channel = (isset($info[self::CHANNEL])) ? $info[self::CHANNEL] : 0; - echo "Crawl Channel was: $channel."; - echo "Seed sites:\n"; - foreach ($info[self::TO_CRAWL] as $seed) { - echo " $seed\n"; - } - if ($info[self::RESTRICT_SITES_BY_URL]) { - echo "Sites allowed to crawl:\n"; - foreach ($info[self::ALLOWED_SITES] as $site) { - echo " $site\n"; + echo "Crawl Channel was: $channel.\n"; + if ($info['DESCRIPTION'] == 'feed') { + echo "Feed Bundle, look at SearchSsources in web interface to see"; + echo "\n feed sources.\n"; + } else { + echo "Seed sites:\n"; + foreach ($info[self::TO_CRAWL] as $seed) { + echo " $seed\n"; } - } - echo "Sites not allowed to be crawled:\n"; - if (is_array($info[self::DISALLOWED_SITES])) { - foreach ($info[self::DISALLOWED_SITES] as $site) { - echo " $site\n"; + if ($info[self::RESTRICT_SITES_BY_URL]) { + echo "Sites allowed to crawl:\n"; + foreach ($info[self::ALLOWED_SITES] as $site) { + echo " $site\n"; + } } - } - echo "Page Rules:\n"; - if (isset($info[self::PAGE_RULES])) { - foreach ($info[self::PAGE_RULES] as $rule) { - echo " $rule\n"; + echo "Sites not allowed to be crawled:\n"; + if (is_array($info[self::DISALLOWED_SITES])) { + foreach ($info[self::DISALLOWED_SITES] as $site) { + echo " $site\n"; + } } + echo "Page Rules:\n"; + if (isset($info[self::PAGE_RULES])) { + foreach ($info[self::PAGE_RULES] as $rule) { + echo " $rule\n"; + } + } + echo "\n"; } - echo "\n"; } /** * Outputs to stdout header information for a DoubleIndexBundle @@ -854,7 +895,7 @@ class ArcTool implements CrawlConstants } else if ($archive_type == "WebArchiveBundle") { $info = $archive_name::getArchiveInfo($archive_path); $num = min($num, $info["COUNT"] - $start); - $num_generations = $info["WRITE_PARTITION"]+1; + $num_generations = $info["WRITE_PARTITION"] + 1; $archive = new WebArchiveBundle($archive_path); } else { $nonyioop = true; @@ -973,8 +1014,8 @@ class ArcTool implements CrawlConstants } $archive_type = $this->getArchiveKind($archive_path); $archive_name = C\NS_LIB . $archive_type ; - if (!in_array($archive_type, ["IndexArchiveBundle", - "DoubleIndexBundle"])) { + if (!in_array($archive_type, ["FeedArchiveBundle", + "DoubleIndexBundle", "IndexArchiveBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); @@ -1165,6 +1206,9 @@ class ArcTool implements CrawlConstants if (file_exists("$archive_path/description.txt")) { return "WebArchiveBundle"; } + if (file_exists("$archive_path/filter_a.ftr")) { + return "FeedArchiveBundle"; + } if (file_exists("$archive_path/summaries/description.txt")) { return "IndexArchiveBundle"; } diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index e10b4c200..53e022de3 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -207,8 +207,8 @@ class IndexManager implements CrawlConstants $active_generation = $index->generation_info['ACTIVE']; if ((empty($index->generation_info['LAST_DICTIONARY_SHARD']) || $index->generation_info['LAST_DICTIONARY_SHARD'] < - $active_generation) && $active_generation < - $last_desired_generation) { + $active_generation) && ($active_generation < + $last_desired_generation || $last_desired_generation < 0)) { $active_shard_file = $index->dir_name . "/posting_doc_shards/index" . $active_generation; if (file_exists($active_shard_file)) {