<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\executables;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\BloomFilterFile;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\FetchUrl;
use seekquarry\yioop\library\IndexArchiveBundle;
use seekquarry\yioop\library\IndexDocumentBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\IndexShard;
use seekquarry\yioop\library\PackedTableTools;
use seekquarry\yioop\library\PartitionDocumentBundle;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\UrlParser;
use seekquarry\yioop\library\WebArchiveBundle;
use seekquarry\yioop\library\media_jobs\FeedsUpdateJob;
use seekquarry\yioop\controllers\AdminController;

/*
 * This is a command-line-only maintenance tool; refuse to run under a web
 * SAPI or when this Yioop instance is serving as its own web server.
 * (Strict !== used: php_sapi_name() returns a string, so a loose compare
 * gains nothing and risks type-juggling surprises.)
 */
if (php_sapi_name() !== 'cli'
    || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
    echo "BAD REQUEST";
    exit();
}
/** This tool does not need logging*/
$_SERVER["LOG_TO_FILES"] = false;
/** USE_CACHE false rules out file cache as well*/
$_SERVER["USE_CACHE"] = false; /** For crawlHash, crawlHashWord function */ require_once __DIR__."/../library/Utility.php"; if (!C\PROFILE) { echo "Please configure the search engine instance by visiting" . "its web interface on localhost.\n"; exit(); } ini_set("memory_limit", C\ARC_TOOL_MEMORY_LIMIT); /*reading in a whole shard might take a fair bit of memory */ /* * We'll set up multi-byte string handling to use UTF-8 */ mb_internal_encoding("UTF-8"); mb_regex_encoding("UTF-8"); /** * Command line program that allows one to examine the content of * the WebArchiveBundles and IndexArchiveBundles of Yioop crawls. * To see all of the available command run it from the command line * with a syntax like: * * php ArcTool.php * * @author Chris Pollett (non-yioop archive code derived from earlier * stuff by Shawn Tice) */ class ArcTool extends DictionaryUpdater implements CrawlConstants { /** * The maximum number of documents the ArcTool list function * will read into memory in one go. */ const MAX_BUFFER_DOCS = 200; /** * The maximum number of documents the ArcTool will rebuild/migrate in * one go */ const MAX_REBUILD_DOCS = 8000; /** * Initializes the ArcTool, for now does nothing */ public function __construct() { } /** * Runs the ArcTool on the supplied command line arguments */ public function start() { global $argv; if (!isset($argv[1]) || (!isset($argv[2]) && $argv[1] != "list") || (!isset($argv[3]) && in_array($argv[1], [ "dict", "inject", "make-filter"] ) || ( !isset($argv[4]) && $argv[1] == "doc-lookup")) ) { $this->usageMessageAndExit(); } if (!in_array($argv[1], [ "check-filter", "make-filter", "list"])) { if (!in_array($argv[1], ["inject"])) { $path = UrlParser::getDocumentFilename($argv[2]); if ($path == $argv[2] && !file_exists($path)) { $path = C\CACHE_DIR . "/" . $path; if (!file_exists($path)) { $path = C\ARCHIVES_DIR . "/" . 
$argv[2]; } } $kind = $this->getArchiveKind($path); if ($kind == "DoubleIndexBundle" && !in_array( $argv[1], ["info", "migrate"]) ) { $bundle_num = $argv[3]; unset($argv[3]); $argv = array_values($argv); $path .= "-$bundle_num"; } } else if (is_numeric($argv[2])) { $path = $argv[2]; } else { $this->usageMessageAndExit(); } } switch ($argv[1]) { case "check-filter": $this->checkFilter($argv[2], $argv[3]); break; case "count": if (!isset($argv[3])) { $argv[3] = false; } $this->outputCountBundle($path, $argv[3]); break; case "doc-lookup": $this->outputDocLookup($argv[2], intval($argv[3]), intval($argv[4])); break; case "dict": if (isset($argv[4]) && $argv[4] == 'details') { $argv[4] = -1; $argv[5] = -1; $argv[6] = "details"; } $argv[4] = (isset($argv[4])) ? intval($argv[4]) : -1; $argv[5] = (isset($argv[5])) ? intval($argv[5]) : -1; $argv[6] = (!empty($argv[6]) && $argv[6] == "details") ? true : false; $this->outputDictInfo($path, $argv[3], $argv[4], $argv[5], $argv[6]); break; case "info": $this->outputInfo($path); break; case "inject": $this->inject($path, $argv[3]); break; case "list": $this->outputArchiveList(); break; case "make-filter": if (!isset($argv[4])) { $argv[4] = -1; } $this->makeFilter($argv[2], $argv[3], $argv[4]); break; case "migrate": $this->migrateIndexArchive($path); break; case "partition": $this->outputPartitionInfo($path, $argv[3]); break; case "fix-partition": if (!isset($argv[3])) { $argv[3] = 0; } if (!isset($argv[4])) { $argv[4] = -1; } $this->fixPartitionIndexes($path, $argv[3], $argv[4]); break; case "rebuild": if (!isset($argv[3])) { $argv[3] = 0; } $this->rebuildIndexBundle($path, $argv[3]); break; case "show": if (!isset($argv[3])) { $this->usageMessageAndExit(); } if (!isset($argv[4])) { $argv[4] = 1; } $this->outputShowPages($path, $argv[3], $argv[4]); break; default: $this->usageMessageAndExit(); } } /** * Lists the Web or IndexArchives in the crawl directory */ public function outputArchiveList() { $yioop_pattern = C\CACHE_DIR . 
"/{" . self::double_index_base_name . "," . self::index_data_base_name . "}*"; $archives = glob($yioop_pattern, GLOB_BRACE); $archives_found = false; if (is_array($archives) && count($archives) > 0) { $archives_found = true; echo "\nFound Yioop Archives:\n"; echo "=====================\n"; foreach ($archives as $archive_path) { $name = $this->getArchiveName($archive_path); echo $name . " "; $archive_type = $this->getArchiveKind($archive_path); if (in_array($archive_type, ["FeedArchiveBundle", "DoubleIndexBundle", "IndexArchiveBundle",])) { $bundle_class = C\NS_LIB . $archive_type; $info = $bundle_class::getArchiveInfo($archive_path); if (in_array(substr($info["DESCRIPTION"], 0, 2), ['s:', 'a:'])) { $info = unserialize($info["DESCRIPTION"]); } echo $info["DESCRIPTION"]; } echo "\n"; } } $nonyioop_pattern = C\ARCHIVES_DIR . "/*/arc_description.ini"; $archives = glob($nonyioop_pattern); if (is_array($archives) && count($archives) > 0 ) { $archives_found = true; echo "\nFound Non-Yioop Archives:\n"; echo "=========================\n"; foreach ($archives as $archive_path) { $len = strlen("/arc_description.ini"); $path = substr($archive_path, 0, -$len); echo $this->getArchiveName($path)."\n"; } } if (!$archives_found) { echo "No archives currently in crawl directory \n"; } echo "\n"; } /** * Determines whether the supplied path is a WebArchiveBundle, * an IndexArchiveBundle, DoubleIndexBundle, or non-Yioop Archive. * Then outputs to stdout header information about the * bundle by calling the appropriate sub-function. * * @param string $archive_path The path of a directory that holds * WebArchiveBundle,IndexArchiveBundle, or non-Yioop archive data */ public function outputInfo($archive_path) { $bundle_name = $this->getArchiveName($archive_path); echo "Bundle Name: " . $bundle_name."\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: " . 
$archive_type."\n"; if ($archive_type === false) { $this->badFormatMessageAndExit($archive_path); } if (in_array($archive_type, ["DoubleIndexBundle", "FeedDocumentBundle", "IndexDocumentBundle",])){ $call = "outputInfo" . $archive_type; $archive_name = C\NS_LIB . $archive_type; $info = $archive_name::getArchiveInfo($archive_path); $this->$call($info, $archive_path); } } /** * Outputs the $doc_map_index'th document from the $partition partition * of the IndexDocumentBundle in folder $index_name * * @param string $index_name folder containing an IndexDocumentBundle * DoubleIndexBundle or FeedDocumentBundle * @param int $partition which partition to do the lookup in * @param int $doc_map_index index of which document to lookup * @return array associative array of field => values associated with * document */ public function outputDocLookup($index_name, $partition, $doc_map_index) { if (substr($index_name, 0, strlen(self::index_data_base_name)) == self::index_data_base_name) { $index_name = substr($index_name, strlen(self::index_data_base_name)); } if (substr($index_name, 0, strlen(self::double_index_base_name)) == self::double_index_base_name) { $index_name = substr($index_name, strlen(self::double_index_base_name)); } $index = IndexManager::getIndex($index_name); $base_folder = $index->getPartitionBaseFolder($partition); $doc_map_filename = $base_folder . "/" . IndexDocumentBundle::DOC_MAP_FILENAME; $doc_map_tools = $index->doc_map_tools; $entry = $doc_map_tools->findEntryAtIndexTableName($doc_map_filename, $doc_map_index); $docid_len = IndexDocumentBundle::DOCID_LEN; $termsfilter_len = IndexDocumentBundle::TERMSFILTER_LEN; $doc_key = substr($entry, 0, $docid_len); $entry = (strlen($entry) >= ($docid_len + $termsfilter_len + 1) && $entry[$docid_len] == 't') ? substr($entry, $docid_len + $termsfilter_len + 1) : substr($entry, $docid_len); $doc_map_tools = $index->doc_map_tools; echo "Doc Key: " . L\toHexString($doc_key) . 
"\n"; echo "Partition: $partition\n"; echo "Doc Map Entries\n------------\n"; $doc_info = $doc_map_tools->unpack($entry); $first_row = array_shift($doc_info); $second_row = array_shift($doc_info); if (!empty($first_row)) { list($num_words, $score) = array_values($first_row); echo "Number of Words in Document: $num_words\n"; echo "Document Score: $score\n"; } $num_scores = count($doc_info); if (!empty($second_row)) { list($title_length, $num_description_scores) = array_values( $second_row); echo "Title Length $title_length\n"; } $score_index = 0; if ($num_description_scores > 0) { echo "Description Scores\n-------------------\n"; while ($score_index < $num_description_scores) { $doc_row = $doc_info[$score_index]; $score_index++; echo "Pos: " . $doc_row['POS'] . " Score: ". $doc_row['SCORE']. "\n"; } } if ($score_index < $num_scores) { echo "User Rank Scores\n-------------------\n"; while ($score_index < $num_scores) { $doc_row = $doc_info[$score_index]; echo "Score Index: " . $score_index . " Score: ". $doc_row['SCORE'] . 
"\n"; $score_index++; } } echo "Doc:\n----\n"; $document = $index->documents->get($doc_key, $partition); var_dump($document); } /** * Prints the dictionary records for a word in an IndexDocumentBundle * * @param string $archive_path the path of a directory that holds * an IndexArchiveBundle * @param string $word to look up dictionary record for * @param int $start_record first record to list out * @param int $num_records max records to list our * @param bool $details whether to show posting list details or not */ public function outputDictInfo($archive_path, $word, $start_record, $num_records, $details) { $bundle_num = -1; if (preg_match("/\-\d$/", $archive_path)) { $bundle_num = substr($archive_path, -1); $archive_path = substr($archive_path, 0, -2); } $bundle_name = $this->getArchiveName($archive_path); echo "\nBundle Name: $bundle_name\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: $archive_type\n"; if (!in_array($archive_type, ["FeedDocumentBundle", "DoubleIndexBundle", "IndexDocumentBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; if (isset($bundle_num) && $bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; } else if ($bundle_name == "IndexDataFeed") { $index_timestamp = "feed"; } $hash_key = L\canonicalTerm($word); $start_time = microtime(true); echo "Looking up in dictionary:\n"; echo " Key: " . $hash_key . "\n"; echo " Key (in Hex): ". L\toHexString($hash_key) . "\n"; $info = IndexManager::getWordInfo($index_timestamp, $hash_key, -1); $index = IndexManager::getIndex($index_timestamp); echo "Dictionary Lookup Time:" . L\changeInMicrotime($start_time) . "\n"; if (!$info) { echo " Key not found\n"; exit(); } $found = true; echo "B+-tree node file name: ". $info['ARCHIVE_FILE'] . 
"\n"; echo "\nBundle Dictionary Entries for '$word':\n"; echo "====================================\n"; $i = 0; $archive_file = $info['ARCHIVE_FILE']; $is_old_index = $index->archive_info['VERSION'] < "3.2"; foreach ($info['ROWS'] as $record) { if ($start_record < 0 || $record['PARTITION'] >= $start_record) { echo "RECORD: $i\n"; echo "PARTITION: {$record['PARTITION']}\n"; echo "NUMBER OF DOCS: {$record['NUM_DOCS']}\n\n"; if ($is_old_index) { $postings_offset = (empty($record['POSTINGS'])) ? -1: $record['POSTINGS']; $postings_len = (empty($record['LAST_BLOB_LEN'])) ? -1 : $record['LAST_BLOB_LEN']; } else { $postings_offset = (empty($record['POSTINGS_OFFSET'])) ? -1: $record['POSTINGS_OFFSET']; $postings_len = (empty($record['POSTINGS_LEN']))? -1 : $record['POSTINGS_LEN']; } $is_postings_array = isset($record['POSTINGS']) && is_array($record['POSTINGS']); if ($postings_offset == -1 && !$is_postings_array) { echo "No POSTINGS_LEN or LAST_BLOB_LEN, record postings". " cannot be looked up\n"; } else if ($is_postings_array) { echo "Active Record\n-------------\n"; echo "By default list details of all postings\n"; var_dump($record['POSTINGS']); } else if ($details) { if ($is_old_index) { $postings_entry = $index->dictionary->getArchive( $archive_file, $postings_offset, $postings_len); } else { $postings_entry = $index->getPostingsString( $record['PARTITION'], $postings_offset, $postings_len); } $postings = $index->postings_tools->unpack($postings_entry); $index->deDeltaPostingsSumFrequencies($postings); var_dump($postings); } } $i++; if ($num_records > 0 && $i >= $start_record + $num_records) { break; } } } /** * Prints information about the number of words and frequencies of words * within the $index'th partition in the bundle * * @param string $archive_path the path of a directory that holds * an IndexDocumentBundle * @param int $num of partition to show info for */ public function outputPartitionInfo($archive_path, $num) { if (preg_match("/\-\d$/", $archive_path)) { 
$bundle_num = substr($archive_path, -1); $archive_path = substr($archive_path, 0, -2); } $bundle_name = $this->getArchiveName($archive_path); echo "\nBundle Name: $bundle_name\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: $archive_type\n"; if (!in_array($archive_type, ["FeedArchiveBundle", "DoubleIndexBundle", "IndexDocumentBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; if (isset($bundle_num) && $bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; } else if ($bundle_name == "IndexDataFeed") { $index_timestamp = "feed"; } $index = IndexManager::getIndex($index_timestamp); $partition_bundle = $index->documents; $num_partitions = $partition_bundle->parameters["SAVE_PARTITION"] ?? -1; $num_partitions++; echo "Number of Partitions: $num_partitions\n"; echo "\nInformation for Partition $num\n"; echo "====================================\n"; $_SERVER["NO_LOGGING"] = true; $partition = $partition_bundle->loadPartitionIndex($num); if (!$partition) { echo "Partition $num does not exists\n"; return; } $statistics = $index->buildInvertedIndexPartition($num, null, true); echo "Number of Distinct Terms Indexed: " . count($statistics['TERM_STATISTICS'])."\n"; echo "Number of Docs in Shard: " . 
$statistics['NUM_DOCS'] ."\n"; echo "Number of Link Items in Shard: ".$statistics['NUM_LINKS']."\n"; echo "Total Links and Docs: ".($statistics['NUM_DOCS'] + $statistics['NUM_LINKS'])."\n\n"; echo "Term histogram for shard\n"; echo "------------------------\n"; $term_frequencies = $statistics['TERM_STATISTICS']; arsort($term_frequencies); $i = 1; echo "Freq Rank\t# Term\t# Docs Term Appears In\n"; foreach ($term_frequencies as $term => $num_docs) { $term = rtrim($term, '_'); $term = str_replace("3A", ":", $term); echo "$i\t\t\t$term\t\t\t$num_docs\n"; $i ++; } } /** * Recomputes the hash index (.ix) files for a range of partitions * from start_partition to end_partition in the documents subfolder of * an IndexDocumentBundle. An ix file contains a sequence of compressed * 4-tuple (doc_id, summary_offset, summary_length, cache_length) * corresponding to a partition file (these end in .txt.gz and are * a sequence of compressed document summaries followed by orginal * documents). * @param string $archive_path the path of a directory that holds * an IndexDocumentBundle * @param int $start_partition first partition to recompute * @param int $end_partition last partition to recompute (inclusive) */ public function fixPartitionIndexes($archive_path, $start_partition, $end_partition = -1) { if (preg_match("/\-\d$/", $archive_path)) { $bundle_num = substr($archive_path, -1); $archive_path = substr($archive_path, 0, -2); } $bundle_name = $this->getArchiveName($archive_path); echo "\nBundle Name: $bundle_name\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: $archive_type\n"; if (!in_array($archive_type, ["FeedArchiveBundle", "DoubleIndexBundle", "IndexDocumentBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); $index_timestamp = (isset($matches[0])) ? 
$matches[0] : 0; if (isset($bundle_num) && $bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; } else if ($bundle_name == "IndexDataFeed") { $index_timestamp = "feed"; } $index = IndexManager::getIndex($index_timestamp); $partition_bundle = $index->documents; if ($end_partition == -1) { $end_partition = $partition_bundle->parameters["SAVE_PARTITION"]; } $partition_tools = new PackedTableTools( ["PRIMARY KEY" => ["DOC_ID", IndexDocumentBundle::DOCID_LEN], "SUMMARY_OFFSET" => "INT", "SUMMARY_LENGTH" => "INT", "CACHE_PAGE_LENGTH" => "INT"]); $buffer_size = 1000000; $gz_start = '/\x1F\x8B\x08\x00\x00\x00\x00.../'; for ($i = $start_partition; $i <= $end_partition; $i++) { $partition_filename = $partition_bundle->getPartition($i); echo "Reconstructing Index File (.ix) for $partition_filename\n"; if (!file_exists($partition_filename)) { echo "\nPartition File: $partition_filename does not exists!"; echo "\nStopping!"; exit(); } $first_time = true; $fh = fopen($partition_filename, "r"); $remainder = ""; $offset = 0; $is_summary = true; $cnt = 0; $partition_index = ""; do { $gztext = fread($fh, $buffer_size); $objects = preg_split($gz_start, $remainder . $gztext); $num_objects = count($objects); $start = ($first_time) ? 1 : 0; $first_time = false; for ($j = $start ; $j < $num_objects; $j++) { $compress = $objects[$j]; if ($j == $num_objects - 1 && !feof($fh)) { $remainder = $compress; break; } $len = 10 + strlen($compress); $site_string = @gzinflate($compress); if (empty($site_string)) { echo "Couldn't uncompress item $cnt\n"; $offset += $len; $last_summary = false; continue; } $is_summary = (substr($site_string, 0, 2) == 'a:') ? true : false; $is_cache = (substr($site_string, 0, 2) == 's:') ? 
true : false; if ($is_summary) { $site = @unserialize($site_string); $last_summary = true; if (empty($site)) { echo "Couldn't unserialize item $cnt\n"; $offset += $len; continue; } else if (is_string($site)) { echo "Item $cnt is a cache page not a summary.\n"; $offset += $len; continue; } $doc_id = IndexDocumentBundle::computeDocId($site); $summary_offset = $offset; $summary_len = $len; } else if ($is_cache && $last_summary) { $last_summary = false; $cache_len = $len; if ($cnt % 1000 == 0) { echo "...extracted and indexed $cnt items from:\n" . "$partition_filename\n"; echo "Last Summary Offset: $summary_offset," . " Summary Length: $summary_len, Cache Length: ". "$cache_len\n"; } $out_value = $partition_tools->pack([ "SUMMARY_OFFSET" => $summary_offset, "SUMMARY_LENGTH" => $summary_len, "CACHE_PAGE_LENGTH" => $cache_len ]); $partition_tools->add($partition_index, $doc_id, $out_value, $partition_tools::ADD_MEM_TABLE_STRING); $cnt++; } else { $last_summary = false; } $offset += $len; } } while(!feof($fh)); $partition_index_name = $partition_bundle->getPartitionIndex($i); echo "Saving $partition_index_name.\n"; $partition_tools->save($partition_index_name, $partition_index); } } /** * Counts and outputs the number of docs and links in each shard * in the archive supplied in $archive_path as well as an overall count * * @param string $archive_path patch of archive to count * @param bool $set_count flag that controls whether after computing * the count to write it back into the archive */ public function outputCountBundle($archive_path, $set_count = false) { $bundle_num = -1; if (preg_match("/\-\d$/", $archive_path)) { $bundle_num = substr($archive_path, -1); $archive_path = substr($archive_path, 0, -2); } $bundle_name = $this->getArchiveName($archive_path); echo "\nBundle Name: $bundle_name\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: $archive_type\n"; if (!in_array($archive_type, ["FeedDocumentBundle", "DoubleIndexBundle", 
"IndexDocumentBundle",])) { $this->badFormatMessageAndExit($archive_path, "index"); } preg_match("/\d+$/", $archive_path, $matches); $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; if (isset($bundle_num) && $bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; } else if ($bundle_name == "IndexDataFeed") { $index_timestamp = "feed"; } $index = IndexManager::getIndex($index_timestamp); $partition_bundle = $index->documents; $num_partitions = $partition_bundle->parameters["SAVE_PARTITION"] ?? -1; $num_partitions++; $partition_bundle->index_cache_size = 1; if ($num_partitions <= 0) { echo "Archive does not appear to have data yet"; exit(); } else { echo "Number of partitions: $num_partitions\n"; } $count = 0; $visited_urls_count = 0; echo "Partition Counts\n===========\n"; for ($i = 0; $i < $num_partitions; $i++ ) { $partition = $partition_bundle->loadPartitionIndex($i); $partition_keys = array_keys($partition); $docs_partition = 0; foreach($partition_keys as $partition_key) { if (IndexDocumentBundle::isType($partition_key, 'doc')) { $docs_partition++; } } $num_links_and_docs = count($partition_keys); echo "\nPartition:$i\n=======\n"; echo "Number of Docs in Partition: " . $docs_partition . "\n"; echo "Number of Link Items in Partition: " . $num_links_and_docs . 
"\n"; $visited_urls_count += $docs_partition; $count += $num_links_and_docs; } echo "\n=======\n"; echo "Total Number of Docs Seen:".$visited_urls_count."\n"; echo "Total Number of Link or Doc Items:".$count."\n"; if ($set_count == "save") { echo "\nSaving count to bundle...\n"; $info = IndexDocumentBundle::getArchiveInfo($archive_path); $info['COUNT'] = $count - $docs_partition; $info['ACTIVE_COUNT'] = $docs_partition; $info['VISITED_URLS_COUNT'] = $visited_urls_count; IndexDocumentBundle::setArchiveInfo($archive_path, $info); echo "..done\n"; } } /** * Given a complete path to an archive returns its filename * * @param string $archive_path a path to a yioop or non-yioop archive * @return string its filename */ public function getArchiveName($archive_path) { $start = C\ARCHIVES_DIR . "/"; if (strstr($archive_path, $start)) { $start_len = strlen($start); $name = substr($archive_path, $start_len); } else { $name = UrlParser::getDocumentFilename($archive_path); } return $name; } /** * Outputs tot the terminal if the bloom filter $filter_path contains * the string $item * @param string $filter_path name of bloom filter file to check if * contains item * @param string $item item to chheck in in bloom filter */ public function checkFilter($filter_path, $item) { $item = trim($item); if (!file_exists($filter_path)) { echo "Filter File: $filter_path does not exist."; exit(); } $filter = BloomFilterFile::load($filter_path); if ($filter->contains($item)) { echo "$item is contained in the filter\n"; } else { echo "$item is not contained in the filter\n"; } } /** * Makes a BloomFilterFile object from a dictionary file $dict_file which * has items listed one per line, or items listed as some column of a CSV * file. 
The result is output to $filter_path * @param string $dict_file to make BloomFilterFile from * @param string $filter_path of file to serialize BloomFilterFile to * @param int $column_num if negative assumes $dict_file has one entry * per line, if >=0 then is the index of the column in a csv to use * for items */ public function makeFilter($dict_file, $filter_path, $column_num = -1) { $lines = file($dict_file); $filter = new BloomFilterFile($filter_path, count($lines)); $i = 0; foreach ($lines as $line) { $item = $line; if ($column_num != -1) { $line_parts = explode(",", $line); $item = $line_parts[$column_num] ?? ""; } $item = trim($item, " \t\n\r\0\x0B\"\'"); if (!empty($item)) { $item = mb_strtolower($item); $i++; if ($i % 10000 == 0) { echo "Added $i items so far. Most recent: $item \n"; } $filter->add($item); } } $filter->save(); } /** * Outputs to stdout header information for a FeedDocumentBundle * bundle. * * @param array $info header info that has already been read from * the description.txt file * @param string $archive_path file path of the folder containing the bundle * @param string $alternate_description used as the text for description * rather than what's given in $info * @param bool $only_storage_info output only info about storage statistics * don't output info about crawl parameters * @param bool $only_crawl_params output only info about crawl parameters * not storage statistics */ public function outputInfoFeedDocumentBundle($info, $archive_path, $alternate_description = "", $only_storage_info = false, $only_crawl_params = false) { $this->outputInfoIndexDocumentBundle($info, $archive_path, $alternate_description, $only_storage_info, $only_crawl_params); } /** * Outputs to stdout header information for a IndexDocumentBundle * bundle. 
* * @param array $info header info that has already been read from * the description.txt file * @param string $archive_path file path of the folder containing the bundle * @param string $alternate_description used as the text for description * rather than what's given in $info * @param bool $only_storage_info output only info about storage statistics * don't output info about crawl parameters * @param bool $only_crawl_params output only info about crawl parameters * not storage statistics */ public function outputInfoIndexDocumentBundle($info, $archive_path, $alternate_description = "", $only_storage_info = false, $only_crawl_params = false) { $more_info = unserialize($info['DESCRIPTION']); $more_info = is_array($more_info) ? $more_info : []; unset($info['DESCRIPTION']); $info = array_merge($info, $more_info); $description = ($alternate_description) ? $alternate_description : "Description: " . $info['DESCRIPTION']; echo "$description\n"; var_dump($info); if (!$only_crawl_params) { $num_partitions = $info['SAVE_PARTITION'] + 1; echo "Number of partitions: $num_partitions \n"; echo "Maximum number of documents or links per partition:" . $info['MAX_ITEMS_PER_FILE'] . "\n"; echo "Number of stored links and documents: " . ($info['ACTIVE_COUNT'] + $info['COUNT']) . "\n"; echo "Number of stored documents: " . $info['VISITED_URLS_COUNT'] . "\n"; } if ($only_storage_info) { return; } if (isset($info['active_archive'])) { echo "Active Archive Bundle: " . $info['active_archive'] . "\n"; } if (!empty($info['repeat_time'])) { echo "Last Swap Time: " . date("r", $info['repeat_time']) . "\n"; } if (!empty($info['repeat_frequency'])) { echo "Repeat Frequency: " . $info['repeat_frequency'] . " seconds\n"; } $crawl_order = (isset($info[self::CRAWL_ORDER]) && $info[self::CRAWL_ORDER] == self::BREADTH_FIRST) ? "Breadth First" : "Page Importance"; echo "Crawl order was: $crawl_order\n"; $channel = (isset($info[self::CHANNEL])) ? 
$info[self::CHANNEL] : 0; echo "Crawl Channel was: $channel.\n"; if ($info['DESCRIPTION'] == 'feed') { echo "Feed Bundle, look at SearchSsources in web interface to see"; echo "\n feed sources.\n"; } else { echo "Seed sites:\n"; foreach ($info[self::TO_CRAWL] as $seed) { echo " $seed\n"; } if ($info[self::RESTRICT_SITES_BY_URL]) { echo "Sites allowed to crawl:\n"; foreach ($info[self::ALLOWED_SITES] as $site) { echo " $site\n"; } } echo "Sites not allowed to be crawled:\n"; if (is_array($info[self::DISALLOWED_SITES])) { foreach ($info[self::DISALLOWED_SITES] as $site) { echo " $site\n"; } } echo "Page Rules:\n"; if (isset($info[self::PAGE_RULES])) { foreach ($info[self::PAGE_RULES] as $rule) { echo " $rule\n"; } } echo "\n"; } } /** * Outputs to stdout header information for a DoubleIndexBundle * bundle. * * @param array $info header info that has already been read from * the description.txt file * @param string $archive_path file path of the folder containing the bundle */ public function outputInfoDoubleIndexBundle($info, $archive_path) { $this->outputInfoIndexArchiveBundle($info, $archive_path, "", false, true); $this->outputInfoIndexArchiveBundle($info, $archive_path . "/bundle0", "Bundle 0\n=======", true); echo "\n"; $this->outputInfoIndexArchiveBundle($info, $archive_path . "/bundle1", "Bundle 1\n=======", true); } /** * Adds a list of urls as a upcoming schedule for a given queue bundle. 
* Can be used to make a closed schedule startable * * @param string $timestamp for a queue bundle to add urls to * @param string $url_file_name name of file consist of urls to inject into * the given crawl */ public function inject($timestamp, $url_file_name) { $admin = new AdminController(); $machine_urls = $admin->model("machine")->getQueueServerUrls(); $num_machines = count($machine_urls); $new_urls = file_get_contents($url_file_name); $inject_urls = $admin->convertStringCleanArray($new_urls); if (!$inject_urls || count($inject_urls) == 0) { echo "\nNo urls in $url_file_name to inject.\n\n"; exit(); } $crawl_model = $admin->model("crawl"); $seed_info = $crawl_model->getCrawlSeedInfo($timestamp, $machine_urls); if (!$seed_info) { echo "\nNo queue bundle with timestamp: $timestamp.\n\n"; exit(); } $seed_info['seed_sites']['url'][] = "#\n#". date('r')."\n#"; $seed_info['seed_sites']['url'] = array_merge( $seed_info['seed_sites']['url'], $inject_urls); $crawl_model->setCrawlSeedInfo($timestamp, $seed_info, $machine_urls); $crawl_model->injectUrlsCurrentCrawl($timestamp, $inject_urls, $machine_urls); echo "Urls injected!"; } /** * Used to list out the pages/summaries stored in a bundle at * $archive_path. It lists to stdout $num many documents starting at $start. 
* * @param string $archive_path path to bundle to list documents for * @param int $start first document to list * @param int $num number of documents to list */ public function outputShowPages($archive_path, $start, $num) { if (preg_match("/\-\d$/", $archive_path)) { $bundle_num = substr($archive_path, -1); $archive_path = substr($archive_path, 0, -2); } $fields_to_print = [ self::URL => "URL", self::IP_ADDRESSES => "IP ADDRESSES", self::TIMESTAMP => "DATE", self::HTTP_CODE => "HTTP RESPONSE CODE", self::TYPE => "MIMETYPE", self::URL_PARENT => "URL OF PARENT", self::SCRAPER_LABEL => "SCRAPER LABEL", self::AVERAGE_COLOR => "AVERAGE_COLOR", self::DURATION => "DURATION", self::WIDTH => "WIDTH", self::HEIGHT => "HEIGHT", self::ENCODING => "CHARACTER ENCODING", self::DESCRIPTION => "DESCRIPTION", self::PAGE => "PAGE DATA"]; $archive_type = $this->getArchiveKind($archive_path); if ($archive_type === false) { $this->badFormatMessageAndExit($archive_path); } $nonyioop = false; //for yioop archives we set up a dummy iterator $iterator = (object) []; $iterator->end_of_iterator = false; $archive_name = C\NS_LIB . 
$archive_type; if ($archive_type == "IndexDocumentBundle" || $archive_type == "FeedDocumentBundle") { $info = $archive_name::getArchiveInfo($archive_path); $num = min($num, $info["ACTIVE_COUNT"] + $info["COUNT"] - $start); $num_generations = $info['SAVE_PARTITION'] + 1; $index_archive = new IndexDocumentBundle($archive_path); $archive = $index_archive->documents; } else if ($archive_type == "DoubleIndexBundle") { $bundle_path = "$archive_path/bundle$bundle_num"; $info = IndexDocumentBundle::getArchiveInfo($bundle_path); $num = min($num, $info["ACTIVE_COUNT"] + $info["COUNT"] - $start); $num_generations = $info['SAVE_PARTITION'] + 1; $index_archive = new IndexDocumentBundle($archive_path); $archive = $index_archive->documents; } else { $nonyioop = true; $num_generations = 1; //for non-yioop archives we set up a real iterator $iterator = $this->instantiateIterator($archive_path, $archive_type); if ($iterator === false) { $this->badFormatMessageAndExit($archive_path); } } if (!$nonyioop && isset($this->tmp_results)) { unset($this->tmp_results); } $num = max($num, 0); $total = $start + $num; $seen = 0; $generation = 0; while(!$iterator->end_of_iterator && $seen < $total && $generation < $num_generations) { if ($nonyioop) { $partition = (object) []; $partition_count = 1; $iterator->seekPage($start); if ($iterator->end_of_iterator) { break; } $seen += $start; } else { $partition = $archive->loadPartitionIndex($generation); $partition_count = count($partition); if ($seen + $partition_count < $start) { $generation++; $seen += $partition_count; continue; } $keys = array_keys($partition); unset($partition); $seen_generation = 0; } $seen_generation = 0; while($seen < $total && $seen_generation < $partition_count) { if ($nonyioop) { $num_to_get = min(self::MAX_BUFFER_DOCS, $total - $seen); $objects = $iterator->nextPages($num_to_get); $seen += count($objects); } else { $num_to_get = min($total - $seen, $partition_count - $seen_generation, self::MAX_BUFFER_DOCS); $objects = []; 
for ($i = $seen_generation; $i < $num_to_get; $i++) { $object = $archive->get($keys[$i], $generation); $summary = $object[self::SUMMARY]; unset($object[self::SUMMARY]); $object = array_merge($object, $summary); $objects[] = $object; } $seen += $num_to_get; $seen_generation += $num_to_get; } $num_to_get = count($objects); if ($seen >= $start) { $num_to_show = min($seen - $start, $num_to_get); $cnt = 0; $first = $num_to_get - $num_to_show; foreach ($objects as $object) { if ($cnt >= $first) { $out = ""; if (isset($object[self::TIMESTAMP])) { $object[self::TIMESTAMP] = date("r", $object[self::TIMESTAMP]); } foreach ($fields_to_print as $key => $name) { if (isset($object[$key])) { $out .= "[$name]\n"; if ($key != self::IP_ADDRESSES) { $out .= $object[$key]."\n"; } else { foreach ($object[$key] as $address) { $out .= $address."\n"; } } } } $out .= "==========\n\n"; echo "BEGIN ITEM, LENGTH:".strlen($out)."\n"; echo $out; } $cnt++; } } if ($objects == null) break; } $generation++; } if (isset($this->tmp_results)) { //garbage collect savepoint folder for non-yioop archives $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; $db = new $dbms_manager(); $db->unlinkRecursive($this->tmp_results); } } /** * Copies an IndexArchiveBundle (or derived class) on disk into * an IndexDocumentBundle (on disk). The new bundle will be * at the old bundle's location while the old bundle is renamed * after the process to "Old" . name_of_old_bundle * * @param string $archive_path file path to a IndexArchiveBundle */ public function migrateIndexArchive($archive_path) { $archive_type = $this->getArchiveKind($archive_path); $archive_type_use = ($archive_type == "DoubleIndexBundleOld") ? "DoubleIndexBundle" : $archive_type; $archive_name = C\NS_LIB . 
$archive_type_use; $archive_map = [ "DoubleIndexBundleOld" => "DoubleIndexBundle", "FeedArchiveBundle" => "FeedDocumentBundle", "IndexArchiveBundle" => "IndexDocumentBundle", ]; if (!isset($archive_map[$archive_type])) { $this->badFormatMessageAndExit($archive_path, "migratable index"); } $file_name_pos = strrpos($archive_path, "/"); $parent_path = substr($archive_path, 0, $file_name_pos); $file_name = substr($archive_path, $file_name_pos + 1); $out_archive_path = "$parent_path/Tmp$file_name"; $new_archive_type = $archive_map[$archive_type]; $new_archive_name = C\NS_LIB . $new_archive_type; $info = $archive_name::getArchiveInfo($archive_path); if ($new_archive_type == "FeedDocumentBundle") { $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; $db = new $dbms_manager(); $db->connect(); $out_index_archive = new $new_archive_name($out_archive_path, $db, false, $info["DESCRIPTION"], C\NUM_DOCS_PER_PARTITION); $sql = "SELECT * FROM MEDIA_SOURCE WHERE (TYPE='rss' OR TYPE='html' OR TYPE='json' OR TYPE='regex')"; $result = $db->execute($sql); $feeds = []; while ($feed = $db->fetchArray($result)) { FeedsUpdateJob::parseFeedAuxInfo($feed); $feeds[] = $feed; } $out_index_archive->feeds = $feeds; } else if ($new_archive_type == "DoubleIndexBundle") { $out_index_archive = new $new_archive_name($out_archive_path, false, $info["DESCRIPTION"], C\NUM_DOCS_PER_PARTITION, $info["repeat_frequency"]); } else { $out_index_archive = new $new_archive_name($out_archive_path, false, $info["DESCRIPTION"], C\NUM_DOCS_PER_PARTITION); } $num_bundles = ($new_archive_type == "DoubleIndexBundle") ? 2 : 1; for ($i = 0; $i < $num_bundles; $i++) { $bundle_path = ($new_archive_type == "DoubleIndexBundle") ? "$archive_path/bundle$i" : $archive_path; $archive = new WebArchiveBundle($bundle_path . 
"/summaries"); L\crawlLog("Processing bundle $i."); $generation_info = unserialize( file_get_contents("$bundle_path/generation.txt")); $num_generations = $generation_info['ACTIVE'] + 1; $seen = 0; $generation = 0; $keypad = "\x00\x00\x00\x00"; while($generation < $num_generations) { $partition = $archive->getPartition($generation, false); L\crawlLog("Processing partition $generation"); L\crawlLog("Number of objects in partition:" . $partition->count); L\crawlLog("Partition Version:" . $partition->version); $seen_partition = 0; $more_objects = ($partition->count > 0); while($more_objects) { $num_to_get = self::MAX_REBUILD_DOCS; $offset = $partition->iterator_pos; $objects = $partition->nextObjects($num_to_get); if (empty($objects)) { $more_objects = false; continue; } $seen_partition += count($objects); $cnt = 0; $store_sites = []; foreach ($objects as $object) { $site = $object[1]; if (isset($site['DUMMY_OFFSET']) || empty($site)) { // first item in a partition is a dummy record continue; } if (!isset($site[self::TYPE]) || $site[self::TYPE] != "link") { $cnt++; } $doc_id = $out_index_archive->computeDocId($site); if ($doc_id) { $page = $site[self::PAGE] ?? ""; unset($site[self::PAGE]); $store_sites[] = [self::DOC_ID => $doc_id, self::SUMMARY => $site, self::PAGE => $page]; } } $seen_partition += $num_to_get; unset($objects); if (!empty($store_sites)) { if ($new_archive_type == "FeedDocumentBundle") { $result = $out_index_archive->addPagesAndSeenKeys( $store_sites, $cnt); } else { $result = $out_index_archive->addPages($store_sites, $cnt); } if ($result) { L\crawlLog("..Store $cnt pages succeeded."); } else { L\crawlLog("..STORE ERROR!!!! for the following ". 
"documents:"); L\crawlLog(print_r($seen_sites, true)); L\crawlLog("..Store $cnt pages failed."); } unset($store_sites); } } $out_index_archive->updateDictionary(); $generation++; } if ($new_archive_type == "DoubleIndexBundle" && $i < 1) { $out_index_archive->swapActiveBundle(); } else { $out_index_archive->forceSave(); } } if ($new_archive_type == "DoubleIndexBundle") { $status = ["repeat_frequency" => $info['repeat_frequency'], "repeat_time" => $info['repeat_time'], "swap_count" => $info['swap_count'], "DESCRIPTION" => $out_index_archive->description ]; file_put_contents($out_index_archive->dir_name . "/status.txt", serialize($status)); copy($archive_path . "/StartCrawlSchedule.txt", $out_index_archive->dir_name . "/StartCrawlSchedule.txt"); } $old_archive_path = "$parent_path/Old$file_name"; rename($archive_path, $old_archive_path); rename($out_archive_path, $archive_path); echo <<< EOD Migration complete! Test out the migrated index to see if it is working correctly. The original index has been stored at: $old_archive_path If the new migrated index is working correctly you can delete the original index. Otherwise, you can move the original index back to $archive_path to use it again. EOD; } /** * Used to recompute both the index shards and the dictionary * of an index archive. The first step involves re-extracting the * word into an inverted index from the summaries' web_archives. * Then a reindex is done. * * @param string $archive_path file path to a IndexArchiveBundle * @param mixed $start_generation which web archive generation to start * rebuild from. If 'continue' then keeps going from where last attempt at * a rebuild was. 
*/ public function rebuildIndexBundle($archive_path, $start_generation = 0) { L\crawlLog("Rebuilding index!!"); $bundle_num = -1; $bundle_path = $archive_path; if (preg_match("/\-\d$/", $archive_path)) { $bundle_num = substr($archive_path, -1); $archive_path = substr($archive_path, 0, -2); /* we can rebuild DoubleIndexBundle's by rebuilding each archive separate */ $bundle_path = "$archive_path/bundle$bundle_num"; } $archive_type = $this->getArchiveKind($bundle_path); $archive_name = C\NS_LIB . $archive_type ; if (!in_array($archive_type, ["FeedDocumentBundle", "IndexDocumentBundle"])) { $this->badFormatMessageAndExit($archive_path, "rebuildable index"); } $next_partition_path = $bundle_path . "/". IndexDocumentBundle::NEXT_PARTITION_FILE; if (trim($start_generation) === "continue") { if (file_exists($next_partition_path)) { $start_generation = intval(file_get_contents($next_partition_path)); echo "Restarting rebuild index from $start_generation\n"; } else { $start_generation = 0; } } $dictionary_path = $bundle_path . IndexDocumentBundle::DICTIONARY_FOLDER; $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; $db = new $dbms_manager(); if ($start_generation == 0) { $db->unlinkRecursive($bundle_path . "/" . IndexDocumentBundle::DICTIONARY_FOLDER, false); $db->unlinkRecursive($bundle_path . "/" . IndexDocumentBundle::POSITIONS_DOC_MAP_FOLDER, false); } file_put_contents($next_partition_path, $start_generation); if ($archive_type == "FeedDocumentBundle") { $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . 
"Manager"; $db = new $dbms_manager(); $db->connect(); $index_archive = new $archive_name($bundle_path, $db, false, null, C\NUM_DOCS_PER_PARTITION); $sql = "SELECT * FROM MEDIA_SOURCE WHERE (TYPE='rss' OR TYPE='html' OR TYPE='json' OR TYPE='regex')"; $result = $db->execute($sql); $feeds = []; while ($feed = $db->fetchArray($result)) { FeedsUpdateJob::parseFeedAuxInfo($feed); $feeds[] = $feed; } $index_archive->feeds = $feeds; } else { $index_archive = new $archive_name($bundle_path, false, "", C\NUM_DOCS_PER_PARTITION); } $save_partition = $index_archive->documents->parameters[ "SAVE_PARTITION"] ?? 0; L\crawlLog("Save partition is $save_partition"); $options = "run 0 '$bundle_path'"; $old_next_partition = -1; $next_partition = $start_generation; $continue = false; $dictionary_log = C\LOG_DIR . "/0-DictionaryUpdater.log"; $fp = fopen($dictionary_log, "w"); fclose($fp); while ($next_partition < $save_partition) { if ($old_next_partition != $next_partition) { $old_next_partition = $next_partition; L\crawlLog("Begin Processing Partition: $next_partition"); L\crawlLog("Exec'ing DictionaryUpdater with parameters: " . $options); L\CrawlDaemon::execScriptInOwnProcess(C\BASE_DIR . "/executables/DictionaryUpdater.php", $options); } else { if (file_exists($dictionary_log)) { $recent_log_data = file_get_contents($dictionary_log); echo $recent_log_data; $fp = fopen($dictionary_log, "w"); fclose($fp); } clearstatcache(); if (time() > filemtime($next_partition_path) + 5 * C\LOG_TIMEOUT) { L\crawlLog("DictionaryUpdater seems to have crashed, ". "exiting ArcTool"); exit(); } sleep(15); } $next_partition = intval(file_get_contents($next_partition_path)); } $index_archive->forceSave(); echo "\nIndex Rebuild Complete!\n"; } /** * Used to create an archive_bundle_iterator for a non-yioop archive * As these iterators sometimes make use of a folder to store savepoints * We create a temporary folder for this purpose in the current directory * This should be garbage collected elsewhere. 
* * @param string $archive_path path to non-yioop archive * @param string $iterator_type name of archive_bundle_iterator used to * iterate over archive. * @param return an ArchiveBundleIterator of the correct type using * a temporary folder to store savepoints */ public function instantiateIterator($archive_path, $iterator_type) { $iterate_timestamp = filectime($archive_path); $result_timestamp = strval(time()); $this->tmp_results = C\TEMP_DIR . '/TmpArchiveExtract'. $iterate_timestamp; $dbms_manager = C\NS_DATASOURCES. ucfirst(C\DBMS) . "Manager"; $db = new $dbms_manager(); if (file_exists($this->tmp_results)) { $db->unlinkRecursive($this->tmp_results); } @mkdir($this->tmp_results); $iterator_class = C\NS_ARCHIVE . "{$iterator_type}Iterator"; $iterator = new $iterator_class($iterate_timestamp, $archive_path, $result_timestamp, $this->tmp_results); $db->setWorldPermissionsRecursive($this->tmp_results); return $iterator; } /** * Outputs the "hey, this isn't a known bundle message" and then exit()'s. * * @param string $archive_name name or path to what was supposed to be * an archive * @param string $allowed_archives a string list of archives types * that $archive_name could belong to */ public function badFormatMessageAndExit($archive_name, $allowed_archives = "web or index") { echo <<< EOD $archive_name does not appear to be a $allowed_archives archive bundle EOD; exit(); } /** * Outputs the "how to use this tool message" and then exit()'s. */ public function usageMessageAndExit() { echo <<< EOD ArcTool is used to look at the contents of FeedDocumentBundle's, IndexDocumentBundle's, DoubleIndexBundle's, and BloomFilterFile's. It will look for these using the path provided or will check in the Yioop! crawl directory as a fall back. 
The available commands for ArcTool are: php ArcTool.php check-filter filter_file item /* outputs whether item is in the BloomFilterFile given by filter_file */ php ArcTool.php count bundle_name php ArcTool.php count double_index_name which_bundle or php ArcTool.php count bundle_name save php ArcTool.php count double_index_name which_bundle save /* returns the counts of docs and links for each partition in a bundle as well as an overall total. The second command saves the just computed count into the index description (can be used to fix the index count if it gets screwed up). */ php ArcTool.php doc-lookup bundle_name partition doc_map_index /* returns the document stored in partition at doc_map_index (here doc_map_index is the value that would be stored in a posting) */ php ArcTool.php dict bundle_name word [details] php ArcTool.php dict double_index_name which_bundle word [details] php ArcTool.php dict bundle_name word start_record num_records [details] php ArcTool.php dict double_index_name which_bundle word start_record num_records [details] /* returns index dictionary records for word stored in index archive bundle or double index bundle. In the later case you should provide which bundle you want dictionary info for. This command also supports start and number of record parameters. If the word details is added to the end of the command then additional information about each doc record (as opposed to just their total number) is printed */ php ArcTool.php fix-partition bundle_name php ArcTool.php fix-partition bundle_name start_partition php ArcTool.php fix-partition bundle_name start_partition end_partition /* recomputes the hash index (.ix) files for a range of partitions from start_partition to end_partition in the documents subfolder of an IndexDocumentBundle. 
An ix file contains a sequence of compressed 4-tuple (doc_id, summary_offset, summary_length, cache_length) corresponding to a partition file (these end in .txt.gz and are a sequence of compressed document summaries followed by orginal documents). */ php ArcTool.php info bundle_name // return info about documents stored in archive. php ArcTool.php inject timestamp file /* injects the urls in file as a schedule into crawl of given timestamp This can be used to make a closed index unclosed and to allow for continued crawling. */ php ArcTool.php list /* returns a list of all the archives in the Yioop! crawl directory, including non-Yioop! archives in the /archives sub-folder.*/ php ArcTool.php make-filter dict_file filter_file php ArcTool.php make-filter dict_file filter_file column_num /* outputs to filter_file a BloomFilterFile made by inserting the items in dict_file. If column_num is negative then dict_file is assumed to list one item to insert per line. If column_num >=0 then dict_file is assumed to be a csv file and column_num is the column that items will be inserted from. 
*/ php ArcTool.php migrate bundle_name /* migrates old Yioop index formats such as FeedArchiveBundle, IndexArchiveBundle, and old style DoubleIndexBundle to their modern respective equivalents FeedDocumentBundle, IndexDocumentBundle, and modern DoubleIndexBundle */ php ArcTool.php partition bundle_name partition_number php ArcTool.php partition double_index_name which_bundle partition_number /* Prints information about the number of words and frequencies of words within the partition_number'th partition in the index document bundle or double index bundle (in which case need to say either 0 or 1 bundle) */ php ArcTool.php rebuild bundle_name php ArcTool.php rebuild double_index_name which_bundle php ArcTool.php rebuild bundle_name continue php ArcTool.php rebuild double_index_name which_bundle continue php ArcTool.php rebuild bundle_name partition_num php ArcTool.php rebuild double_index_name which_bundle partition_num /* re-extracts words from summaries files in bundle_name a partition at a time, builds an inverted index for that partition and adds to the global dictionary. If this process crashes the keyword continue can be used to continue from where it left off. If a partition number is supplied process continue from that partition number. */ php ArcTool.php show bundle_name start num php ArcTool.php show double_index_name which_bundle start num /* outputs items start through num from bundle_name or name of Yioop or non-Yioop archive crawl folder */ EOD; exit(); } } $arc_tool = new ArcTool(); $arc_tool->start();