Last commit for bin/arc_tool.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2014  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage bin
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2014
 * @filesource
 */

if(php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}

/** Calculate base directory of script @ignore*/
define("BASE_DIR", substr(
    dirname(realpath($_SERVER['PHP_SELF'])), 0,
    -strlen("/bin")));

ini_set("memory_limit","2000M"); /*
        reindex sometimes takes more than the default 128M, so the limit
        is raised to 2000M */

/** This tool does not need logging*/
define("LOG_TO_FILES", false);

/** Load in global configuration settings */
require_once BASE_DIR.'/configs/config.php';
if(!PROFILE) {
    // the two pieces are concatenated, so the first must end in a space
    echo "Please configure the search engine instance by visiting " .
        "its web interface on localhost.\n";
    exit();
}

/** NO_CACHE means don't try to use memcache*/
define("NO_CACHE", true);

/** USE_CACHE false rules out file cache as well*/
define("USE_CACHE", false);

/** Load the class that maintains our URL queue */
require_once BASE_DIR."/lib/web_queue_bundle.php";

/** Load word->{array of docs with word} index class */
require_once BASE_DIR."/lib/index_archive_bundle.php";

/** To be able to determine info about word in a index dictionary*/
require_once BASE_DIR."/lib/index_bundle_iterators/word_iterator.php";

/** Used by word_iterator.php*/
require_once BASE_DIR."/lib/index_manager.php";

/**
 * Load the iterator classes for non-yioop archives: every file in
 * lib/archive_bundle_iterators whose name ends in _iterator.php is included
 */
foreach(glob(BASE_DIR."/lib/archive_bundle_iterators/*_iterator.php")
    as $filename) {
    require_once $filename;
}

/** Used for manipulating urls*/
require_once BASE_DIR."/lib/url_parser.php";

/**  For crawlHash function */
require_once BASE_DIR."/lib/utility.php";

/**
 * Get the database library based on the current database type. DBMS is
 * not defined in this file; presumably it comes from the configuration
 * loaded earlier -- TODO confirm
 */
require_once BASE_DIR."/models/datasources/".DBMS."_manager.php";

/** Load FetchUrl, used by the MediaWiki archive iterator */
require_once BASE_DIR."/lib/fetch_url.php";

/** Loads common constants for web crawling*/
require_once BASE_DIR."/lib/crawl_constants.php";
/*
 *  We'll set up multi-byte string handling to use UTF-8, both for the
 *  mb string functions and for the mb regular expression functions
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/**
 * Command line program that allows one to examine the content of
 * the WebArchiveBundles and IndexArchiveBundles of Yioop crawls.
 * For now it supports returning header information about bundles,
 * as well as pretty printing the page/summary contents of the bundle.
 *
 * The former can be gotten from a bundle by running arc_tool with a
 * command like:
 * php arc_tool.php info bundle_name
 *
 * The latter can be gotten from a bundle by running arc_tool with a
 * command like:
 * php arc_tool.php show bundle_name start_doc_num num_results
 *
 * @author Chris Pollett (non-yioop archive code derived from earlier
 *      stuff by Shawn Tice)
 * @package seek_quarry
 */
class ArcTool implements CrawlConstants
{
    /**
     * The maximum number of documents the arc_tool list function
     * will read into memory in one go.
     */
    const MAX_BUFFER_DOCS = 200;
    /**
     * Creates an ArcTool. No set up work is needed at present, so the
     * constructor body is intentionally empty.
     */
    function __construct()
    {
    }
    /**
     * Runs the ArcTool on the supplied command line arguments
     *
     * $argv[1] selects the command: list, info, shard, dict, posting,
     * rebuild, reindex, mergetiers, or show. Every command except list
     * needs a bundle name in $argv[2]; shard, dict, mergetiers and show
     * additionally need $argv[3], and posting needs both $argv[3] and
     * $argv[4]. If a required argument is missing, or the command is
     * unknown, the usage message is output instead.
     */
    function start()
    {
        global $argv;
        /* index of the last command line argument a command cannot do
           without; commands not listed here only need the bundle name
         */
        $last_required_arg = array("shard" => 3, "dict" => 3,
            "posting" => 4, "mergetiers" => 3, "show" => 3);
        if(!isset($argv[1]) || (!isset($argv[2]) && $argv[1] != "list") ||
            (isset($last_required_arg[$argv[1]]) &&
            !isset($argv[$last_required_arg[$argv[1]]]))) {
            $this->usageMessageAndExit();
        }
        $path = "";
        if($argv[1] != "list") {
            /* a bare bundle name that is not a file in the current folder
               is looked for in the crawl cache folder and, failing that,
               assumed to be in the archives folder
             */
            $path = UrlParser::getDocumentFilename($argv[2]);
            if($path == $argv[2] && !file_exists($path)) {
                $path = CRAWL_DIR."/cache/".$path;
                if(!file_exists($path)) {
                    $path = CRAWL_DIR."/archives/".$argv[2];
                }
            }
        }
        switch($argv[1])
        {
            case "list":
                $this->outputArchiveList();
            break;

            case "info":
                $this->outputInfo($path);
            break;

            case "shard":
                $this->outputShardInfo($path, $argv[3]);
            break;

            case "dict":
                $this->outputDictInfo($path, $argv[3]);
            break;

            case "posting":
                //number of postings to output is optional, defaults to 1
                $num = (isset($argv[5])) ? $argv[5] : 1;
                $this->outputPostingInfo($path, $argv[3], $argv[4], $num);
            break;

            case "rebuild":
                $this->rebuildIndexArchive($path);
            break;

            case "reindex":
                $this->reindexIndexArchive($path);
            break;

            case "mergetiers":
                $this->reindexIndexArchive($path, $argv[3]);
            break;

            case "show":
                //number of pages to output is optional, defaults to 1
                $num_pages = (isset($argv[4])) ? $argv[4] : 1;
                $this->outputShowPages($path, $argv[3], $num_pages);
            break;

            default:
                $this->usageMessageAndExit();
        }

    }

    /**
     * Outputs the names of the archives found in the crawl directory:
     * first the Yioop web and index archives in its cache folder, then
     * the non-Yioop archives (sub-folders of the archives folder which
     * contain an arc_description.ini file). If neither kind is found a
     * message saying so is output instead.
     */
    function outputArchiveList()
    {
        $found_any = false;

        $yioop_archives = glob(CRAWL_DIR."/cache/*{".
            self::archive_base_name.",".self::index_data_base_name."}*",
            GLOB_BRACE);
        if(is_array($yioop_archives) && !empty($yioop_archives)) {
            $found_any = true;
            echo "\nFound Yioop Archives:\n";
            echo "=====================\n";
            foreach($yioop_archives as $yioop_path) {
                echo $this->getArchiveName($yioop_path)."\n";
            }
        }

        $description_files = glob(
            CRAWL_DIR."/archives/*/arc_description.ini");
        if(is_array($description_files) && !empty($description_files)) {
            $found_any = true;
            echo "\nFound Non-Yioop Archives:\n";
            echo "=========================\n";
            $suffix_len = strlen("/arc_description.ini");
            foreach($description_files as $description_file) {
                //drop the ini file name to get the archive's folder
                $folder = substr($description_file, 0, -$suffix_len);
                echo $this->getArchiveName($folder)."\n";
            }
        }

        if(!$found_any) {
            echo "No archives currently in crawl directory \n";
        }
        echo "\n";
    }
    /**
     * Outputs to stdout the name and kind of the archive at the supplied
     * path. For a WebArchiveBundle or an IndexArchiveBundle the bundle's
     * header information is then read and handed to the outputInfo method
     * for that kind of bundle. A path whose kind cannot be determined is
     * reported as badly formatted.
     *
     * @param string $archive_path the path of a directory that holds
     *      WebArchiveBundle,IndexArchiveBundle, or non-Yioop archive data
     */
    function outputInfo($archive_path)
    {
        echo "Bundle Name: ".$this->getArchiveName($archive_path)."\n";
        $kind = $this->getArchiveKind($archive_path);
        echo "Bundle Type: ".$kind."\n";
        if($kind === false) {
            $this->badFormatMessageAndExit($archive_path);
        }
        $yioop_kinds = array("IndexArchiveBundle", "WebArchiveBundle");
        if(!in_array($kind, $yioop_kinds)) {
            //nothing further is output for non-Yioop archives
            return;
        }
        $header = $kind::getArchiveInfo($archive_path);
        $printer = "outputInfo".$kind;
        $this->$printer($header, $archive_path);
    }

    /**
     * Outputs the dictionary records stored for a word in an
     * IndexArchiveBundle, together with the dictionary's active tiers.
     * The word is first looked up using its masked crawlHashWord hash and,
     * if that finds nothing, using the older crawlHash hash. If neither
     * lookup finds the word a message is output and the program exits.
     *
     * @param string $archive_path the path of a directory that holds
     *      an IndexArchiveBundle
     * @param string $word to look up dictionary record for
     */
    function outputDictInfo($archive_path, $word)
    {
        $name = $this->getArchiveName($archive_path);
        echo "\nBundle Name: $name\n";
        $kind = $this->getArchiveKind($archive_path);
        echo "Bundle Type: $kind\n";

        if(strcmp($kind, "IndexArchiveBundle") != 0) {
            $this->badFormatMessageAndExit($archive_path, "index");
        }
        //the timestamp is whatever follows the index base name in the path
        $base_name = self::index_data_base_name;
        $timestamp_start = strpos($archive_path, $base_name) +
            strlen($base_name);
        $index_timestamp = substr($archive_path, $timestamp_start);

        $mask = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
        $info = IndexManager::getWordInfo($index_timestamp,
            crawlHashWord($word, true, $mask), 0, $mask, 1);
        if(!$info) {
            //fallback to old word hashes
            $info = IndexManager::getWordInfo($index_timestamp,
                crawlHash($word, true), 0, "", 1);
        }
        if(!$info) {
            echo "\n$word does not appear in bundle!\n\n";
            exit();
        }

        echo "Dictionary Tiers: ";
        $index = IndexManager::getIndex($index_timestamp);
        foreach($index->dictionary->active_tiers as $tier) {
            echo " $tier";
        }
        echo "\nBundle Dictionary Entries for '$word':\n";
        echo "====================================\n";
        $record_num = 0;
        foreach($info as $record) {
            $record_num++;
            echo "RECORD: $record_num\n";
            echo "GENERATION: {$record[0]}\n";
            echo "FIRST WORD OFFSET: {$record[1]}\n";
            echo "LAST WORD OFFSET: {$record[2]}\n";
            echo "NUMBER OF POSTINGS: {$record[3]}\n\n";
        }
    }
    /**
     * Outputs statistics for the $generation'th index shard of an
     * IndexArchiveBundle: the number of distinct terms, of documents and
     * of link items, followed by a histogram which says, for each posting
     * list size, how many terms have a posting list of that size.
     *
     *  @param string $archive_path the path of a directory that holds
     *      an IndexArchiveBundle
     *  @param int $generation which index shard to use
     */
    function outputShardInfo($archive_path, $generation)
    {
        //reading in a whole shard might take a bit more memory
        ini_set("memory_limit","2000M");
        $name = $this->getArchiveName($archive_path);
        echo "\nBundle Name: $name\n";
        $kind = $this->getArchiveKind($archive_path);
        echo "Bundle Type: $kind\n";

        if(strcmp($kind, "IndexArchiveBundle") != 0) {
            $this->badFormatMessageAndExit($archive_path, "index");
        }
        //the timestamp is whatever follows the index base name in the path
        $base_name = self::index_data_base_name;
        $timestamp_start = strpos($archive_path, $base_name) +
            strlen($base_name);
        $index = IndexManager::getIndex(
            substr($archive_path, $timestamp_start));
        $index->setCurrentShard($generation);

        $num_generations = $index->generation_info["ACTIVE"] + 1;
        echo "Number of Generations: $num_generations\n";
        echo "\nShard Information for Generation $generation\n";
        echo "====================================\n";

        $shard = $index->getCurrentShard();
        echo "Number of Distinct Terms Indexed: ".count($shard->words)."\n";
        echo "Number of Docs in Shard: ".$shard->num_docs."\n";
        echo "Number of Link Items in Shard: ".$shard->num_link_docs."\n";
        $total_items = $shard->num_docs + $shard->num_link_docs;
        echo "Total Links and Docs: ".$total_items."\n\n";
        echo "Term histogram for shard\n";
        echo "------------------------\n";

        /* tally how many terms share each posting list size; the size is
           the byte length of the term's posting string divided by 4,
           rounded up
         */
        $histogram = array();
        foreach($shard->words as $posting) {
            $list_size = intval(ceil(strlen($posting)/4));
            $histogram[$list_size] = isset($histogram[$list_size]) ?
                $histogram[$list_size] + 1 : 1;
        }
        krsort($histogram);

        echo "Freq Rank\t# Terms with Rank\t# Docs Term Appears In\n";
        $rank = 1;
        foreach($histogram as $num_docs => $num_terms) {
            echo "$rank\t\t\t$num_terms\t\t\t$num_docs\n";
            $rank += $num_terms;
        }

    }

    /**
     * Prints information about $num many postings beginning at the
     * provided $generation and $offset
     *
     * For each posting found this outputs its doc id, whether it is a
     * document or a link, its doc index, summary offset and scores, the
     * raw posting in hex, the positions of the term if available, and the
     * title and description of the summary stored at the summary offset.
     * If no documents are found a message is output and the program exits.
     *
     *  @param string $archive_path the path of a directory that holds
     *      an IndexArchiveBundle
     *  @param int $generation which index shard to use
     *  @param int $offset offset into posting lists for that shard
     *  @param int $num how many postings to print info for
     */
    function outputPostingInfo($archive_path, $generation, $offset, $num = 1)
    {
        $bundle_name = $this->getArchiveName($archive_path);
        echo "\nBundle Name: $bundle_name\n";
        $archive_type = $this->getArchiveKind($archive_path);
        echo "Bundle Type: $archive_type\n";
        echo "Generation: $generation\n";
        echo "Offset: $offset\n";

        if(strcmp($archive_type,"IndexArchiveBundle") != 0) {
            $this->badFormatMessageAndExit($archive_path, "index");
        }
        // the timestamp is whatever follows the index base name in the path
        $index_timestamp = substr($archive_path,
            strpos($archive_path, self::index_data_base_name) +
            strlen(self::index_data_base_name));
        $index = IndexManager::getIndex($index_timestamp);
        $index->setCurrentShard($generation, true);
        $shard = $index->getCurrentShard();
        /* the shard is addressed with the offset divided by 4; presumably
           this turns a byte offset into a 4 byte word offset -- TODO
           confirm against IndexShard
         */
        $next = $offset >> 2;
        $raw_postings = array();
        $doc_indexes = array();
        $documents = array();
        for($i = 0; $i < $num; $i++) {
            $dummy_offset = 0;
            $posting_start = $next;
            $posting_end = $next;
            // offsets multiplied back by 4 for the getPostingsSlice call
            $old_offset = $next << 2;
            $old_start = $next << 2;
            $old_end = $next << 2;
            /* NOTE(review): $posting_end is used below to find where the
               next posting starts, so getPostingAtOffset presumably takes
               $posting_start and $posting_end by reference and moves them
               to the bounds of the posting it returns -- confirm
             */
            $tmp = $shard->getPostingAtOffset(
                $next, $posting_start, $posting_end);
            $next = $posting_end + 1;
            // stop as soon as there is no posting at the current offset
            if(!$tmp) break;
            $documents = array_merge($documents,
                $shard->getPostingsSlice($old_offset, $old_start, $old_end, 1));
            $raw_postings[] = $tmp;
            $post_array = unpackPosting($tmp, $dummy_offset);
            $doc_indexes[] = $post_array[0];
        }
        $end_offset = $next << 2;
        echo "Offset After Returned Results: $end_offset\n\n";
        // $count is assigned here and reused for the heading below
        if(!$documents || ($count = count($documents)) < 1) {
            echo "No documents correspond to generation and offset given\n\n";
            exit();
        };
        $document_word = ($count == 1) ? "Document" : "Documents";
        echo "$count $document_word Found:\n";
        echo str_pad("", $count + 1, "=")."================\n";
        /* $j counts documents so that each one can be paired with the raw
           posting and doc index collected for it in the loop above
         */
        $j = 0;
        foreach($documents as $key => $document) {
            echo "\nDOC ID: ".toHexString($key);
            echo "\nTYPE: ".(($document[self::IS_DOC]) ? "Document" : "Link");
            echo "\nDOC INDEX: ".$doc_indexes[$j];
            $summary_offset = $document[self::SUMMARY_OFFSET];
            echo "\nSUMMARY OFFSET: ".$summary_offset;
            echo "\nSCORE: ".$document[self::SCORE];
            echo "\nDOC RANK: ".$document[self::DOC_RANK];
            echo "\nRELEVANCE: ".$document[self::RELEVANCE];
            echo "\nPROXIMITY: ".$document[self::PROXIMITY];
            echo "\nHEX POSTING:\n";
            echo "------------\n";
            echo wordwrap(toHexString($raw_postings[$j]), 80);
            if(isset($document[self::POSITION_LIST])) {
                echo "\nTERM OCCURRENCES IN DOCUMENT (Count starts at title):";
                echo "\n-------------------------".
                    "----------------------------\n";
                // positions are output five to a line
                $i = 0;
                foreach($document[self::POSITION_LIST] as $position) {
                    printf("%09d ",$position);
                    $i++;
                    if($i >= 5) {
                        echo "\n";
                        $i = 0;
                    }
                }
                // finish off a final line that has fewer than five positions
                if($i != 0) { echo "\n"; }
            }
            // errors from the summary lookup are suppressed by the @
            $page = @$index->getPage($summary_offset);

            if(isset($page[self::TITLE])) {
                echo "SUMMARY TITLE:\n";
                echo "--------------\n";
                echo wordwrap($page[self::TITLE], 80)."\n";
            }

            if(isset($page[self::DESCRIPTION])) {
                echo "SUMMARY DESCRIPTION:\n";
                echo "--------------\n";
                echo $page[self::DESCRIPTION]."\n";
                }
            $j++;
        }
    }
    /**
     * Given a complete path to an archive returns its filename
     *
     * A path containing the crawl directory's archives folder is a
     * non-yioop archive and its name is everything after that folder.
     * The name is cut at the position where the folder was actually found,
     * so it is still right when the folder is not at the very start of
     * the path. For any other path the name is the document filename
     * given by UrlParser.
     *
     * @param string $archive_path a path to a yioop or non-yioop archive
     * @return string its filename
     */
    function getArchiveName($archive_path)
    {
        $start = CRAWL_DIR."/archives/";
        $start_pos = strpos($archive_path, $start);
        if($start_pos !== false) {
            $name = substr($archive_path, $start_pos + strlen($start));
        } else {
            $name = UrlParser::getDocumentFilename($archive_path);
        }
        return $name;
    }
    /**
     * Used to recompute the dictionary of an index archive -- either from
     * scratch using the index shard data or just using the current dictionary
     * but merging the tiers into one tier
     *
     * When $max_tier is -1 the bundle's dictionary folder is emptied and
     * rebuilt by adding the dictionary of each posting shard in turn,
     * after which all tiers are merged. For any other $max_tier the
     * existing dictionary is kept and only the merge is done. Nothing is
     * changed if the bundle has no posting shards.
     *
     * @param string $path file path to an IndexArchiveBundle; its
     *      dictionary and posting_doc_shards subfolders are what is used
     * @param int $max_tier tier up to which the dictionary tiers should be
     *      merged (typically a value greater than the max_tier of the
     *      dictionary)
     */
    function reindexIndexArchive($path, $max_tier = -1)
    {
        if($this->getArchiveKind($path) != "IndexArchiveBundle") {
            echo "\n$path ...\n".
                "  is not an IndexArchiveBundle so cannot be re-indexed\n\n";
            exit();
        }
        $shards = glob($path."/posting_doc_shards/index*");
        /* glob gives an empty array (on some systems false) when nothing
           matches; bail out in both cases so that the dictionary of a
           bundle without shards is never deleted
         */
        if(!is_array($shards) || count($shards) == 0) {
            echo "\n$path ...\n".
                "  does not contain posting shards so cannot be re-indexed\n\n";
            return;
        }
        /* the datasource is needed to fix permissions at the end for both
           a full reindex and a tier merge, so create it before branching
         */
        $dbms_manager = DBMS."Manager";
        $db = new $dbms_manager();
        $rebuild_from_shards = ($max_tier == -1);
        if($rebuild_from_shards) {
            $db->unlinkRecursive($path."/dictionary", false);
            IndexDictionary::makePrefixLetters($path."/dictionary");
        }
        $dictionary = new IndexDictionary($path."/dictionary");

        if($rebuild_from_shards) {
            //shard files are named index followed by their generation
            $max_generation = 0;
            foreach($shards as $shard_name) {
                $file_name = UrlParser::getDocumentFilename($shard_name);
                $generation = (int)substr($file_name, strlen("index"));
                $max_generation = max($max_generation, $generation);
            }
            for($i = 0; $i < $max_generation + 1; $i++) {
                $shard_name = $path."/posting_doc_shards/index$i";
                echo "\nShard $i\n";
                $shard = new IndexShard($shard_name, $i,
                    NUM_DOCS_PER_GENERATION, true);
                $dictionary->addShardDictionary($shard);
            }
            $max_tier = $dictionary->max_tier;
        }
        echo "\nFinal Merge Tiers\n";
        $dictionary->mergeAllTiers(NULL, $max_tier);
        $db->setWorldPermissionsRecursive($path."/dictionary");
        echo "\nReindex complete!!\n";
    }

    /**
     * Outputs to stdout header information for a IndexArchiveBundle
     * bundle.
     *
     * The DESCRIPTION field of $info holds a serialized array of further
     * crawl settings; it is unserialized and merged into $info before the
     * description, counts, crawl order, seed sites, allowed and disallowed
     * sites and page rules are output. The number of generations is read
     * from the bundle's generation.txt file.
     *
     * @param array $info header info that has already been read from
     *      the description.txt file
     * @param string $archive_path file path of the folder containing the bundle
     */
    function outputInfoIndexArchiveBundle($info, $archive_path)
    {
        $more_info = unserialize($info['DESCRIPTION']);
        unset($info['DESCRIPTION']);
        //only merge if the serialized settings could actually be read
        if(is_array($more_info)) {
            $info = array_merge($info, $more_info);
        }
        $description = (isset($info['DESCRIPTION'])) ?
            $info['DESCRIPTION'] : "";
        echo "Description: ".$description."\n";
        $generation_info = unserialize(
            file_get_contents("$archive_path/generation.txt"));
        $num_generations = $generation_info['ACTIVE']+1;
        echo "Number of generations: ".$num_generations."\n";
        echo "Number of stored links and documents: ".$info['COUNT']."\n";
        echo "Number of stored documents: ".$info['VISITED_URLS_COUNT']."\n";
        $is_breadth_first = isset($info[self::CRAWL_ORDER]) &&
            $info[self::CRAWL_ORDER] == self::BREADTH_FIRST;
        $crawl_order = ($is_breadth_first) ?
            "Breadth First" : "Page Importance";
        echo "Crawl order was: $crawl_order\n";
        echo "Seed sites:\n";
        if(isset($info[self::TO_CRAWL]) &&
            is_array($info[self::TO_CRAWL])) {
            foreach($info[self::TO_CRAWL] as $seed) {
                echo "   $seed\n";
            }
        }
        if(!empty($info[self::RESTRICT_SITES_BY_URL])) {
            echo "Sites allowed to crawl:\n";
            if(isset($info[self::ALLOWED_SITES]) &&
                is_array($info[self::ALLOWED_SITES])) {
                foreach($info[self::ALLOWED_SITES] as $site) {
                    echo "   $site\n";
                }
            }
        }
        echo "Sites not allowed to be crawled:\n";
        if(isset($info[self::DISALLOWED_SITES]) &&
            is_array($info[self::DISALLOWED_SITES])) {
            foreach($info[self::DISALLOWED_SITES] as $site) {
                echo "   $site\n";
            }
        }
        echo "Page Rules:\n";
        if(isset($info[self::PAGE_RULES]) &&
            is_array($info[self::PAGE_RULES])) {
            foreach($info[self::PAGE_RULES] as $rule) {
                echo "   $rule\n";
            }
        }
        echo "\n";
    }

    /**
     * Outputs to stdout the header information of a WebArchiveBundle:
     * its description, how many documents it stores, the maximum number
     * of documents per partition, and the number of partitions (one more
     * than the index of the partition currently written to).
     *
     * @param array $info header info that has already been read from
     *      the description.txt file
     * @param string $archive_path file path of the folder containing the
     *      bundle (not used by this method)
     */
    function outputInfoWebArchiveBundle($info, $archive_path)
    {
        printf("Description: %s\n", $info['DESCRIPTION']);
        printf("Number of stored documents: %s\n", $info['COUNT']);
        printf("Maximum Number of documents per partition: %s\n",
            $info['NUM_DOCS_PER_PARTITION']);
        printf("Number of partitions: %s\n", $info['WRITE_PARTITION'] + 1);
        echo "\n";
    }

    /**
     * Used to list out the pages/summaries stored in a bundle at
     * $archive_path. It lists to stdout $num many documents starting at $start.
     *
     * For an IndexArchiveBundle the documents come from the bundle's
     * summaries WebArchiveBundle, for a WebArchiveBundle from the bundle
     * itself, and for any other kind of archive from an iterator created
     * with instantiateIterator. Yioop bundles are read a partition at a
     * time: partitions which lie completely before $start are skipped
     * without being read, and within the first partition of interest
     * documents are read and discarded until $start is reached.
     *
     * @param string $archive_path path to bundle to list documents for
     * @param int $start first document to list
     * @param int $num number of documents to list
     */
    function outputShowPages($archive_path, $start, $num)
    {
        $fields_to_print = array(
            self::URL => "URL",
            self::IP_ADDRESSES => "IP ADDRESSES",
            self::TIMESTAMP => "DATE",
            self::HTTP_CODE => "HTTP RESPONSE CODE",
            self::TYPE => "MIMETYPE",
            self::ENCODING => "CHARACTER ENCODING",
            self::DESCRIPTION => "DESCRIPTION",
            self::PAGE => "PAGE DATA");
        $archive_type = $this->getArchiveKind($archive_path);
        if($archive_type === false) {
            $this->badFormatMessageAndExit($archive_path);
        }

        $nonyioop = false;
        //for yioop archives we set up a dummy iterator
        $iterator =  (object) array();
        $iterator->end_of_iterator = false;
        if($archive_type == "IndexArchiveBundle") {
            $info = $archive_type::getArchiveInfo($archive_path);
            $num = min($num, $info["COUNT"] - $start);
            $generation_info = unserialize(
                file_get_contents("$archive_path/generation.txt"));
            $num_generations = $generation_info['ACTIVE']+1;
            $archive = new WebArchiveBundle($archive_path."/summaries");
        } else if ($archive_type == "WebArchiveBundle") {
            $info = $archive_type::getArchiveInfo($archive_path);
            $num = min($num, $info["COUNT"] - $start);
            $num_generations = $info["WRITE_PARTITION"]+1;
            $archive = new WebArchiveBundle($archive_path);
        } else {
            $nonyioop = true;
            $num_generations = 1;
            //for non-yioop archives we set up a real iterator
            $iterator = $this->instantiateIterator($archive_path,
                $archive_type);
            if($iterator === false) {
                $this->badFormatMessageAndExit($archive_path);
            }
        }
        if(!$nonyioop) {
            if(isset($this->tmp_results)) unset($this->tmp_results);
        }
        $num = max($num, 0);
        // $total is the position just past the last document to output
        $total = $start + $num;
        $seen = 0;
        $generation = 0;
        while(!$iterator->end_of_iterator &&
            $seen < $total && $generation < $num_generations) {
            if($nonyioop) {
                /* non-yioop archives are treated as one partition and the
                   iterator is asked to move to $start itself
                 */
                $partition = (object) array();
                $partition->count = 1;
                $iterator->seekPage($start);
                if($iterator->end_of_iterator) { break; }
                $seen += $start;
            } else {
                $partition = $archive->getPartition($generation, false);
                /* skip a partition only if, counting what has already
                   been passed over, all of it lies before $start.
                   Comparing just the partition's own size with $start
                   would skip over the requested documents whenever
                   several partitions are each smaller than $start
                 */
                if($seen + $partition->count <= $start) {
                    $generation++;
                    $seen += $partition->count;
                    continue;
                }
            }
            $seen_generation = 0;
            while($seen < $total && $seen_generation < $partition->count) {
                if($nonyioop) {
                    $num_to_get = min(self::MAX_BUFFER_DOCS, $total - $seen);
                    $objects = $iterator->nextPages($num_to_get);
                    $seen += count($objects);
                } else {
                    $num_to_get = min($total - $seen,
                        $partition->count - $seen_generation,
                        self::MAX_BUFFER_DOCS);
                    $objects = $partition->nextObjects($num_to_get);
                    $seen += $num_to_get;
                    $seen_generation += $num_to_get;
                }
                $num_to_get = count($objects);
                if($seen >= $start) {
                    /* only the tail of this batch that is at or after
                       $start is output; earlier objects are discarded
                     */
                    $num_to_show = min($seen - $start, $num_to_get);
                    $cnt = 0;
                    $first = $num_to_get - $num_to_show;
                    foreach($objects as $pre_object) {
                        if($cnt >= $first) {
                            $out = "";
                            if($nonyioop) {
                                $object = $pre_object;
                            } else {
                                /* for yioop bundles the page data is the
                                   second entry of each object read
                                 */
                                if(!isset($pre_object[1])) continue;
                                $object = $pre_object[1];
                            }
                            if(isset($object[self::TIMESTAMP])) {
                                $object[self::TIMESTAMP] =
                                    date("r", $object[self::TIMESTAMP]);
                            }
                            foreach($fields_to_print as $key => $name) {
                                if(isset($object[$key])) {
                                    $out .= "[$name]\n";
                                    if($key != self::IP_ADDRESSES) {
                                        $out .= $object[$key]."\n";
                                    } else {
                                        //one address per line
                                        foreach($object[$key] as $address) {
                                            $out .= $address."\n";
                                        }
                                    }
                                }
                            }
                            $out .= "==========\n\n";
                            echo "BEGIN ITEM, LENGTH:".strlen($out)."\n";
                            echo $out;
                        }
                        $cnt++;
                    }
                }
                //nothing more could be read, so stop on this partition
                if($objects == NULL) break;
            }
            $generation++;
        }
        if(isset($this->tmp_results)) {
            //garbage collect savepoint folder for non-yioop archives
            $dbms_manager = DBMS."Manager";
            $db = new $dbms_manager();
            $db->unlinkRecursive($this->tmp_results);
        }
    }
    /**
     * Used to recompute both the index shards and the dictionary
     * of an index archive. The first step involves re-extracting the
     * word into an inverted index from the summaries' web_archives.
     * Then a reindex is done.
     *
     * @param string $archivepath file path to a IndexArchiveBundle
     */
    function rebuildIndexArchive($archive_path)
    {
        $archive_type = $this->getArchiveKind($archive_path);
        if($archive_type != "IndexArchiveBundle") {
            $this->badFormatMessageAndExit($archive_path);
        }
        $info = $archive_type::getArchiveInfo($archive_path);
        $tmp = unserialize($info["DESCRIPTION"]);
        $video_sources = $tmp[self::VIDEO_SOURCES];
        $generation_info = unserialize(
            file_get_contents("$archive_path/generation.txt"));
        $num_generations = $generation_info['ACTIVE']+1;
        $archive = new WebArchiveBundle($archive_path."/summaries");
        $seen = 0;
        $generation = 0;
        $keypad = "\x00\x00\x00\x00";
        while($generation < $num_generations) {
            $partition = $archive->getPartition($generation, false);
            $shard_name = $archive_path."/posting_doc_shards/index$generation";
            crawlLog("Processing partition $generation");
            if(file_exists($shard_name)) {
                crawlLog("..Unlinking old shard $generation");
                @unlink($shard_name);
            }
            $shard = new IndexShard($shard_name, $generation,
                NUM_DOCS_PER_GENERATION, true);
            $seen_partition = 0;
            while($seen_partition < $partition->count) {
                $num_to_get = min($partition->count - $seen_partition,
                    8000);
                $offset = $partition->iterator_pos;
                $objects = $partition->nextObjects($num_to_get);
                $cnt = 0;
                foreach($objects as $object) {
                    $cnt++;
                    $site = $object[1];
                    if(isset($site[self::TYPE]) && $site[self::TYPE] == "link"){
                        $is_link = true;
                        $doc_keys = $site[self::HTTP_CODE];
                        $site_url = $site[self::TITLE];
                        $host =  UrlParser::getHost($site_url);
                        $link_parts = explode('|', $site[self::HASH]);
                        if(isset($link_parts[5])) {
                            $link_origin = $link_parts[5];
                        } else {
                            $link_origin = $site_url;
                        }
                        $meta_ids = PhraseParser::calculateLinkMetas($site_url,
                            $host, $site[self::DESCRIPTION], $link_origin);
                        $link_to = "LINK TO:";
                    } else {
                        $is_link = false;
                        $site_url = str_replace('|', "%7C", $site[self::URL]);
                        $host = UrlParser::getHost($site_url);
                        $doc_keys = crawlHash($site_url, true) .
                            $site[self::HASH]."d". substr(crawlHash(
                            $host."/",true), 1);
                        $meta_ids =  PhraseParser::calculateMetas($site,
                            $video_sources);
                        $link_to = "";
                    }
                    $so_far_cnt = $seen_partition + $cnt;
                    $time_out_message = "..still processing $so_far_cnt ".
                        "of {$partition->count} in partition $generation.".
                        "\n..Last processed was: ".
                        ($seen + 1).". $link_to$site_url. ";
                    crawlTimeoutLog($time_out_message);
                    $seen++;
                    $word_lists = array();
                    /*
                        self::JUST_METAS check to avoid getting sitemaps in
                        results for popular words
                     */
                    $lang = NULL;
                    if(!isset($site[self::JUST_METAS])) {
                        $host_words = UrlParser::getWordsIfHostUrl($site_url);
                        $path_words = UrlParser::getWordsLastPathPartUrl(
                            $site_url);
                        if($is_link) {
                            $phrase_string = $site[self::DESCRIPTION];
                        } else {
                            $phrase_string = $host_words." ".$site[self::TITLE]
                                . " ". $path_words . " "
                                . $site[self::DESCRIPTION];
                        }
                        if(isset($site[self::LANG])) {
                            $lang = guessLocaleFromString(
                                mb_substr($site[self::DESCRIPTION], 0,
                                AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                        }
                        $word_lists =
                            PhraseParser::extractPhrasesInLists($phrase_string,
                                $lang);
                        $len = strlen($phrase_string);
                        if(PhraseParser::computeSafeSearchScore($word_lists,
                            $len) < 0.012) {
                            $meta_ids[] = "safe:true";
                            $safe = true;
                        } else {
                            $meta_ids[] = "safe:false";
                            $safe = false;
                        }
                    }
                    if(isset($site[self::USER_RANKS]) &&
                        count($site[self::USER_RANKS]) > 0) {
                        $score_keys = "";
                        foreach($site[self::USER_RANKS] as $label => $score) {
                            $score_keys .= packInt($score);
                        }
                        if(strlen($score_keys) % 8 != 0) {
                            $score_keys .= $keypad;
                        }
                        $doc_keys .= $score_keys;
                    }
                    $shard->addDocumentWords($doc_keys, $offset,
                        $word_lists, $meta_ids,
                        PhraseParser::$materialized_metas, true, false);
                    $offset = $object[0];
                }
                $seen_partition += $num_to_get;
            }
            $shard->save(false, true);
            $generation++;
        }
        $this->reindexIndexArchive($archive_path);
    }

    /**
     * Used to create an archive_bundle_iterator for a non-yioop archive.
     * As these iterators sometimes make use of a folder to store savepoints,
     * we create a temporary folder for this purpose under
     * WORK_DIRECTORY/temp (any folder left over from an earlier run on the
     * same archive is deleted first). This should be garbage collected
     * elsewhere.
     *
     * @param string $archive_path path to non-yioop archive
     * @param string $iterator_type name of archive_bundle_iterator used to
     *      iterate over archive.
     * @return object an ArchiveBundleIterator of the correct type using
     *      a temporary folder to store savepoints
     */
    function instantiateIterator($archive_path, $iterator_type)
    {
        /* the change time of the archive folder is used both to name the
           scratch folder and as the first argument given to the iterator */
        $archive_ctime = filectime($archive_path);
        $now = strval(time());
        $scratch_dir = WORK_DIRECTORY . '/temp/TmpArchiveExtract' .
            $archive_ctime;
        // remembered on the object so it is still known after we return
        $this->tmp_results = $scratch_dir;
        $manager_class = DBMS . "Manager";
        $manager = new $manager_class();
        // always begin with an empty scratch folder
        if(file_exists($scratch_dir)) {
            $manager->unlinkRecursive($scratch_dir);
        }
        @mkdir($scratch_dir);
        // the iterator class name is the archive type with Iterator appended
        $class_name = $iterator_type . "Iterator";
        $bundle_iterator = new $class_name($archive_ctime, $archive_path,
            $now, $scratch_dir);
        $manager->setWorldPermissionsRecursive($scratch_dir);
        return $bundle_iterator;
    }


    /**
     * Given a folder name, determines the kind of bundle (if any) it holds.
     * It does this based on the expected location of the description.txt file,
     * or arc_description.ini (in the case of a non-yioop archive)
     *
     * @param string $archive_path the path to archive folder
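     * @return mixed the archive bundle type: WebArchiveBundle,
     *      IndexArchiveBundle, or (for a non-yioop archive) the arc_type
     *      value read from arc_description.ini; false if the kind could
     *      not be determined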
     */
    function getArchiveKind($archive_path)
    {
        /* yioop bundles are recognized by where their description.txt
           sits; the entries are tried in the order listed */
        $yioop_layouts = array(
            "WebArchiveBundle" => "$archive_path/description.txt",
            "IndexArchiveBundle" => "$archive_path/summaries/description.txt",
        );
        foreach($yioop_layouts as $kind => $marker_file) {
            if(file_exists($marker_file)) {
                return $kind;
            }
        }
        // a non-yioop archive names its own type in arc_description.ini
        $ini_file = "$archive_path/arc_description.ini";
        if(!file_exists($ini_file)) {
            return false;
        }
        $ini = parse_ini_with_fallback($ini_file);
        return isset($ini['arc_type']) ? $ini['arc_type'] : false;
    }
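    /**
     * Outputs the "hey, this isn't a known bundle" message and then exit()'s.
     * @param string $archive_name name or path to what was supposed to be
     *      an archive
     * @param string $allowed_archives phrase naming the kinds of archive
     *      that would have been accepted; it is inserted into the message
     */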
    function badFormatMessageAndExit($archive_name,
        $allowed_archives = "web or index")
    {
        /* build the complaint first, then print it; the blank lines inside
           the heredoc set the message off from surrounding console output */
        $complaint = <<< EOD

{$archive_name} does not appear to be a {$allowed_archives} archive bundle

EOD;
        echo $complaint;
        exit();
    }

    /**
     * Outputs the "how to use this tool" message and then exit()'s.
     */
    function usageMessageAndExit()
    {
        /* the help text contains nothing to interpolate, so it is kept in
           a nowdoc and printed verbatim before the script terminates */
        $usage = <<<'EOD'

arc_tool is used to look at the contents of WebArchiveBundles and
IndexArchiveBundles. It will look for these using the path provided or
will check in the Yioop! crawl directory as a fall back.

The available commands for arc_tool are:

php arc_tool.php dict bundle_name word
    // returns index dictionary records for word stored in index archive bundle.

php arc_tool.php info bundle_name
    // return info about documents stored in archive.

php arc_tool.php list
    /* returns a list of all the archives in the Yioop! crawl directory,
       including non-Yioop! archives in the /archives sub-folder.*/

php arc_tool.php mergetiers bundle_name max_tier
    // merges tiers of word dictionary into one tier up to max_tier

php arc_tool.php posting bundle_name generation offset
    or
php arc_tool.php posting bundle_name generation offset num
    /* returns info about the posting (num many postings) in bundle_name at
       the given generation and offset */

php arc_tool.php rebuild bundle_name
    /*  re-extracts words from summaries files in bundle_name into index shards
        then builds a new dictionary */

php arc_tool.php reindex bundle_name
    // reindex the word dictionary in bundle_name using existing index shards

php arc_tool.php shard bundle_name generation
    /* Prints information about the number of words and frequencies of words
       within the generation'th index shard in the bundle */

php arc_tool.php show bundle_name start num
    /* outputs items start through num from bundle_name or name of
       non-Yioop archive crawl folder */

EOD;
        echo $usage;
        exit();
    }
}

/*
    Entry point when this file is run: create an ArcTool object and
    invoke its start() method.
 */
$arc_tool =  new ArcTool();
$arc_tool->start();
?>

ViewGit