Last commit for lib/index_archive_bundle.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]
First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2014  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage library
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2014
 * @filesource
 */
/*
 * Direct-access guard: if the BASE_DIR constant has not been defined by the
 * time this file is loaded, print "BAD REQUEST" and stop the script.
 * NOTE(review): BASE_DIR is presumably defined by the application's entry
 * point/configuration before library files are included -- confirm.
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
 * Summaries and word document list stored in WebArchiveBundle's so load it
 */
require_once 'web_archive_bundle.php';
/**
 * Used to store word index
 */
require_once 'index_shard.php';
/**
 * Used to store word dictionary
 */
require_once 'index_dictionary.php';
/**
 * Used for crawlLog and crawlHash
 */
require_once 'utility.php';
/**
 * Loads common constants for web crawling
 */
require_once 'crawl_constants.php';
/**
 * Encapsulates a set of web page summaries and an inverted word-index of terms
 * from these summaries which allow one to search for summaries containing a
 * particular word.
 *
 * The basic file structures for an IndexArchiveBundle are:
 * <ol>
 * <li>A WebArchiveBundle for web page summaries.</li>
 * <li>A IndexDictionary containing all the words stored in the bundle.
 * Each word entry in the dictionary contains starting and ending
 * offsets for documents containing that word for some particular IndexShard
 * generation.</li>
 * <li>A set of index shard generations. These generations
 *  have names index0, index1,... A shard has word entries, word doc entries
 *  and document entries. For more information see the index shard
 * documentation.
 * </li>
 * <li>
 * The file generation.txt keeps track of what is the current generation.
 * A given generation can hold up to num_docs_per_generation documents
 * (NUM_DOCS_PER_GENERATION by default), after which the next generation
 * begins.
 * </li>
 * </ol>
 *
 *
 * @author Chris Pollett
 * @package seek_quarry
 * @subpackage library
 */
class IndexArchiveBundle implements CrawlConstants
{
    /**
     * Folder name to use for this IndexArchiveBundle
     * @var string
     */
    public $dir_name;
    /**
     * A short text name for this IndexArchiveBundle
     * @var string
     */
    public $description;
    /**
     * Number of partitions in the summaries WebArchiveBundle
     * (declared for callers; this class itself never assigns it)
     * @var int
     */
    public $num_partitions_summaries;
    /**
     * Structure containing info about the generations of this bundle.
     * Keys used by this class are ACTIVE (index of the last, writable
     * generation), CURRENT (index of the generation whose shard is
     * presently selected for reading) and DISK_BASED (whether the
     * current shard should be left on disk rather than fully loaded).
     * @var array
     */
    public $generation_info;
    /**
     * Number of docs before a new generation is started
     * @var int
     */
    public $num_docs_per_generation;
    /**
     * WebArchiveBundle for web page summaries
     * @var object
     */
    public $summaries;
    /**
     * IndexDictionary for all shards in the IndexArchiveBundle
     * This contains entries of the form (word, num_shards with word,
     * posting list info 0th shard containing the word,
     * posting list info 1st shard containing the word, ...)
     * @var object
     */
    public $dictionary;
    /**
     * Index Shard for current generation inverted word index
     * @var object
     */
    public $current_shard;
    /**
     * What version of index archive bundle this is
     * @var int
     */
    public $version;
    /**
     * Threshold (in bytes of shard file size) beyond which we don't load
     * an old index shard when restarting and instead just advance to a
     * new shard
     */
    const NO_LOAD_SIZE = 50000000;
    /**
     * Makes or initializes an IndexArchiveBundle with the provided parameters
     *
     * If the folder does not exist and the bundle is opened read-only the
     * constructor stops early and leaves the object uninitialized (the
     * return value of a constructor is ignored by new).
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether the bundle is only going to be
     *      read from; if false, missing folders and the generation.txt file
     *      are created
     * @param string $description a text name/serialized info about this
     *      IndexArchiveBundle
     * @param int $num_docs_per_generation number of documents a generation
     *      may hold before a new generation is started
     */
    function __construct($dir_name, $read_only_archive = true,
        $description = NULL, $num_docs_per_generation = NUM_DOCS_PER_GENERATION)
    {
        $this->dir_name = $dir_name;
        if (!is_dir($this->dir_name)) {
            if ($read_only_archive) {
                // nothing on disk to read and we are not allowed to create it
                return false;
            }
            mkdir($this->dir_name);
            mkdir($this->dir_name . "/posting_doc_shards");
        }
        $generation_file = $this->dir_name . "/generation.txt";
        $generation_file_exists = file_exists($generation_file);
        $stored_info = false;
        if ($generation_file_exists) {
            $stored_info = unserialize(file_get_contents($generation_file));
        }
        /* Always end up with a usable array: a missing file in read-only
           mode, or an unreadable/corrupt file, would otherwise leave
           generation_info as NULL/false and make every later ['ACTIVE']
           lookup produce NULL.
         */
        $this->generation_info = is_array($stored_info) ? $stored_info :
            array();
        if (!isset($this->generation_info['ACTIVE'])) {
            $this->generation_info['ACTIVE'] = 0;
        }
        if (!$generation_file_exists && !$read_only_archive) {
            file_put_contents($generation_file,
                serialize($this->generation_info));
        }
        $this->summaries = new WebArchiveBundle($dir_name . "/summaries",
            $read_only_archive, -1, $description);
        if (!$read_only_archive) {
            $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT");
        }
        // the description actually stored with the summaries is authoritative
        $this->description = $this->summaries->description;
        if (isset($this->summaries->version)) {
            $this->version = $this->summaries->version;
        }
        $this->num_docs_per_generation = $num_docs_per_generation;
        $this->dictionary = new IndexDictionary(
            $this->dir_name . "/dictionary");
    }
    /**
     * Add the array of $pages to the summaries WebArchiveBundle pages being
     * stored in the partition $generation and the field used
     * to store the resulting offsets given by $offset_field.
     *
     * @param int $generation field used to select partition
     * @param string $offset_field field used to record offsets after storing
     * @param array &$pages data to store
     * @param int $visited_urls_count number to add to the count of visited urls
     *      (visited urls is a smaller number than the total count of objects
     *      stored in the index).
     */
    function addPages($generation, $offset_field, &$pages,
        $visited_urls_count)
    {
        $this->summaries->setWritePartition($generation);
        $this->summaries->addPages($offset_field, $pages);
        $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
    }
    /**
     * Adds the provided mini inverted index data to the IndexArchiveBundle
     * Expects initGenerationToAdd to be called before, so generation is correct
     *
     * @param object $index_shard a mini inverted index of word_key=>doc data
     *      to add to this IndexArchiveBundle
     */
    function addIndexData($index_shard)
    {
        crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
        $start_time = microtime();
        $this->getActiveShard()->appendIndexShard($index_shard);
        crawlLog("Append Index Shard: Memory usage:".memory_get_usage() .
          " Time: ".(changeInMicrotime($start_time)));
    }
    /**
     * Determines based on its size, if index_shard should be added to
     * the active generation or if a new generation should be started.
     * A new generation is also started when memory usage exceeds 65% of
     * PHP's memory limit. If a switch is needed (and allowed), the old
     * generation is saved, the dictionary of the old shard is copied to the
     * bundle's dictionary and a log-merge performed if needed.
     *
     * @param int $add_num_docs number of docs in the shard about to be added
     * @param object $callback object with join function to be
     *      called if process is taking too long
     * @param bool $blocking if true, then rather than switching generations
     *      this method just reports that a switch would be needed by
     *      returning -1
     * @return int the active generation after the check and possible change
     *      has been performed, or -1 if a switch is needed but $blocking
     *      was set
     */
    function initGenerationToAdd($add_num_docs, $callback = NULL,
        $blocking = false)
    {
        $current_num_docs = $this->getActiveShard()->num_docs;
        crawlLog("Current index shard has ".$current_num_docs." documents.");
        $memory_limit = metricToInt(ini_get("memory_limit"));
        crawlLog("Memory Indexer limit is ".$memory_limit.". Usage is ".
            memory_get_usage());
        $shard_would_overflow = ($current_num_docs + $add_num_docs >
            $this->num_docs_per_generation);
        /* A memory_limit of -1 means "no limit"; without the positivity
           check the comparison below would always hold and every call
           would force a shard switch.
         */
        $memory_nearly_exhausted = ($memory_limit > 0 &&
            (0.65 * $memory_limit) < memory_get_usage());
        if ($shard_would_overflow || $memory_nearly_exhausted) {
            if ($blocking == true) {
                return -1;
            }
            crawlLog("Switching Index Shard...");
            $switch_time = microtime();
            // Save current shard dictionary to main dictionary
            $this->forceSave();
            $this->addAdvanceGeneration($callback);
            crawlLog("Switch Index Shard time:".
                changeInMicrotime($switch_time));
        }
        return $this->generation_info['ACTIVE'];
    }
    /**
     * Starts a new generation,  the dictionary of the old shard is copied to
     * the bundles dictionary and a log-merge performed if needed. This
     * function may be called by initGenerationToAdd as well as when resuming
     * a crawl rather than loading the periodic index of save of a too large
     * shard.
     *
     * @param object $callback object with join function to be
     *      called if process is taking too long
     */
    function addAdvanceGeneration($callback = NULL)
    {
        $this->addCurrentShardDictionary($callback);
        //Set up new shard
        $this->generation_info['ACTIVE']++;
        $new_generation = $this->generation_info['ACTIVE'];
        $this->generation_info['CURRENT'] = $new_generation;
        $this->current_shard = new IndexShard(
            $this->shardFileName($new_generation), $new_generation,
            $this->num_docs_per_generation);
        // persist the new generation numbers
        file_put_contents($this->dir_name . "/generation.txt",
            serialize($this->generation_info));
    }
    /**
     * Adds the words from the active generation's shard to the dictionary
     *
     * @param object $callback object with join function to be
     *      called if process is taking too  long
     */
    function addCurrentShardDictionary($callback = NULL)
    {
        $active_generation = $this->generation_info['ACTIVE'];
        /* want to do the copying of dictionary as files to conserve memory
           in case merge tiers after adding to dictionary
        */
        $this->current_shard = new IndexShard(
            $this->shardFileName($active_generation), $active_generation,
            $this->num_docs_per_generation, true);
        $this->dictionary->addShardDictionary($this->current_shard, $callback);
    }
    /**
     * Sets the current shard to be the active shard (the active shard is
     * what we call the last (highest indexed) shard in the bundle. Then
     * returns a reference to this shard
     *
     * If the active shard was already the current one but no shard object
     * has been instantiated yet, a fresh IndexShard object is constructed
     * for it (nothing is loaded via IndexShard::load in that case).
     *
     * @return object last shard in the bundle
     */
    function getActiveShard()
    {
        if ($this->setCurrentShard($this->generation_info['ACTIVE'])) {
            return $this->getCurrentShard();
        }
        if (!isset($this->current_shard)) {
            $current_generation = $this->generation_info['CURRENT'];
            $this->current_shard = new IndexShard(
                $this->shardFileName($current_generation),
                $current_generation, $this->num_docs_per_generation);
        }
        return $this->current_shard;
    }
    /**
     * Returns the shard which is currently being used to read word-document
     * data from the bundle. If one wants to write data to the bundle use
     * getActiveShard() instead. The point of this method is to allow
     * for lazy reading of the file associated with the shard.
     *
     * Note: if the shard file is not disk based and is larger than
     * NO_LOAD_SIZE, it is not loaded; instead addAdvanceGeneration() is
     * called, so the returned shard belongs to a newly started generation.
     *
     * @return object the index shard currently being used
     */
    function getCurrentShard()
    {
        if (isset($this->current_shard)) {
            return $this->current_shard;
        }
        if (!isset($this->generation_info['CURRENT'])) {
            $this->generation_info['CURRENT'] =
                $this->generation_info['ACTIVE'];
        }
        $current_generation = $this->generation_info['CURRENT'];
        $shard_file = $this->shardFileName($current_generation);
        if (!file_exists($shard_file)) {
            // nothing saved yet for this generation, start an empty shard
            $this->current_shard = new IndexShard($shard_file,
                $current_generation, $this->num_docs_per_generation);
        } else if (isset($this->generation_info['DISK_BASED']) &&
            $this->generation_info['DISK_BASED'] == true) {
            // leave postings on disk; only the shard header is read now
            $this->current_shard = new IndexShard($shard_file,
                $current_generation, $this->num_docs_per_generation, true);
            $this->current_shard->getShardHeader();
            $this->current_shard->read_only_from_disk = true;
        } else if (filesize($shard_file) > self::NO_LOAD_SIZE) {
            // too big to load, sets $this->current_shard to a new shard
            $this->addAdvanceGeneration();
        } else {
            $this->current_shard = IndexShard::load($shard_file);
        }
        return $this->current_shard;
    }
    /**
     * Sets the current shard to be the $i th shard in the index bundle.
     * The DISK_BASED flag of the generation info is updated to $disk_based
     * regardless of whether the current shard ends up being changed.
     *
     * @param int $i which shard to set the current shard to be
     * @param bool $disk_based whether to read the whole shard in before
     *      using or leave it on disk except for pages need and use memcache
     * @return bool true if the current shard was changed (any previously
     *      instantiated shard object is discarded); false if $i already
     *      was the current shard or $i is beyond the active generation
     */
    function setCurrentShard($i, $disk_based = false)
    {
        $this->generation_info['DISK_BASED'] = $disk_based;
        if (isset($this->generation_info['CURRENT'])) {
            $already_current = ($i == $this->generation_info['CURRENT']);
            $out_of_range = ($i > $this->generation_info['ACTIVE']);
            if ($already_current || $out_of_range) {
                return false;
            }
        }
        $this->generation_info['CURRENT'] = $i;
        unset($this->current_shard);
        return true;
    }
    /**
     * Gets the page out of the summaries WebArchiveBundle with the given
     * offset and generation
     *
     * @param int $offset byte offset in partition of desired page
     * @param int $generation which generation WebArchive to look up in
     *      defaults to the same number as the current shard (or the active
     *      generation if no current shard has been selected yet)
     * @return array desired page
     */
    function getPage($offset, $generation = -1)
    {
        if ($generation == -1) {
            /* CURRENT is only set once a shard has been selected, so fall
               back to ACTIVE the same way getCurrentShard() does
             */
            $generation = isset($this->generation_info['CURRENT']) ?
                $this->generation_info['CURRENT'] :
                $this->generation_info['ACTIVE'];
        }
        return $this->summaries->getPage($offset, $generation);
    }
    /**
     * Forces the active shard to be saved
     */
    function forceSave()
    {
        $this->getActiveShard()->save(false, true);
    }
    /**
     * Computes the number of occurrences of each of the supplied list of
     * word_keys
     *
     * @param array $word_keys keys to compute counts for
     * @return array associative array of key => count values, or NULL if
     *      $word_keys is not a non-empty array
     */
    function countWordKeys($word_keys)
    {
        if (!is_array($word_keys) || count($word_keys) < 1) {
            return NULL;
        }
        $words_array = array();
        foreach ($word_keys as $word_key) {
            $word_info = $this->dictionary->getWordInfo($word_key);
            $count = 0;
            if ($word_info !== false) {
                // component 3 of each dictionary entry is added to the total
                foreach ($word_info as $entry) {
                    $count += $entry[3];
                }
            }
            $words_array[$word_key] = $count;
        }
        return $words_array;
    }
    /**
     * Gets the description, count of summaries, and number of partitions of the
     * summaries store in the supplied directory. If the file
     * arc_description.txt exists, this is viewed as a dummy index archive for
     * the sole purpose of allowing conversions of downloaded data such as arc
     * files into Yioop! format.
     *
     * @param string $dir_name path to a directory containing a summaries
     *      WebArchiveBundle
     * @return array summary of the given archive
     */
    static function getArchiveInfo($dir_name)
    {
        $arc_description_file = $dir_name . "/arc_description.txt";
        if (!file_exists($arc_description_file)) {
            return WebArchiveBundle::getArchiveInfo($dir_name . "/summaries");
        }
        // dummy archive: only the first 256 bytes of the description are used
        $crawl = array();
        $crawl['DESCRIPTION'] = substr(
            file_get_contents($arc_description_file), 0, 256);
        $crawl['ARCFILE'] = true;
        $info = array();
        $info['VISITED_URLS_COUNT'] = 0;
        $info['COUNT'] = 0;
        $info['NUM_DOCS_PER_PARTITION'] = 0;
        $info['WRITE_PARTITION'] = 0;
        $info['DESCRIPTION'] = serialize($crawl);
        return $info;
    }
    /**
     * Sets the archive info (DESCRIPTION, COUNT,
     * NUM_DOCS_PER_PARTITION) for the web archive bundle associated with
     * this bundle. As DESCRIPTION is used to store info about the info
     * bundle this sets the global properties of the info bundle as well.
     *
     * @param string $dir_name folder with archive bundle
     * @param array $info struct with above fields
     */
    static function setArchiveInfo($dir_name, $info)
    {
        WebArchiveBundle::setArchiveInfo($dir_name . "/summaries", $info);
    }
    /**
     * Returns the last time the archive info of the bundle was modified.
     *
     * @param string $dir_name folder with archive bundle
     * @return mixed whatever WebArchiveBundle::getParamModifiedTime reports
     *      for the summaries sub-folder
     */
    static function getParamModifiedTime($dir_name)
    {
        return WebArchiveBundle::getParamModifiedTime(
            $dir_name . "/summaries");
    }
    /**
     * Builds the path of the file holding the index shard of a generation
     *
     * @param mixed $generation index of the generation
     * @return string path of the form dir_name/posting_doc_shards/indexN
     */
    private function shardFileName($generation)
    {
        return $this->dir_name . "/posting_doc_shards/index" . $generation;
    }
}
?>
ViewGit