Last commit for lib/archive_bundle_iterators/mix_archive_bundle_iterator.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]
First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage iterator
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2015
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
 * Load FileCache class in case used
 */
require_once(BASE_DIR."/lib/file_cache.php");
/**
 *Loads base class for iterating
 */
require_once BASE_DIR.
    '/lib/archive_bundle_iterators/archive_bundle_iterator.php';
/** Loads common constants for web crawling*/
require_once BASE_DIR."/lib/crawl_constants.php";

/** Loads functions for working with locales */
require_once BASE_DIR."/lib/locale_functions.php";
/** Loads SearchController, which is used to carry out the crawl mix query */
require_once BASE_DIR."/controllers/search_controller.php";
/**
 * Used to do an archive crawl based on the results of a crawl mix.
 * the query terms for this crawl mix will have site:any raw 1 appended to them
 *
 * @author Chris Pollett
 * @package seek_quarry
 * @subpackage iterator
 */
class MixArchiveBundleIterator extends ArchiveBundleIterator
    implements CrawlConstants
{
    /**
     * Used to hold timestamp of the crawl mix being used to iterate over
     *
     * @var int
     */
    var $mix_timestamp;
    /**
     * Used to hold timestamp of the index archive bundle of output results
     *
     * @var int
     */
    var $result_timestamp;
    /**
     * count of how far out into the crawl mix we've gone.
     *
     * @var int
     */
    var $limit;
    /**
     * Query string passed to the search controller on each call to
     * nextPages(); built in the constructor as "site:any m:" followed by
     * the mix timestamp
     *
     * @var string
     */
    var $query;
    /**
     * Controller whose queryRequest() method is used to fetch the next
     * batch of pages and whose clearQuerySavepoint() is used on reset()
     *
     * @var object
     */
    var $searchController;
    /**
     * Last SAVE_POINT value returned by the search controller (only set
     * once a query result contained a SAVE_POINT entry)
     *
     * @var array
     */
    var $save_points;
    /**
     * Creates a web archive iterator with the given parameters.
     *
     * If a checkpoint file already exists for $result_timestamp the
     * iterator resumes from it, otherwise it starts from the beginning.
     *
     * @param string $mix_timestamp timestamp of the crawl mix to
     *     iterate over the pages of
     * @param string $result_timestamp timestamp of the web archive bundle
     *     results are being stored in
     */
    function __construct($mix_timestamp, $result_timestamp)
    {
        global $INDEXING_PLUGINS;
        setLocaleObject(getLocaleTag());

        $this->mix_timestamp = $mix_timestamp;
        $this->result_timestamp = $result_timestamp;
        $this->query = "site:any m:".$mix_timestamp;
        /* must be created before reset() is called below, as reset()
           uses it to clear the query save point
         */
        $this->searchController = new SearchController($INDEXING_PLUGINS);
        $archive_name = $this->getArchiveName($result_timestamp);
        if(!file_exists($archive_name)) {
            mkdir($archive_name);
        }
        if(file_exists("$archive_name/iterate_status.txt")) {
            $this->restoreCheckpoint();
        } else {
            $this->reset();
        }
    }
    /**
     * Get the path of the folder that holds information about the
     * current archive iterator (such as the iterate_status.txt file which
     * records whether the end of the iterator has been reached)
     *
     * @param int $timestamp of current archive crawl
     * @return string path of the iterator's folder under
     *     CRAWL_DIR/schedules
     */
    function getArchiveName($timestamp)
    {
        return CRAWL_DIR."/schedules/".self::name_archive_iterator.$timestamp;
    }
    /**
     * Saves the current state so that a new instantiation can pick up just
     * after the last batch of pages extracted.
     *
     * @param array $info data needed to restore where we are in the process
     *      of iterating through archive. By default save fields LIMIT and
     *      END_OF_ITERATOR
     */
    function saveCheckpoint($info = array())
    {
        if($info == array()) {
            $info["LIMIT"] = $this->limit;
            $info["END_OF_ITERATOR"] = $this->end_of_iterator;
        }
        $archive_name = $this->getArchiveName($this->result_timestamp);
        file_put_contents("$archive_name/iterate_status.txt",
            serialize($info));
    }
    /**
     * Restores state from a previous instantiation, after the last batch of
     * pages extracted.
     *
     * If the checkpoint file cannot be read or does not contain a
     * serialized array, the iterator is treated as not yet finished and,
     * if no limit has been set so far, the limit starts from 0.
     */
    function restoreCheckpoint()
    {
        $archive_name = $this->getArchiveName($this->result_timestamp);
        $status_file = "$archive_name/iterate_status.txt";
        $info = array();
        if(is_readable($status_file)) {
            $contents = file_get_contents($status_file);
            if($contents !== false) {
                $restored = unserialize($contents);
                /* unserialize gives false on corrupt data; only trust
                   the payload if it has the array shape saveCheckpoint
                   writes
                 */
                if(is_array($restored)) {
                    $info = $restored;
                }
            }
        }
        if(isset($info["LIMIT"])) {
            $this->limit = $info["LIMIT"];
        } else if(!isset($this->limit)) {
            // never leave limit undefined; it is used as a query offset
            $this->limit = 0;
        }
        if(isset($info["END_OF_ITERATOR"])) {
            $this->end_of_iterator = $info["END_OF_ITERATOR"];
        } else {
            $this->end_of_iterator = false;
        }
    }
    /**
     * Estimates the importance of the site according to the weighting of
     * the particular archive iterator
     * @param $site an associative array containing info about a web page
     * @return bool false we assume files were crawled roughly according to
     *     page importance so we use default estimate of doc rank
     */
    function weight(&$site)
    {
        return false;
    }
    /**
     * Gets the next $num many docs from the iterator
     *
     * The pages come from running the crawl mix query through the search
     * controller starting at offset $this->limit. After each call the
     * current position is written out with saveCheckpoint().
     *
     * @param int $num number of docs to get
     * @param bool $no_process this flag is inherited from base class but
     *     does not do anything in this case
     * @return array associative arrays for up to $num pages together with
     *     a "NO_PROCESS" entry, which is true when pages were returned and
     *     false when the iterator had nothing (more) to return
     */
    function nextPages($num, $no_process = false)
    {
        $objects = array("NO_PROCESS" => false);
        if($this->end_of_iterator) {
            return $objects;
        }
        $results = $this->searchController->queryRequest($this->query,
            $num, $this->limit, 1, $this->result_timestamp);
        /* Only count PAGES once we know it is there and is an array;
           calling count() on a missing key raises a notice/warning on
           older PHP versions and a TypeError on PHP 8
         */
        $pages = array();
        if(isset($results["PAGES"]) && is_array($results["PAGES"])) {
            $pages = $results["PAGES"];
        }
        $num_results = count($pages);
        if($num_results > 0) {
            $objects = $pages;
            $this->limit += $num_results;
            $objects["NO_PROCESS"] = true;
        } else {
            // no pages came back, so there is nothing left to iterate over
            $this->end_of_iterator = true;
        }
        if(isset($results["SAVE_POINT"]) ){
            /* the iterator is also finished once every save point
               is -1
             */
            $end = true;
            foreach($results["SAVE_POINT"] as $save_point)  {
                if($save_point != -1) {
                    $end = false;
                    break;
                }
            }
            $this->save_points = $results["SAVE_POINT"];
            if($end) {
                $this->end_of_iterator = true;
            }
        }
        $this->saveCheckpoint();
        return $objects;
    }
    /**
     * Resets the iterator to the start of the archive bundle
     *
     * Besides rewinding the local position, this clears the search
     * controller's query save point for the result timestamp and writes
     * a fresh checkpoint.
     */
    function reset()
    {
        $this->limit = 0;
        $this->end_of_iterator = false;
        $this->searchController->clearQuerySavepoint($this->result_timestamp);
        $this->saveCheckpoint();
    }
}
?>
ViewGit