Last commit for models/crawl_model.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2015
 * @filesource
 */
if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/** For base class*/
require_once BASE_DIR."/models/parallel_model.php";
/** For deleting save points*/
require_once BASE_DIR."/controllers/search_controller.php";
/** used to prevent cache page requests from being logged*/
if (!defined("POST_PROCESSING") && !defined("NO_LOGGING")) {
    define("NO_LOGGING", true);
}
/**
 * This is class is used to handle getting/setting crawl parameters, CRUD
 * operations on current crawls, start, stop, status of crawls,
 * getting cache files out of crawls, determining
 * what is the default index to be used, marshalling/unmarshalling crawl mixes,
 * and handling data from suggest-a-url forms
 *
 * @author Chris Pollett
 * @package seek_quarry\model
 */
class CrawlModel extends ParallelModel implements CrawlConstants
{
    /**
     * Used to map between search crawl mix form variables and database columns
     * @var array
     */
    var $search_table_column_map = array("name"=>"NAME",
        "owner_id"=>"OWNER_ID");

    /**
     * File to be used to store suggest-a-url form data
     * @var string
     */
    var $suggest_url_file;
    /**
     * {@inheritDoc}
     *
     * @param string $db_name the name of the database for the search engine
     * @param bool $connect whether to connect to the database by default
     *     after making the datasource class
     */
    function __construct($db_name = DB_NAME, $connect = true)
    {
        $this->suggest_url_file = WORK_DIRECTORY."/data/suggest_url.txt";
        parent::__construct($db_name, $connect);
    }
    /**
     * {@inheritDoc}
     *
     * @param mixed $args any additional arguments which should be used to
     *     determine these tables (in this case none)
     * @return string a comma separated list of tables suitable for a SQL
     *     query
     */
    function fromCallback($args = null)
    {
        return "CRAWL_MIXES";
    }
    /**
     * {@inheritDoc}
     *
     * @param array $row row as retrieved from database query
     * @param mixed $args additional arguments that might be used by this
     *      callback. In this case, should be a boolean flag that says whether
     *      or not to add information about the components of the crawl mix
     * @return array $row after callback manipulation
     */
    function rowCallback($row, $args)
    {
        if ($args) {
            $mix = $this->getCrawlMix($row['TIMESTAMP'], true);
            $row['FRAGMENTS'] = $mix['FRAGMENTS'];
        }
        return $row;
    }

    /**
     * Gets the cached version of a web page from the machine on which it was
     * fetched.
     *
     * Complete cached versions of web pages typically only live on a fetcher
     * machine. The queue server machine typically only maintains summaries.
     * This method makes a REST request of a fetcher machine for a cached page
     * and get the results back.
     *
     * @param string $machine the ip address of domain name of the machine the
     *     cached page lives on
     * @param string $machine_uri the path from document root on $machine where
     *     the yioop scripts live
     * @param int $partition the partition in the WebArchiveBundle the page is
     *      in
     * @param int $offset the offset in bytes into the WebArchive partition in
     *     the WebArchiveBundle at which the cached page lives.
     * @param string $crawl_time the timestamp of the crawl the cache page is
     *     from
     * @param int $instance_num which fetcher instance for the particular
     *     fetcher crawled the page (if more than one), false otherwise
     * @return array page data of the cached page
     */
    function getCacheFile($machine, $machine_uri, $partition,
        $offset, $crawl_time, $instance_num = false)
    {
        $time = time();
        $session = md5($time . AUTH_KEY);
        if ($machine == '::1') { //IPv6 :(
            $machine = "[::1]";
            //used if the fetching and queue serving were on the same machine
        }
        // we assume all machines use the same scheme & port of the name server
        $port = UrlParser::getPort(NAME_SERVER);
        $scheme = UrlParser::getScheme(NAME_SERVER);
        $request = "$scheme://$machine:$port$machine_uri?c=archive&a=cache&".
            "time=$time&session=$session&partition=$partition&offset=$offset".
            "&crawl_time=$crawl_time";
        if ($instance_num !== false) {
            $request .= "&instance_num=$instance_num";
        }
        $tmp = FetchUrl::getPage($request);

        $page = @unserialize(base64_decode($tmp));
        $page['REQUEST'] = $request;

        return $page;
    }
    /**
     * Gets the name (aka timestamp) of the current index archive to be used to
     * handle search queries
     *
     * @return string the timestamp of the archive
     */
    function getCurrentIndexDatabaseName()
    {
        $db = $this->db;
        $sql = "SELECT CRAWL_TIME FROM CURRENT_WEB_INDEX";
        $result = $db->execute($sql);

        $row =  $db->fetchArray($result);

        return $row['CRAWL_TIME'];
    }
    /**
     * Sets the IndexArchive that will be used for search results
     *
     * @param $timestamp  the timestamp of the index archive. The timestamp is
     *      when the crawl was started. Currently, the timestamp appears as
     *      substring of the index archives directory name
     */
    function setCurrentIndexDatabaseName($timestamp)
    {
        $db = $this->db;
        $db->execute("DELETE FROM CURRENT_WEB_INDEX");
        $sql = "INSERT INTO CURRENT_WEB_INDEX VALUES ( ? )";
        $db->execute($sql, array($timestamp));
    }
    /**
     * Returns all the files in $dir or its subdirectories with modfied times
     * more recent than timestamp. The file which have
     * in their path or name a string in the $excludes array will be exclude
     *
     * @param string $dir a directory to traverse
     * @param int $timestamp used to check modified times against
     * @param array $excludes an array of path substrings tot exclude
     * @return array of file structs consisting of name, modified time and
     *     size.
     */
    function getDeltaFileInfo($dir, $timestamp, $excludes)
    {
        $dir_path_len = strlen($dir) + 1;
        $files = $this->db->fileInfoRecursive($dir, true);
        $names = array();
        $results = array();
        foreach ($files as $file) {
            $file["name"] = substr($file["name"], $dir_path_len);
            if ($file["modified"] > $timestamp && $file["name"] !="") {
                $flag = true;
                foreach ($excludes as $exclude) {
                    if (stristr($file["name"], $exclude)) {
                        $flag = false;
                        break;
                    }
                }
                if ($flag) {
                    $results[$file["name"]] = $file;
                }
            }
        }
        $results = array_values($results);
        return $results;
    }
    /**
     * Gets a list of all mixes of available crawls
     *
     * @param int $user_id user that we are getting a list of mixes for
     * @param bool $with_components if false then don't load the factors
     *     that make up the crawl mix, just load the name of the mixes
     *     and their timestamps; otherwise, if true loads everything
     * @return array list of available crawls
     */
    function getMixList($user_id, $with_components = false)
    {
        $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE OWNER_ID=?";
        $result = $this->db->execute($sql, array($user_id));
        $rows = array();
        while($row = $this->db->fetchArray($result)) {
            if ($with_components) {
                $mix = $this->getCrawlMix($row['TIMESTAMP'], true);
                $row['FRAGMENTS'] = $mix['FRAGMENTS'];
            }
            $rows[] = $row;
        }
        return $rows;
    }
    /**
     * Retrieves the weighting component of the requested crawl mix
     *
     * @param string $timestamp of the requested crawl mix
     * @param bool $just_components says whether to find the mix name or
     *     just the components array.
     * @return array the crawls and their weights that make up the
     *     requested crawl mix.
     */
    function getCrawlMix($timestamp, $just_components = false)
    {
        $db = $this->db;
        if (!$just_components) {
            $sql = "SELECT TIMESTAMP, NAME, OWNER_ID, PARENT FROM CRAWL_MIXES ".
                "WHERE TIMESTAMP = ?";
            $result = $db->execute($sql, array($timestamp));
            $mix =  $db->fetchArray($result);
        } else {
            $mix = array();
        }
        $sql = "SELECT FRAGMENT_ID, RESULT_BOUND".
            " FROM MIX_FRAGMENTS WHERE ".
            " TIMESTAMP = ?";
        $result = $db->execute($sql, array($timestamp));
        $mix['FRAGMENTS'] = array();
        while($row = $db->fetchArray($result)) {
            $mix['FRAGMENTS'][$row['FRAGMENT_ID']]['RESULT_BOUND'] =
                $row['RESULT_BOUND'];
        }
        $sql = "SELECT CRAWL_TIMESTAMP, WEIGHT, KEYWORDS ".
            " FROM MIX_COMPONENTS WHERE ".
            " TIMESTAMP=:timestamp AND FRAGMENT_ID=:fragment_id";
        $params = array(":timestamp" => $timestamp);
        foreach ($mix['FRAGMENTS'] as $fragment_id => $data) {
            $params[":fragment_id"] = $fragment_id;
            $result = $db->execute($sql, $params);
            $mix['COMPONENTS'] = array();
            $count = 0;
            while($row =  $db->fetchArray($result)) {
                $mix['FRAGMENTS'][$fragment_id]['COMPONENTS'][$count] =$row;
                $count++;
            }
        }
        return $mix;
    }
    /**
     * Returns the timestamp associated with a mix name;
     *
     * @param string $mix_name name to lookup
     * @return mixed timestamp associated with name if exists false otherwise
     */
    function getCrawlMixTimestamp($mix_name)
    {
        $db = $this->db;
        $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE ".
            " NAME= ?";
        $result = $db->execute($sql, array($mix_name));
        $mix =  $db->fetchArray($result);
        if (isset($mix["TIMESTAMP"])) {
            return $mix["TIMESTAMP"];
        }
        return false;
    }
    /**
     * Returns whether the supplied timestamp corresponds to a crawl mix
     *
     * @param string $timestamp of the requested crawl mix
     *
     * @return bool true if it does; false otherwise
     */
    function isCrawlMix($timestamp)
    {
        $db = $this->db;
        $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE ".
            " TIMESTAMP = ?";
        $result = $db->execute($sql, array($timestamp));
        if ($result) {
            if ($mix = $db->fetchArray($result)) {
                return true;
            } else {
                return false;
            }
        }
    }
    /**
     * Returns whether there is a mix with the given $timestamp that $user_id
     * owns
     *
     * @param string $timestamp to see if exists
     * @param string $user_id id of would be owner
     *
     * @return bool true if owner; false otherwise
     */
    function isMixOwner($timestamp, $user_id)
    {
        $db = $this->db;
        $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE ".
            " TIMESTAMP = ? and OWNER_ID = ?";
        $result = $db->execute($sql, array($timestamp, $user_id));
        if ($result) {
            if ($mix = $db->fetchArray($result)) {
                return true;
            } else {
                return false;
            }
        }
    }
    /**
     * Stores in DB the supplied crawl mix object
     *
     * @param array $mix an associative array representing the crawl mix object
     */
    function setCrawlMix($mix)
    {
        $db = $this->db;
        //although maybe slower, we first get rid of any old data
        $timestamp = $mix['TIMESTAMP'];
        $this->deleteCrawlMix($timestamp);
        //next we store the new data
        $sql = "INSERT INTO CRAWL_MIXES VALUES (?, ?, ?, ?)";
        $db->execute($sql, array($timestamp, $mix['NAME'], $mix['OWNER_ID'],
            $mix['PARENT']));
        $fid = 0;
        foreach ($mix['FRAGMENTS'] as $fragment_id => $fragment_data) {
            $sql = "INSERT INTO MIX_FRAGMENTS VALUES (?, ?, ?)";
            $db->execute($sql, array($timestamp, $fid,
                $fragment_data['RESULT_BOUND']));
            foreach ($fragment_data['COMPONENTS'] as $component) {
                $sql = "INSERT INTO MIX_COMPONENTS VALUES (?, ?, ?, ?, ?)";
                $db->execute($sql, array($timestamp, $fid,
                    $component['CRAWL_TIMESTAMP'], $component['WEIGHT'],
                    $component['KEYWORDS']));
            }
            $fid++;
        }
    }
    /**
     * Deletes from the DB the crawl mix ans its associated components and
     * fragments
     *
     * @param int $timestamp of the mix to delete
     */
    function deleteCrawlMix($timestamp)
    {
        $sql = "DELETE FROM CRAWL_MIXES WHERE TIMESTAMP=?";
        $this->db->execute($sql, array($timestamp));
        $sql = "DELETE FROM MIX_FRAGMENTS WHERE TIMESTAMP=?";
        $this->db->execute($sql, array($timestamp));
        $sql = "DELETE FROM MIX_COMPONENTS WHERE TIMESTAMP=?";
        $this->db->execute($sql, array($timestamp));
    }
    /**
     * Deletes the archive iterator and savepoint files created during the
     * process of iterating through a crawl mix.
     *
     * @param int $timestamp The timestamp of the crawl mix
     */
    function deleteCrawlMixIteratorState($timestamp)
    {
        global $INDEXING_PLUGINS;
        setLocaleObject(getLocaleTag());
        $searchController = new SearchController($INDEXING_PLUGINS);
        $searchController->clearQuerySavepoint($timestamp);

        $archive_dir = WORK_DIRECTORY."/schedules/".
            self::name_archive_iterator.$timestamp;
        if (file_exists($archive_dir)) {
            $this->db->unlinkRecursive($archive_dir);
        }
    }
    /**
     * Returns the initial sites that a new crawl will start with along with
     * crawl parameters such as crawl order, allowed and disallowed crawl sites
     * @param bool $use_default whether or not to use the Yioop! default
     *     crawl.ini file rather than the one created by the user.
     * @return array  the first sites to crawl during the next crawl
     *     restrict_by_url, allowed, disallowed_sites
     */
    function getSeedInfo($use_default = false)
    {
        if (file_exists(WORK_DIRECTORY."/crawl.ini") && !$use_default) {
            $info = parse_ini_with_fallback(WORK_DIRECTORY."/crawl.ini");
        } else {
            $info = parse_ini_with_fallback(
                BASE_DIR."/configs/default_crawl.ini");
        }

        return $info;
    }
    /**
     * Writes a crawl.ini file with the provided data to the user's
     * WORK_DIRECTORY
     *
     * @param array $info an array containing information about the crawl
     */
    function setSeedInfo($info)
    {
        if (!isset($info['general']['crawl_index'])) {
            $info['general']['crawl_index']='12345678';
        }
        if (!isset($info["general"]["arc_dir"])) {
            $info["general"]["arc_dir"] = "";
        }
        if (!isset($info["general"]["arc_type"])) {
            $info["general"]["arc_type"] = "";
        }
        if (!isset($info["general"]["cache_pages"])) {
            $info["general"]["cache_pages"] = true;
        }
        if (!isset($info["general"]["summarizer_option"])) {
            $info["general"]["summarizer_option"] = "";
            }
        $n = array();
        $n[] = <<<EOT
; ***** BEGIN LICENSE BLOCK *****
;  SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
;  Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
;
;  This program is free software: you can redistribute it and/or modify
;  it under the terms of the GNU General Public License as published by
;  the Free Software Foundation, either version 3 of the License, or
;  (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU General Public License
;  along with this program.  If not, see <http://www.gnu.org/licenses/>.
;  ***** END LICENSE BLOCK *****
;
; crawl.ini
;
; crawl configuration file
;
EOT;
        if (!isset($info['general']['page_range_request'])) {
            $info['general']['page_range_request'] = PAGE_RANGE_REQUEST;
        }
        if (!isset($info['general']['page_recrawl_frequency'])) {
            $info['general']['page_recrawl_frequency'] = PAGE_RECRAWL_FREQUENCY;
        }
        if (!isset($info['general']['max_description_len'])) {
            $info['general']['max_description_len'] = MAX_DESCRIPTION_LEN;
        }
        $n[] = '[general]';
        $n[] = "crawl_order = '".$info['general']['crawl_order']."';";
        $n[] = "summarizer_option = '".
            $info['general']['summarizer_option']."';";
        $n[] = "crawl_type = '".$info['general']['crawl_type']."';";
        $n[] = "crawl_index = '".$info['general']['crawl_index']."';";
        $n[] = 'arc_dir = "'.$info["general"]["arc_dir"].'";';
        $n[] = 'arc_type = "'.$info["general"]["arc_type"].'";';
        $n[] = "page_recrawl_frequency = '".
            $info['general']['page_recrawl_frequency']."';";
        $n[] = "page_range_request = '".
            $info['general']['page_range_request']."';";
        $n[] = "max_description_len = '".
            $info['general']['max_description_len']."';";
        $bool_string =
            ($info['general']['cache_pages']) ? "true" : "false";
        $n[] = "cache_pages = $bool_string;";
        $bool_string =
            ($info['general']['restrict_sites_by_url']) ? "true" : "false";
        $n[] = "restrict_sites_by_url = $bool_string;";
        $n[] = "";

        $n[] = "[indexed_file_types]";
        if (isset($info["indexed_file_types"]['extensions'])) {
            foreach ($info["indexed_file_types"]['extensions'] as $extension) {
                $n[] = "extensions[] = '$extension';";
            }
        }
        $n[] = "";

        $n[] = "[active_classifiers]";
        if (isset($info['active_classifiers']['label'])) {
            foreach ($info['active_classifiers']['label'] as $label) {
                $n[] = "label[] = '$label';";
            }
        }
        $n[] = "";

        $n[] = "[active_rankers]";
        if (isset($info['active_rankers']['label'])) {
            foreach ($info['active_rankers']['label'] as $label) {
                $n[] = "label[] = '$label';";
            }
        }
        $n[] = "";

        $site_types =
            array('allowed_sites' => 'url', 'disallowed_sites' => 'url',
                'seed_sites' => 'url', 'page_rules'=>'rule');
        foreach ($site_types as $type => $field) {
            $n[] = "[$type]";
            if (isset($info[$type][$field])) {
                foreach ($info[$type][$field] as $field_value) {
                    $n[] = $field . "[] = '$field_value';";
                }
            }
            $n[]="";
        }
        $n[] = "[indexing_plugins]";
        if (isset($info["indexing_plugins"]['plugins'])) {
            foreach ($info["indexing_plugins"]['plugins'] as $plugin) {
                if ($plugin == "") { continue; }
                $n[] = "plugins[] = '$plugin';";
            }
        }
        $out = implode("\n", $n);
        $out .= "\n";
        file_put_contents(WORK_DIRECTORY."/crawl.ini", $out);
    }
    /**
     * Returns the crawl parameters that were used during a given crawl
     *
     * @param string $timestamp timestamp of the crawl to load the crawl
     *     parameters of
     * @return array  the first sites to crawl during the next crawl
     *     restrict_by_url, allowed, disallowed_sites
     * @param array $machine_urls an array of urls of yioop queue servers
     *
     */
    function getCrawlSeedInfo($timestamp,  $machine_urls = null)
    {
        if ($machine_urls != null &&
            !$this->isSingleLocalhost($machine_urls, $timestamp)) {
            /* seed info should be same amongst all queue_servers that have it--
               only start schedule differs -- however, not all queue_servers
               necessarily have the same crawls. Thus, we still query all
               machines in case only one has it.
             */
            $a_list = $this->execMachines("getCrawlSeedInfo",
                $machine_urls, serialize($timestamp));
            if (is_array($a_list)) {
                foreach ($a_list as $elt) {
                    $seed_info = unserialize(webdecode(
                        $elt[self::PAGE]));
                    if (isset($seed_info['general'])) {
                        break;
                    }
                }
            }
            return $seed_info;
        }
        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
        $seed_info = null;
        if (file_exists($dir)) {
            $info = IndexArchiveBundle::getArchiveInfo($dir);
            if (!isset($info['DESCRIPTION']) ||
                $info['DESCRIPTION'] == null ||
                strstr($info['DESCRIPTION'], "Archive created") ) {
                return $seed_info;
            }
            $index_info = unserialize($info['DESCRIPTION']);
            $general_params = array("restrict_sites_by_url" =>
                array(self::RESTRICT_SITES_BY_URL, false),
                "crawl_type" => array(self::CRAWL_TYPE, self::WEB_CRAWL),
                "crawl_index" => array(self::CRAWL_INDEX, ''),
                "crawl_order" => array(self::CRAWL_ORDER,
                    self::PAGE_IMPORTANCE),
                "summarizer_option" => array(self::SUMMARIZER_OPTION,
                    self::BASIC_SUMMARIZER),
                "arc_dir" => array(self::ARC_DIR, ''),
                "arc_type" => array(self::ARC_TYPE, ''),
                "cache_pages" => array(self::CACHE_PAGES, true),
                "page_recrawl_frequency" => array(self::PAGE_RECRAWL_FREQUENCY,
                    -1),
                "page_range_request" => array(self::PAGE_RANGE_REQUEST,
                    PAGE_RANGE_REQUEST),
                "max_description_len" => array(self::MAX_DESCRIPTION_LEN,
                    MAX_DESCRIPTION_LEN),
            );
            foreach ($general_params as $param => $info) {
                $seed_info['general'][$param] = (isset($index_info[$info[0]])) ?
                    $index_info[$info[0]] : $info[1];
            }

            $site_types = array(
                "allowed_sites" => array(self::ALLOWED_SITES, "url"),
                "disallowed_sites" => array(self::DISALLOWED_SITES, "url"),
                "seed_sites" => array(self::TO_CRAWL, "url"),
                "page_rules" => array(self::PAGE_RULES, "rule"),
                "indexed_file_types" => array(self::INDEXED_FILE_TYPES,
                    "extensions"),
            );
            foreach ($site_types as $type => $info) {
                if (isset($index_info[$info[0]])) {
                    $tmp = & $index_info[$info[0]];
                } else {
                    $tmp = array();
                }
                $seed_info[$type][$info[1]] =  $tmp;
            }
            if (isset($index_info[self::INDEXING_PLUGINS])) {
                $seed_info['indexing_plugins']['plugins'] =
                    $index_info[self::INDEXING_PLUGINS];
            }
            if (isset($index_info[self::INDEXING_PLUGINS_DATA])) {
                $seed_info['indexing_plugins']['plugins_data'] =
                    $index_info[self::INDEXING_PLUGINS_DATA];
            }
        }
        return $seed_info;
    }
    /**
     * Changes the crawl parameters of an existing crawl (can be while crawling)
     * Not all fields are allowed to be updated
     *
     * @param string $timestamp timestamp of the crawl to change
     * @param array $new_info the new parameters
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    function setCrawlSeedInfo($timestamp, $new_info,  $machine_urls = null)
    {
        if ($machine_urls != null &&
            !$this->isSingleLocalhost($machine_urls, $timestamp)) {
            $params = array($timestamp, $new_info);
            $this->execMachines("setCrawlSeedInfo",
                $machine_urls, serialize($params));
        }
        $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
        if (file_exists($dir)) {
            $info = IndexArchiveBundle::getArchiveInfo($dir);
            $index_info = unserialize($info['DESCRIPTION']);
            if (isset($new_info['general']["restrict_sites_by_url"])) {
                $index_info[self::RESTRICT_SITES_BY_URL] =
                    $new_info['general']["restrict_sites_by_url"];
            }
            $updatable_site_info = array(
                "allowed_sites" => array(self::ALLOWED_SITES,'url'),
                "disallowed_sites" => array(self::DISALLOWED_SITES, 'url'),
                "seed_sites" => array(self::TO_CRAWL, "url"),
                "page_rules" => array(self::PAGE_RULES, 'rule'),
                "indexed_file_types" => array(self::INDEXED_FILE_TYPES,
                    "extensions"),
                "active_classifiers" => array(self::ACTIVE_CLASSIFIERS,
                    'label'),
                "active_rankers" => array(self::ACTIVE_RANKERS,
                    'label'),
            );
            foreach ($updatable_site_info as $type => $type_info) {
                if (isset($new_info[$type][$type_info[1]])) {
                    $index_info[$type_info[0]] =
                        $new_info[$type][$type_info[1]];
                }
            }
            if (isset($new_info['indexing_plugins']['plugins']) ) {
                $index_info[self::INDEXING_PLUGINS] =
                    $new_info['indexing_plugins']['plugins'];
            }
            $info['DESCRIPTION'] = serialize($index_info);
            IndexArchiveBundle::setArchiveInfo($dir, $info);
        }
    }
    /**
     * Returns an array of urls which were stored via the suggest-a-url
     * form in suggest_view.php
     *
     * @return array urls that have been suggested
     */
    function getSuggestSites()
    {
        $suggest_file = $this->suggest_url_file;
        if (file_exists($suggest_file)) {
            $urls = file($suggest_file);
        } else {
            $urls = array();
        }
        return $urls;
    }
    /**
     * Add new distinct urls to those already saved in the suggest_url_file
     * If the supplied url is not new or the file size
     * exceeds MAX_SUGGEST_URL_FILE_SIZE then it is not added.
     *
     * @param string $url to add
     * @return string true if the url was added or already existed
     *     in the file; false otherwise
     */
    function appendSuggestSites($url)
    {
        $suggest_file = $this->suggest_url_file;
        $suggest_size = strlen($url);
        if (file_exists($suggest_file)) {
            $suggest_size += filesize($suggest_file);
        } else {
            $this->clearSuggestSites();
        }
        if ($suggest_size < MAX_SUGGEST_URL_FILE_SIZE) {
            $urls = file($suggest_file);
            $urls[] = $url;
            $urls = array_unique($urls);
            $out_string = "";
            $delim = "";
            foreach ($urls as $url) {
                $trim_url = trim($url);
                if (strlen($trim_url) > 0) {
                    $out_string .= $delim.$trim_url;
                    $delim = "\n";
                }
            }
            file_put_contents($suggest_file, $out_string, LOCK_EX);
            return true;
        }
        return false;
    }
    /**
     * Resets the suggest_url_file to be the empty file
     */
    function clearSuggestSites()
    {
        file_put_contents($this->suggest_url_file, "", LOCK_EX);
    }
    /**
     * Get a description associated with a Web Crawl or Crawl Mix
     *
     * @param int $timestamp of crawl or mix in question
     * @param array $machine_urls an array of urls of yioop queue servers
     *
     * @return array associative array containing item DESCRIPTION
     */
    function getInfoTimestamp($timestamp, $machine_urls = null)
    {
        $is_mix = $this->isCrawlMix($timestamp);
        $info = array();
        if ($is_mix) {
            $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE ".
                " TIMESTAMP=?";
            $result = $this->db->execute($sql, array($timestamp));
            $mix =  $this->db->fetchArray($result);
            $info['TIMESTAMP'] = $timestamp;
            $info['DESCRIPTION'] = $mix['NAME'];
            $info['IS_MIX'] = true;
        } else {
            if ($machine_urls != null &&
                !$this->isSingleLocalhost($machine_urls, $timestamp)) {
                $cache_file = CRAWL_DIR."/cache/".self::network_base_name.
                    $timestamp.".txt";
                if (file_exists($cache_file)) {
                    $old_info = unserialize(file_get_contents($cache_file));
                }
                if (isset($old_info) && filemtime($cache_file)
                    + 300 > time()) {
                    return $old_info;
                }
                $info = array();
                if (isset($old_info["MACHINE_URLS"])) {
                    $info["MACHINE_URLS"] = $old_info["MACHINE_URLS"];
                } else {
                    $info["MACHINE_URLS"] = $machine_urls;
                }
                $info_lists = $this->execMachines("getInfoTimestamp",
                    $info["MACHINE_URLS"], serialize($timestamp));
                $info['DESCRIPTION'] = "";
                $info["COUNT"] = 0;
                $info['VISITED_URLS_COUNT'] = 0;
                restore_error_handler();
                foreach ($info_lists as $info_list) {
                    $a_info = @unserialize(webdecode(
                        $info_list[self::PAGE]));
                    if (isset($a_info['DESCRIPTION'])) {
                        $info['DESCRIPTION'] = $a_info['DESCRIPTION'];
                    }
                    if (isset($a_info['VISITED_URLS_COUNT'])) {
                        $info['VISITED_URLS_COUNT'] +=
                            $a_info['VISITED_URLS_COUNT'];
                    }
                    if (isset($a_info['COUNT'])) {
                        $info['COUNT'] +=
                            $a_info['COUNT'];
                    }
                }
                set_error_handler("yioop_error_handler");
                file_put_contents($cache_file, serialize($info));
                return $info;
            }
            $dir = CRAWL_DIR.'/cache/'.self::index_data_base_name.$timestamp;
            if (file_exists($dir)) {
                $info = IndexArchiveBundle::getArchiveInfo($dir);
                if (isset($info['DESCRIPTION'])) {
                    $tmp = unserialize($info['DESCRIPTION']);
                    $info['DESCRIPTION'] = isset($tmp['DESCRIPTION']) ?
                        $tmp['DESCRIPTION'] : "";
                }
            }
        }
        return $info;
    }
    /**
     * Deletes the crawl with the supplied timestamp if it exists. Also
     * deletes any crawl mixes making use of this crawl
     *
     * @param string $timestamp a Unix timestamp
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    function deleteCrawl($timestamp, $machine_urls = null)
    {
        if ($machine_urls != null &&
            !$this->isSingleLocalhost($machine_urls, $timestamp)) {
            //get rid of cache info on Name machine
            $mask = CRAWL_DIR."/cache/".self::network_crawllist_base_name.
                "*.txt";
            array_map( "unlink", glob( $mask ) );
            $delete_files = array(
                CRAWL_DIR."/cache/".self::network_base_name.
                    "$timestamp.txt",
                CRAWL_DIR."/cache/".self::statistics_base_name.
                    "$timestamp.txt"
            );
            foreach ($delete_files as $delete_file) {
                if (file_exists($delete_file)) {
                    unlink($delete_file);
                }
            }
            if (!in_array(NAME_SERVER, $machine_urls)) {
                array_unshift($machine_urls, NAME_SERVER);
            }
            //now get rid of files on queue_servers
            $this->execMachines("deleteCrawl",
                $machine_urls, serialize($timestamp));
            return;
        }
        $delete_dirs = array(
            CRAWL_DIR.'/cache/'.self::index_data_base_name . $timestamp,
            CRAWL_DIR.'/schedules/'.self::index_data_base_name . $timestamp,
            CRAWL_DIR.'/schedules/' . self::schedule_data_base_name.$timestamp,
            CRAWL_DIR.'/schedules/'.self::robot_data_base_name . $timestamp,
            CRAWL_DIR.'/schedules/'.self::name_archive_iterator . $timestamp,
        );
        foreach ($delete_dirs as $delete_dir) {
            if (file_exists($delete_dir)) {
                $this->db->unlinkRecursive($delete_dir, true);
            }
        }
        $save_point_files = glob(CRAWL_DIR.'/schedules/'.self::save_point.
            $timestamp."*.txt");
        foreach ($save_point_files as $save_point_file) {
            @unlink($save_point_file);
        }

        $sql = "SELECT DISTINCT TIMESTAMP FROM MIX_COMPONENTS WHERE ".
            " CRAWL_TIMESTAMP='$timestamp'";
        $result = $this->db->execute($sql);
        $rows = array();
        while($rows[] =  $this->db->fetchArray($result)) ;

        foreach ($rows as $row) {
            $this->deleteCrawlMix($row['TIMESTAMP']);
        }
        $current_timestamp = $this->getCurrentIndexDatabaseName();
        if ($current_timestamp == $timestamp) {
            $this->db->execute("DELETE FROM CURRENT_WEB_INDEX");
        }
    }
    /**
     * Used to send a message to the queue_servers to start a crawl
     *
     * @param array $crawl_params has info like the time of the crawl,
     *      whether starting a new crawl or resuming an old one, etc.
     * @param array $seed_info what urls to crawl, etc as from the crawl.ini
     *      file
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    function sendStartCrawlMessage($crawl_params, $seed_info = null,
        $machine_urls = null)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $params = array($crawl_params, $seed_info);
            $crawl_time = $crawl_params[self::CRAWL_TIME];
            file_put_contents(CRAWL_DIR."/schedules/network_status.txt",
                serialize($crawl_time));
            $this->execMachines("sendStartCrawlMessage",
                $machine_urls, serialize($params));
            return;
        }
        $info_string = serialize($crawl_params);
        file_put_contents(
            CRAWL_DIR."/schedules/queue_server_messages.txt",
            $info_string);
        chmod(CRAWL_DIR."/schedules/queue_server_messages.txt",
            0777);
        if ($seed_info != null) {
            $scheduler_info[self::HASH_SEEN_URLS] = array();
            foreach ($seed_info['seed_sites']['url'] as $site) {
                if ($site[0] == "#") { continue; } //ignore comments in file
                $site_parts = preg_split("/\s+/", $site);
                if (strlen($site_parts[0]) > 0) {
                    $scheduler_info[self::TO_CRAWL][] =
                        array($site_parts[0], 1.0);
                }
            }
            $scheduler_string = "\n".webencode(
                gzcompress(serialize($scheduler_info)));
            file_put_contents(
                CRAWL_DIR."/schedules/".self::schedule_start_name,
                $scheduler_string);
        }
    }
    /**
     * Used to send a message to the queue_servers to stop a crawl
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    function sendStopCrawlMessage($machine_urls = null)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            @unlink(CRAWL_DIR."/schedules/network_status.txt");
            $this->execMachines("sendStopCrawlMessage", $machine_urls);
            return;
        }

        $info = array();
        $info[self::STATUS] = "STOP_CRAWL";
        $info_string = serialize($info);
        file_put_contents(
            CRAWL_DIR."/schedules/queue_server_messages.txt",
            $info_string);
    }
    /**
     * Gets a list of all index archives of crawls that have been conducted
     *
     * @param bool $return_arc_bundles whether index bundles used for indexing
     *     arc or other archive bundles should be included in the lsit
     * @param bool $return_recrawls whether index archive bundles generated as
     *     a result of recrawling should be included in the result
     * @param array $machine_urls an array of urls of yioop queue servers
     * @param bool $cache whether to try to get/set the data to a cache file
     *
     * @return array available IndexArchiveBundle directories and
     *     their meta information this meta information includes the time of
     *     the crawl, its description, the number of pages downloaded, and the
     *     number of partitions used in storing the inverted index
     */
    function getCrawlList($return_arc_bundles = false, $return_recrawls = false,
        $machine_urls = null, $cache = false)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $arg = ($return_arc_bundles && $return_recrawls) ? 3 :
                (($return_recrawls) ? 2 : (($return_arc_bundles) ? 1 : 0));
            $cache_file = CRAWL_DIR."/cache/".self::network_crawllist_base_name.
                "$arg.txt";
            if ($cache && file_exists($cache_file) && filemtime($cache_file)
                + 300 > time() ) {
                return unserialize(file_get_contents($cache_file));
            }
            $list_strings = $this->execMachines("getCrawlList",
                $machine_urls, $arg);
            $list = $this->aggregateCrawlList($list_strings);
            if ($cache) {
                file_put_contents($cache_file, serialize($list));
            }
            return $list;
        }
        $list = array();
        $dirs = glob(CRAWL_DIR.'/cache/'.self::index_data_base_name.
            '*', GLOB_ONLYDIR);
        foreach ($dirs as $dir) {
            $crawl = array();
            $pre_timestamp = strstr($dir, self::index_data_base_name);
            $crawl['CRAWL_TIME'] =
                substr($pre_timestamp, strlen(self::index_data_base_name));
            $info = IndexArchiveBundle::getArchiveInfo($dir);
            if (isset($info['DESCRIPTION'])) {
                restore_error_handler();
                $index_info = @unserialize($info['DESCRIPTION']);
                set_error_handler("yioop_error_handler");
            } else {
                $index_info = array();
                $index_info['DESCRIPTION'] = "ERROR!! $dir<br />" .
                    print_r($info, true);
            }
            $crawl['DESCRIPTION'] = "";
            if (!$return_recrawls &&
                isset($index_info[self::CRAWL_TYPE]) &&
                $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
                continue;
            } else if ($return_recrawls  &&
                isset($index_info[self::CRAWL_TYPE]) &&
                $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
                $crawl['DESCRIPTION'] = "RECRAWL::";
            }
            $sched_path = CRAWL_DIR.'/schedules/'.
                self::schedule_data_base_name.$crawl['CRAWL_TIME'];
            $crawl['RESUMABLE'] = false;
            if (is_dir($sched_path)) {
            $sched_dir = opendir($sched_path);
                while( ($name = readdir($sched_dir)) !==  false) {
                    $sub_path = "$sched_path/$name";
                    if (!is_dir($sub_path) || $name == '.' ||
                        $name == '..') {continue; }
                    $sub_dir = opendir($sub_path);
                    $i = 0;
                    while(($sub_name = readdir($sub_dir)) !==  false && $i < 5){
                        if ($sub_name[0] == 'A' && $sub_name[1] == 't') {
                            $crawl['RESUMABLE'] = true;
                            break 2;
                        }
                    }
                    closedir($sub_dir);
                }
                closedir($sched_dir);
            }
            if (isset($index_info['DESCRIPTION'])) {
                $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
            }
            $crawl['VISITED_URLS_COUNT'] =
                isset($info['VISITED_URLS_COUNT']) ?
                $info['VISITED_URLS_COUNT'] : 0;
            $crawl['COUNT'] = (isset($info['COUNT'])) ? $info['COUNT'] :0;
            $crawl['NUM_DOCS_PER_PARTITION'] =
                (isset($info['NUM_DOCS_PER_PARTITION'])) ?
                $info['NUM_DOCS_PER_PARTITION'] : 0;
            $crawl['WRITE_PARTITION'] =
                (isset($info['WRITE_PARTITION'])) ?
                $info['WRITE_PARTITION'] : 0;
            $list[] = $crawl;
        }
        if ($return_arc_bundles) {
            $dirs = glob(CRAWL_DIR.'/archives/*', GLOB_ONLYDIR);
            foreach ($dirs as $dir) {
                $crawl = array();
                $crawl['CRAWL_TIME'] = crc32($dir);
                $crawl['DESCRIPTION'] = "ARCFILE::";
                $crawl['ARC_DIR'] = $dir;
                $ini_file = "$dir/arc_description.ini";
                if (!file_exists($ini_file)) {
                    continue;
                } else {
                    $ini = parse_ini_with_fallback($ini_file);
                    $crawl['ARC_TYPE'] = $ini['arc_type'];
                    $crawl['DESCRIPTION'] .= $ini['description'];
                }
                $crawl['VISITED_URLS_COUNT'] = 0;
                $crawl['COUNT'] = 0;
                $crawl['NUM_DOCS_PER_PARTITION'] = 0;
                $crawl['WRITE_PARTITION'] = 0;
                $list[] = $crawl;
            }
        }
        return $list;
    }
    /**
     * When @see getCrawlList() is used in a multi-queue_server this method
     * used to integrate the crawl lists received by the different machines
     *
     * @param array $list_strings serialized crawl list data from different
     * queue_servers
     * @param string $data_field field of $list_strings to use for data
     * @return array list of crawls and their meta data
     */
    function aggregateCrawlList($list_strings, $data_field = null)
    {
        restore_error_handler();
        $pre_list = array();
        foreach ($list_strings as $list_string) {
            $a_list = @unserialize(webdecode(
                $list_string[self::PAGE]));
            if ($data_field != null) {
                $a_list = $a_list[$data_field];
            }
            if (is_array($a_list)) {
                foreach ($a_list as $elt) {
                    $timestamp = $elt['CRAWL_TIME'];
                    if (!isset($pre_list[$timestamp])) {
                        $pre_list[$timestamp] = $elt;
                    } else {
                        if (isset($elt["DESCRIPTION"]) &&
                            $elt["DESCRIPTION"] != "") {
                            $pre_list[$timestamp]["DESCRIPTION"] =
                                $elt["DESCRIPTION"];
                        }
                        $pre_list[$timestamp]["VISITED_URLS_COUNT"] +=
                            $elt["VISITED_URLS_COUNT"];
                        $pre_list[$timestamp]["COUNT"] +=
                            $elt["COUNT"];
                        $pre_list[$timestamp]['RESUMABLE'] |= $elt['RESUMABLE'];
                    }
                }
            }
        }
        $list = array_values($pre_list);
        set_error_handler("yioop_error_handler");
        return $list;
    }
    /**
     * Determines if the length of time since any of the fetchers has spoken
     * with any of the queue_servers has exceeded CRAWL_TIME_OUT. If so,
     * typically the caller of this method would do something such as officially
     * stop the crawl.
     *
     * @param array $list_strings serialized crawl list data from different
     *  queue_servers
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return bool whether the current crawl is stalled or not
     */
    function crawlStalled($machine_urls = null)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $outputs = $this->execMachines("crawlStalled", $machine_urls);
            return $this->aggregateStalled($outputs);
        }

        if (file_exists(CRAWL_DIR."/schedules/crawl_status.txt")) {
            /* assume if status not updated for CRAWL_TIME_OUT
               crawl not active (do check for both scheduler and indexer) */
            if (filemtime(
                CRAWL_DIR."/schedules/crawl_status.txt") +
                    CRAWL_TIME_OUT < time() ) {
                return true;
            }
            $schedule_status_exists =
                file_exists(CRAWL_DIR."/schedules/schedule_status.txt");
            if ($schedule_status_exists &&
                filemtime(CRAWL_DIR."/schedules/schedule_status.txt") +
                    CRAWL_TIME_OUT < time() ) {
                return true;
            }
        }
        return false;
    }
    /**
     * When @see crawlStalled() is used in a multi-queue_server this method
     * used to integrate the stalled information received by the different
     * machines
     *
     * @param array $stall_statuses contains web encoded serialized data one
     * one field of which has the boolean data concerning stalled statis
     *
     * @param string $data_field field of $stall_statuses to use for data
     *     if null then each element of $stall_statuses is a wen encoded
     *     serialized boolean
     * @return bool true if no queue_server has heard from one
     *     fetcher within the time out period
     */
    function aggregateStalled($stall_statuses, $data_field = null)
    {
        restore_error_handler();
        $result = true;
        foreach ($stall_statuses as $status) {
            $stall_status = @unserialize(webdecode($status[self::PAGE]));
            if ($data_field != null) {
                $stall_status = $stall_status[$data_field];
            } else {
                /* this case would mean some kind of error occurred, but
                   don't stop crawl for it */
                $result = false;
                break;
            }
            if ($stall_status === false) {
                $result = false;
                break;
            }
        }
        set_error_handler("yioop_error_handler");
        return $result;
    }
    /**
     * Returns data about current crawl such as DESCRIPTION, TIMESTAMP,
     * peak memory of various processes, most recent fetcher, most recent
     * urls, urls seen, urls visited, etc.
     *
     * @param array $machine_urls an array of urls of yioop queue servers
     *     on which the crawl is being conducted
     * @return array associative array of the said data
     */
    function crawlStatus($machine_urls = null)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $status_strings = $this->execMachines("crawlStatus", $machine_urls);
            return $this->aggregateStatuses($status_strings);
        }

        $data = array();
        $crawl_status_exists =
            file_exists(CRAWL_DIR."/schedules/crawl_status.txt");
        if ($crawl_status_exists) {
            $crawl_status =
                @unserialize(file_get_contents(
                    CRAWL_DIR."/schedules/crawl_status.txt"));
        }
        $schedule_status_exists =
            file_exists(CRAWL_DIR."/schedules/schedule_status.txt");
        if ($schedule_status_exists) {
            $schedule_status =
                @unserialize(file_get_contents(
                    CRAWL_DIR."/schedules/schedule_status.txt"));
            if (isset($schedule_status[self::TYPE]) &&
                $schedule_status[self::TYPE] == self::SCHEDULER) {
                $data['SCHEDULER_PEAK_MEMORY'] =
                    isset($schedule_status[self::MEMORY_USAGE]) ?
                    $schedule_status[self::MEMORY_USAGE] : 0;
            }
        }

        $data = (isset($crawl_status) && is_array($crawl_status)) ?
            array_merge($data, $crawl_status) : $data;

        if (isset($data['VISITED_COUNT_HISTORY']) &&
            count($data['VISITED_COUNT_HISTORY']) > 1) {
            $recent = array_shift($data['VISITED_COUNT_HISTORY']);
            $data["MOST_RECENT_TIMESTAMP"] = $recent[0];
            $oldest = array_pop($data['VISITED_COUNT_HISTORY']);
            unset($data['VISITED_COUNT_HISTORY']);
            $change_in_time_hours = floatval(time() - $oldest[0]) /
                floatval(ONE_HOUR);
            $change_in_urls = $recent[1] - $oldest[1];
            $data['VISITED_URLS_COUNT_PER_HOUR'] = ($change_in_time_hours > 0) ?
                $change_in_urls/$change_in_time_hours : 0;
        } else {
            $data['VISITED_URLS_COUNT_PER_HOUR'] = 0;
        }

        return $data;
    }
    /**
     * When @see crawlStatus() is used in a multi-queue_server this method
     * used to integrate the status information received by the different
     * machines
     *
     * @param array $status_strings
     * @param string $data_field field of $status_strings to use for data
     * @return array associative array of DESCRIPTION, TIMESTAMP,
     * peak memory of various processes, most recent fetcher, most recent
     * urls, urls seen, urls visited, etc.
     */
    function aggregateStatuses($status_strings, $data_field = null)
    {
        $status['WEBAPP_PEAK_MEMORY'] = 0;
        $status['FETCHER_PEAK_MEMORY'] = 0;
        $status['QUEUE_PEAK_MEMORY'] = 0;
        $status["SCHEDULER_PEAK_MEMORY"] = 0;
        $status["COUNT"] = 0;
        $status["VISITED_URLS_COUNT"] = 0;
        $status["VISITED_URLS_COUNT_PER_HOUR"] = 0;
        $status["MOST_RECENT_TIMESTAMP"] = 0;
        $status["DESCRIPTION"] = "";
        $status['MOST_RECENT_FETCHER'] = "";
        $status['MOST_RECENT_URLS_SEEN'] = array();
        $status['CRAWL_TIME'] = 0;
        restore_error_handler();
        foreach ($status_strings as $status_string) {
            $a_status = @unserialize(webdecode(
                    $status_string[self::PAGE]));
            if ($data_field != null) {
                $a_status = $a_status[$data_field];
            }
            $count_fields = array("COUNT", "VISITED_URLS_COUNT_PER_HOUR",
                "VISITED_URLS_COUNT");
            foreach ($count_fields as $field) {
                if (isset($a_status[$field])) {
                    $status[$field] += $a_status[$field];
                }
            }
            if (isset($a_status["CRAWL_TIME"]) && $a_status["CRAWL_TIME"] >=
                $status['CRAWL_TIME']) {
                $status['CRAWL_TIME'] = $a_status["CRAWL_TIME"];
                $text_fields = array("DESCRIPTION", "MOST_RECENT_FETCHER");
                foreach ($text_fields as $field) {
                    if (isset($a_status[$field])) {
                        if ($status[$field] == "" ||
                            in_array($status[$field], array("BEGIN_CRAWL",
                                "RESUME_CRAWL") )) {
                            $status[$field] = $a_status[$field];
                        }
                    }
                }
            }
            if (isset($a_status["MOST_RECENT_TIMESTAMP"]) &&
                $status["MOST_RECENT_TIMESTAMP"] <=
                    $a_status["MOST_RECENT_TIMESTAMP"]) {
                $status["MOST_RECENT_TIMESTAMP"] =
                    $a_status["MOST_RECENT_TIMESTAMP"];
                if (isset($a_status['MOST_RECENT_URLS_SEEN'])) {
                    $status['MOST_RECENT_URLS_SEEN'] =
                        $a_status['MOST_RECENT_URLS_SEEN'];
                }
            }
            $memory_fields = array("WEBAPP_PEAK_MEMORY",
                "FETCHER_PEAK_MEMORY", "QUEUE_PEAK_MEMORY",
                "SCHEDULER_PEAK_MEMORY");
            foreach ($memory_fields as $field) {
                $status[$field] = (!isset($a_status[$field])) ? 0 :
                        max($status[$field], $a_status[$field]);
            }
        }
        set_error_handler("yioop_error_handler");
        return $status;
    }
    /**
     * This method is used to reduce the number of network requests
     * needed by the crawlStatus method of admin_controller. It returns
     * an array containing the results of the @see crawlStalled
     * @see crawlStatus and @see getCrawlList methods
     *
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return array containing three components one for each of the three
     *     kinds of results listed above
     */
    function combinedCrawlInfo($machine_urls = null)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $combined_strings =
                $this->execMachines("combinedCrawlInfo", $machine_urls);
            $combined = array();
            $combined[] = $this->aggregateStalled($combined_strings,
                0);
            $combined[] = $this->aggregateStatuses($combined_strings,
                1);
            $combined[] = $this->aggregateCrawlList($combined_strings,
                2);
            return $combined;
        }
        $combined = array();
        $combined[] = $this->crawlStalled();
        $combined[] = $this->crawlStatus();
        $combined[] = $this->getCrawlList(false, true);
        return $combined;
    }
    /**
     * Add the provided urls to the schedule directory of URLs that will
     * be crawled
     *
     * @param string $timestamp Unix timestamp of crawl to add to schedule of
     * @param array $inject_urls urls to be added to the schedule of
     *     the active crawl
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    function injectUrlsCurrentCrawl($timestamp, $inject_urls,
        $machine_urls = null)
    {
        if ($machine_urls != null &&
            !$this->isSingleLocalhost($machine_urls, $timestamp)) {
            $this->execMachines("injectUrlsCurrentCrawl", $machine_urls,
                serialize(array($timestamp, $inject_urls)));
            return;
        }

        $dir = CRAWL_DIR."/schedules/".
            self::schedule_data_base_name. $timestamp;
        if (!file_exists($dir)) {
            mkdir($dir);
            chmod($dir, 0777);
        }
        $day = floor($timestamp/ONE_DAY) - 1;
            /* want before all other schedules,
               execute next */
        $dir .= "/$day";
        if (!file_exists($dir)) {
            mkdir($dir);
            chmod($dir, 0777);
        }
        $count = count($inject_urls);
        if ($count > 0 ) {
            $now = time();
            $schedule_data = array();
            $schedule_data[self::SCHEDULE_TIME] =
                $timestamp;
            $schedule_data[self::TO_CRAWL] = array();
            for ($i = 0; $i < $count; $i++) {
                $url = $inject_urls[$i];
                $hash = crawlHash($now.$url);
                $schedule_data[self::TO_CRAWL][] =
                    array($url, 1, $hash);
            }
            $data_string = webencode(
                gzcompress(serialize($schedule_data)));
            $data_hash = crawlHash($data_string);
            file_put_contents($dir."/At1From127-0-0-1".
                "WithHash$data_hash.txt", $data_string);
            return true;
        }
        return false;
    }
    /**
     * Computes for each word in an array of words a count of the total number
     * of times it occurs in this crawl model's default index.
     *
     * @param array $words words to find the counts for
     * @param array $machine_urls machines to invoke this command on
     * @return array associative array of word => counts
     */
     function countWords($words, $machine_urls = null)
     {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $count_strings = $this->execMachines("countWords", $machine_urls,
                serialize(array($words, $this->index_name)));
            $word_counts = array();
            foreach ($count_strings as $count_string) {
                $a_word_counts = unserialize(webdecode(
                        $count_string[self::PAGE]));
                if (is_array($a_word_counts)) {
                    foreach ($a_word_counts as $word => $count) {
                        $word_counts[$word] = (isset($word_counts[$word])) ?
                            $word_counts[$word] + $count : $count;
                    }
                }
            }
            return $word_counts;
        }
        $index_archive = IndexManager::getIndex($this->index_name);
        $hashes = array();
        $lookup = array();
        foreach ($words as $word) {
            $tmp = crawlHash($word);
            $hashes[] = $tmp;
            $lookup[$tmp] = $word;
        }
        $word_key_counts =
            $index_archive->countWordKeys($hashes);
        $phrases = array();
        $word_counts = array();
        if (is_array($word_key_counts ) && count($word_key_counts ) > 0) {
            foreach ($word_key_counts as $word_key =>$count) {
                $word_counts[$lookup[$word_key]] = $count;
            }
        }
        return $word_counts;
     }

}

ViewGit