Last commit for src/models/CrawlModel.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2019
 * @filesource
 */
namespace seekquarry\yioop\models;

use seekquarry\yioop\controllers\SearchController;
use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\CrawlDaemon;
use seekquarry\yioop\library\FetchUrl;
use seekquarry\yioop\library\IndexArchiveBundle;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\UrlParser;

/** used to prevent cache page requests from being logged*/
if (!C\nsdefined("POST_PROCESSING") && !empty($_SERVER["NO_LOGGING"])) {
    $_SERVER["NO_LOGGING"] = true;
}
/**
 * This is class is used to handle getting/setting crawl parameters, CRUD
 * operations on current crawls, start, stop, status of crawls,
 * getting cache files out of crawls, determining
 * what is the default index to be used, marshalling/unmarshalling crawl mixes,
 * and handling data from suggest-a-url forms
 *
 * @author Chris Pollett
 */
class CrawlModel extends ParallelModel
{
    /**
     * Used to map between search crawl mix form variables and database columns
     * @var array
     */
    public $search_table_column_map = ["name"=>"NAME", "owner_id"=>"OWNER_ID"];
    /**
     * File to be used to store suggest-a-url form data
     * @var string
     */
    public $suggest_url_file;
    /**
     * {@inheritDoc}
     *
     * @param string $db_name the name of the database for the search engine
     * @param bool $connect whether to connect to the database by default
     *     after making the datasource class
     */
    public function __construct($db_name = C\DB_NAME, $connect = true)
    {
        $this->suggest_url_file = C\WORK_DIRECTORY."/data/suggest_url.txt";
        parent::__construct($db_name, $connect);
    }
    /**
     * {@inheritDoc}
     *
     * @param mixed $args any additional arguments which should be used to
     *     determine these tables (in this case none)
     * @return string a comma separated list of tables suitable for a SQL
     *     query
     */
    public function fromCallback($args = null)
    {
        return "CRAWL_MIXES";
    }
    /**
     * {@inheritDoc}
     *
     * @param array $row row as retrieved from database query
     * @param mixed $args additional arguments that might be used by this
     *      callback. In this case, should be a boolean flag that says whether
     *      or not to add information about the components of the crawl mix
     * @return array $row after callback manipulation
     */
    public function rowCallback($row, $args)
    {
        if ($args) {
            $mix = $this->getCrawlMix($row['TIMESTAMP'], true);
            $row['FRAGMENTS'] = $mix['FRAGMENTS'];
        }
        return $row;
    }

    /**
     * Gets the cached version of a web page from the machine on which it was
     * fetched.
     *
     * Complete cached versions of web pages typically only live on a fetcher
     * machine. The queue server machine typically only maintains summaries.
     * This method makes a REST request of a fetcher machine for a cached page
     * and get the results back.
     *
     * @param string $machine the ip address of domain name of the machine the
     *     cached page lives on
     * @param string $machine_uri the path from document root on $machine where
     *     the yioop scripts live
     * @param int $partition the partition in the WebArchiveBundle the page is
     *      in
     * @param int $offset the offset in bytes into the WebArchive partition in
     *     the WebArchiveBundle at which the cached page lives.
     * @param string $crawl_time the timestamp of the crawl the cache page is
     *     from
     * @param int $instance_num which fetcher instance for the particular
     *     fetcher crawled the page (if more than one), false otherwise
     * @return array page data of the cached page
     */
    public function getCacheFile($machine, $machine_uri, $partition,
        $offset, $crawl_time, $instance_num = false)
    {
        $time = time();
        $session = md5($time . C\AUTH_KEY);
        if ($machine == '::1') { //IPv6 :(
            $machine = "[::1]";
            //used if the fetching and queue serving were on the same machine
        }
        // we assume all machines use the same scheme & port of the name server
        $port = UrlParser::getPort(C\NAME_SERVER);
        $scheme = UrlParser::getScheme(C\NAME_SERVER);
        $request = "$scheme://$machine:$port$machine_uri?c=archive&a=cache&".
            "time=$time&session=$session&partition=$partition&offset=$offset".
            "&crawl_time=$crawl_time";
        if ($instance_num !== false) {
            $request .= "&instance_num=$instance_num";
        }
        $tmp = FetchUrl::getPage($request);
        $page = @unserialize(base64_decode($tmp));
        $page['REQUEST'] = $request;

        return $page;
    }
    /**
     * Gets the name (aka timestamp) of the current index archive to be used to
     * handle search queries
     *
     * @return string the timestamp of the archive
     */
    public function getCurrentIndexDatabaseName()
    {
        $db = $this->db;
        $sql = "SELECT CRAWL_TIME FROM CURRENT_WEB_INDEX";
        $result = $db->execute($sql);
        $row =  $db->fetchArray($result);
        return $row['CRAWL_TIME'];
    }
    /**
     * Sets the IndexArchive that will be used for search results
     *
     * @param $timestamp  the timestamp of the index archive. The timestamp is
     *      when the crawl was started. Currently, the timestamp appears as
     *      substring of the index archives directory name
     */
    public function setCurrentIndexDatabaseName($timestamp)
    {
        $db = $this->db;
        $db->execute("DELETE FROM CURRENT_WEB_INDEX");
        $sql = "INSERT INTO CURRENT_WEB_INDEX VALUES ( ? )";
        $db->execute($sql, [$timestamp]);
    }
    /**
     * Returns all the files in $dir or its subdirectories with modified times
     * more recent than timestamp. The file which have
     * in their path or name a string in the $excludes array will be exclude
     *
     * @param string $dir a directory to traverse
     * @param int $timestamp used to check modified times against
     * @param array $excludes an array of path substrings tot exclude
     * @return array of file structs consisting of name, modified time and
     *     size.
     */
    public function getDeltaFileInfo($dir, $timestamp, $excludes)
    {
        $dir_path_len = strlen($dir) + 1;
        $files = $this->db->fileInfoRecursive($dir, true);
        $names = [];
        $results = [];
        foreach ($files as $file) {
            $file["name"] = substr($file["name"], $dir_path_len);
            if ($file["modified"] > $timestamp && $file["name"] !="") {
                $flag = true;
                foreach ($excludes as $exclude) {
                    if (stristr($file["name"], $exclude)) {
                        $flag = false;
                        break;
                    }
                }
                if ($flag) {
                    $results[$file["name"]] = $file;
                }
            }
        }
        $results = array_values($results);
        return $results;
    }
    /**
     * Gets a list of all mixes of available crawls
     *
     * @param int $user_id user that we are getting a list of mixes for
     * @param bool $with_components if false then don't load the factors
     *     that make up the crawl mix, just load the name of the mixes
     *     and their timestamps; otherwise, if true loads everything
     * @return array list of available crawls
     */
    public function getMixList($user_id, $with_components = false)
    {
        $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE OWNER_ID = ?";
        if (intval($user_id) != $user_id) {
            return false; //keep postgres error log cleaner by doing check
        }
        $result = $this->db->execute($sql, [$user_id]);
        $rows = [];
        while ($row = $this->db->fetchArray($result)) {
            if ($with_components) {
                $mix = $this->getCrawlMix($row['TIMESTAMP'], true);
                $row['FRAGMENTS'] = $mix['FRAGMENTS'];
            }
            $rows[] = $row;
        }
        return $rows;
    }
    /**
     * Retrieves the weighting component of the requested crawl mix
     *
     * @param string $timestamp of the requested crawl mix
     * @param bool $just_components says whether to find the mix name or
     *     just the components array.
     * @return array the crawls and their weights that make up the
     *     requested crawl mix.
     */
    public function getCrawlMix($timestamp, $just_components = false)
    {
        $db = $this->db;
        if (!$just_components) {
            $sql = "SELECT TIMESTAMP, NAME, OWNER_ID, PARENT FROM CRAWL_MIXES ".
                "WHERE TIMESTAMP = ?";
            $result = $db->execute($sql, [$timestamp]);
            $mix =  $db->fetchArray($result);
        } else {
            $mix = [];
        }
        $sql = "SELECT FRAGMENT_ID, RESULT_BOUND".
            " FROM MIX_FRAGMENTS WHERE ".
            " TIMESTAMP = ?";
        $result = $db->execute($sql, [$timestamp]);
        $mix['FRAGMENTS'] = [];
        while ($row = $db->fetchArray($result)) {
            $mix['FRAGMENTS'][$row['FRAGMENT_ID']]['RESULT_BOUND'] =
                $row['RESULT_BOUND'];
        }
        $sql = "SELECT CRAWL_TIMESTAMP, WEIGHT, KEYWORDS ".
            " FROM MIX_COMPONENTS WHERE ".
            " TIMESTAMP=:timestamp AND FRAGMENT_ID=:fragment_id";
        $params = [":timestamp" => $timestamp];
        foreach ($mix['FRAGMENTS'] as $fragment_id => $data) {
            $params[":fragment_id"] = $fragment_id;
            $result = $db->execute($sql, $params);
            $mix['COMPONENTS'] = [];
            $count = 0;
            if ($result) {
                while ($row =  $db->fetchArray($result)) {
                    $mix['FRAGMENTS'][$fragment_id]['COMPONENTS'][$count] =$row;
                    $count++;
                }
            } else {
                break;
            }
        }
        return $mix;
    }
    /**
     * Returns the timestamp associated with a mix name;
     *
     * @param string $mix_name name to lookup
     * @return mixed timestamp associated with name if exists false otherwise
     */
    public function getCrawlMixTimestamp($mix_name)
    {
        $db = $this->db;
        $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE ".
            " NAME= ?";
        $result = $db->execute($sql, [$mix_name]);
        $mix =  $db->fetchArray($result);
        if (isset($mix["TIMESTAMP"])) {
            return $mix["TIMESTAMP"];
        }
        return false;
    }
    /**
     * Returns whether the supplied timestamp corresponds to a crawl mix
     *
     * @param string $timestamp of the requested crawl mix
     *
     * @return bool true if it does; false otherwise
     */
    public function isCrawlMix($timestamp)
    {
        $db = $this->db;
        $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE ".
            " TIMESTAMP = ?";
        $result = $db->execute($sql, [$timestamp]);
        if ($result) {
            if ($mix = $db->fetchArray($result)) {
                return true;
            } else {
                return false;
            }
        }
    }
    /**
     * Returns whether there is a mix with the given $timestamp that $user_id
     * owns
     *
     * @param string $timestamp to see if exists
     * @param string $user_id id of would be owner
     *
     * @return bool true if owner; false otherwise
     */
    public function isMixOwner($timestamp, $user_id)
    {
        $db = $this->db;
        $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE ".
            " TIMESTAMP = ? and OWNER_ID = ?";
        if (intval($user_id) != $user_id) {
            return false;
        }
        $result = $db->execute($sql, [$timestamp, $user_id]);
        if ($result) {
            if ($mix = $db->fetchArray($result)) {
                return true;
            } else {
                return false;
            }
        }
    }
    /**
     * Stores in DB the supplied crawl mix object
     *
     * @param array $mix an associative array representing the crawl mix object
     */
    public function setCrawlMix($mix)
    {
        $db = $this->db;
        //although maybe slower, we first get rid of any old data
        $timestamp = $mix['TIMESTAMP'];
        $this->deleteCrawlMix($timestamp);
        //next we store the new data
        $sql = "INSERT INTO CRAWL_MIXES VALUES (?, ?, ?, ?)";
        $db->execute($sql, [$timestamp, $mix['NAME'], $mix['OWNER_ID'],
            $mix['PARENT']]);
        $fid = 0;
        foreach ($mix['FRAGMENTS'] as $fragment_id => $fragment_data) {
            $sql = "INSERT INTO MIX_FRAGMENTS VALUES (?, ?, ?)";
            $db->execute($sql, [$timestamp, $fid,
                $fragment_data['RESULT_BOUND']]);
            foreach ($fragment_data['COMPONENTS'] as $component) {
                $sql = "INSERT INTO MIX_COMPONENTS VALUES (?, ?, ?, ?, ?)";
                $db->execute($sql, [$timestamp, $fid,
                    $component['CRAWL_TIMESTAMP'], $component['WEIGHT'],
                    $component['KEYWORDS']]);
            }
            $fid++;
        }
    }
    /**
     * Deletes from the DB the crawl mix ans its associated components and
     * fragments
     *
     * @param int $timestamp of the mix to delete
     */
    public function deleteCrawlMix($timestamp)
    {
        $sql = "DELETE FROM CRAWL_MIXES WHERE TIMESTAMP=?";
        $this->db->execute($sql, [$timestamp]);
        $sql = "DELETE FROM MIX_FRAGMENTS WHERE TIMESTAMP=?";
        $this->db->execute($sql, [$timestamp]);
        $sql = "DELETE FROM MIX_COMPONENTS WHERE TIMESTAMP=?";
        $this->db->execute($sql, [$timestamp]);
    }
    /**
     * Deletes the archive iterator and savepoint files created during the
     * process of iterating through a crawl mix.
     *
     * @param int $timestamp The timestamp of the crawl mix
     */
    public function deleteCrawlMixIteratorState($timestamp)
    {
        L\setLocaleObject(L\getLocaleTag());
        $search_controller = new SearchController();
        $search_controller->clearQuerySavepoint($timestamp);

        $archive_dir = C\WORK_DIRECTORY."/schedules/".
            self::name_archive_iterator.$timestamp;
        if (file_exists($archive_dir)) {
            $this->db->unlinkRecursive($archive_dir);
        }
    }
    /**
     * Returns the initial sites that a new crawl will start with along with
     * crawl parameters such as crawl order, allowed and disallowed crawl sites
     * @param bool $use_default whether or not to use the Yioop! default
     *     crawl.ini file rather than the one created by the user.
     * @return array  the first sites to crawl during the next crawl
     *     restrict_by_url, allowed, disallowed_sites
     */
    public function getSeedInfo($use_default = false)
    {
        if (file_exists(C\WORK_DIRECTORY."/crawl.ini") && !$use_default) {
            $info = L\parse_ini_with_fallback(C\WORK_DIRECTORY . "/crawl.ini");
        } else {
            $info = L\parse_ini_with_fallback(
                C\BASE_DIR . "/configs/default_crawl.ini");
        }
        return $info;
    }
    /**
     * Writes a crawl.ini file with the provided data to the user's
     * WORK_DIRECTORY
     *
     * @param array $info an array containing information about the crawl
     */
    public function setSeedInfo($info)
    {
        if (!isset($info['general']['crawl_index'])) {
            $info['general']['crawl_index']='12345678';
        }
        if (!isset($info['general']['channel'])) {
            $info['general']['channel']='0';
        }
        if (!isset($info["general"]["arc_dir"])) {
            $info["general"]["arc_dir"] = "";
        }
        if (!isset($info["general"]["arc_type"])) {
            $info["general"]["arc_type"] = "";
        }
        if (!isset($info["general"]["cache_pages"])) {
            $info["general"]["cache_pages"] = true;
        }
        if (!isset($info["general"]["summarizer_option"])) {
            $info["general"]["summarizer_option"] = "";
        }
        $n = [];
        $n[] = <<<EOT
; ***** BEGIN LICENSE BLOCK *****
;  SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
;  Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
;
;  This program is free software: you can redistribute it and/or modify
;  it under the terms of the GNU General Public License as published by
;  the Free Software Foundation, either version 3 of the License, or
;  (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU General Public License
;  along with this program.  If not, see <https://www.gnu.org/licenses/>.
;  ***** END LICENSE BLOCK *****
;
; crawl.ini
;
; Crawl configuration file
;
EOT;
        if (!isset($info['general']['page_range_request'])) {
            $info['general']['page_range_request'] = C\PAGE_RANGE_REQUEST;
        }
        if (!isset($info['general']['max_depth'])) {
            $info['general']['max_depth'] = -1;
        }
        if (!isset($info['general']['repeat_type'])) {
            $info['general']['repeat_type'] = -1;
        }
        if (!isset($info['general']['page_recrawl_frequency'])) {
            $info['general']['page_recrawl_frequency'] =
                C\PAGE_RECRAWL_FREQUENCY;
        }
        if (!isset($info['general']['max_description_len'])) {
            $info['general']['max_description_len'] = C\MAX_DESCRIPTION_LEN;
        }
        $n[] = '[general]';
        $n[] = "crawl_order = '" . $info['general']['crawl_order'] . "';";
        $n[] = "summarizer_option = '" .
            $info['general']['summarizer_option'] . "';";
        $n[] = "crawl_type = '" . $info['general']['crawl_type'] . "';";
        $n[] = "max_depth = '" . $info['general']['max_depth'] . "';";
        $n[] = "repeat_type = '" . $info['general']['repeat_type'] . "';";
        $n[] = "crawl_index = '" . $info['general']['crawl_index'] . "';";
        $n[] = "channel = '" . $info['general']['channel'] . "';";
        $n[] = 'arc_dir = "' . $info["general"]["arc_dir"] . '";';
        $n[] = 'arc_type = "' . $info["general"]["arc_type"] . '";';
        $n[] = "page_recrawl_frequency = '".
            $info['general']['page_recrawl_frequency']."';";
        $n[] = "page_range_request = '".
            $info['general']['page_range_request']."';";
        $n[] = "max_description_len = '".
            $info['general']['max_description_len']."';";
        $bool_string =
            ($info['general']['cache_pages']) ? "true" : "false";
        $n[] = "cache_pages = $bool_string;";
        $bool_string =
            ($info['general']['restrict_sites_by_url']) ? "true" : "false";
        $n[] = "restrict_sites_by_url = $bool_string;";
        $n[] = "";

        $n[] = "[indexed_file_types]";
        if (isset($info["indexed_file_types"]['extensions'])) {
            foreach ($info["indexed_file_types"]['extensions'] as $extension) {
                $n[] = "extensions[] = '$extension';";
            }
        }
        $n[] = "";

        $n[] = "[active_classifiers]";
        if (isset($info['active_classifiers']['label'])) {
            foreach ($info['active_classifiers']['label'] as $label) {
                $n[] = "label[] = '$label';";
            }
        }
        $n[] = "";

        $n[] = "[active_rankers]";
        if (isset($info['active_rankers']['label'])) {
            foreach ($info['active_rankers']['label'] as $label) {
                $n[] = "label[] = '$label';";
            }
        }
        $n[] = "";

        $site_types =
            ['allowed_sites' => 'url', 'disallowed_sites' => 'url',
                'seed_sites' => 'url', 'page_rules'=>'rule'];
        foreach ($site_types as $type => $field) {
            $n[] = "[$type]";
            if (isset($info[$type][$field])) {
                foreach ($info[$type][$field] as $field_value) {
                    $n[] = $field . "[] = '$field_value';";
                }
            }
            $n[]="";
        }
        $n[] = "[indexing_plugins]";
        if (isset($info["indexing_plugins"]['plugins'])) {
            foreach ($info["indexing_plugins"]['plugins'] as $plugin) {
                if ($plugin == "") {
                    continue;
                }
                $n[] = "plugins[] = '$plugin';";
            }
        }
        $out = implode("\n", $n);
        $out .= "\n";
        file_put_contents(C\WORK_DIRECTORY."/crawl.ini", $out);
    }
    /**
     * Returns the crawl parameters that were used during a given crawl
     *
     * @param string $timestamp timestamp of the crawl to load the crawl
     *     parameters of
     * @return array  the first sites to crawl during the next crawl
     *     restrict_by_url, allowed, disallowed_sites
     * @param array $machine_urls an array of urls of yioop queue servers
     *
     */
    public function getCrawlSeedInfo($timestamp,  $machine_urls = null)
    {
        if ($machine_urls != null &&
            !$this->isSingleLocalhost($machine_urls, $timestamp)) {
            /* seed info should be same amongst all queue servers that have it--
               only start schedule differs -- however, not all queue servers
               necessarily have the same crawls. Thus, we still query all
               machines in case only one has it.
             */
            $a_list = $this->execMachines("getCrawlSeedInfo",
                $machine_urls, serialize($timestamp));
            if (is_array($a_list)) {
                foreach ($a_list as $elt) {
                    $seed_info = unserialize(L\webdecode(
                        $elt[self::PAGE]));
                    if (isset($seed_info['general'])) {
                        break;
                    }
                }
            }
            return $seed_info;
        }
        $dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name .
            $timestamp;
        $is_double_index = file_exists($dir) ? false : true;
        if ($is_double_index) {
            $dir = C\CRAWL_DIR . '/cache/' . self::double_index_base_name .
                $timestamp;
        }
        $seed_info = null;
        $index_bundle_class = C\NS_LIB . ($is_double_index  ?
            "DoubleIndexBundle" : "IndexArchiveBundle");
        if (file_exists($dir) || file_exists($re_dir)) {
            $info = $index_bundle_class::getArchiveInfo($dir);
            if (!isset($info['DESCRIPTION']) ||
                $info['DESCRIPTION'] == null ||
                strstr($info['DESCRIPTION'], "Archive created")) {
                return $seed_info;
            }
            $index_info = unserialize($info['DESCRIPTION']);
            $general_params = ["restrict_sites_by_url" =>
                [self::RESTRICT_SITES_BY_URL, false],
                "crawl_type" => [self::CRAWL_TYPE, self::WEB_CRAWL],
                "channel" => [self::CHANNEL, 0],
                "crawl_index" => [self::CRAWL_INDEX, ''],
                "crawl_order" => [self::CRAWL_ORDER, self::PAGE_IMPORTANCE],
                "max_depth" => [self::MAX_DEPTH, -1],
                "repeat_type" => [self::REPEAT_TYPE, -1],
                "summarizer_option" => [self::SUMMARIZER_OPTION,
                    self::BASIC_SUMMARIZER],
                "arc_dir" => [self::ARC_DIR, ''],
                "arc_type" => [self::ARC_TYPE, ''],
                "cache_pages" => [self::CACHE_PAGES, true],
                "page_recrawl_frequency" => [self::PAGE_RECRAWL_FREQUENCY, -1],
                "page_range_request" => [self::PAGE_RANGE_REQUEST,
                    C\PAGE_RANGE_REQUEST],
                "max_description_len" => [self::MAX_DESCRIPTION_LEN,
                    C\MAX_DESCRIPTION_LEN],
            ];
            foreach ($general_params as $param => $info) {
                $seed_info['general'][$param] = (isset($index_info[$info[0]])) ?
                    $index_info[$info[0]] : $info[1];
            }
            $site_types = [
                "allowed_sites" => [self::ALLOWED_SITES, "url"],
                "disallowed_sites" => [self::DISALLOWED_SITES, "url"],
                "seed_sites" => [self::TO_CRAWL, "url"],
                "page_rules" => [self::PAGE_RULES, "rule"],
                "indexed_file_types" => [self::INDEXED_FILE_TYPES,
                    "extensions"],
            ];
            foreach ($site_types as $type => $info) {
                if (isset($index_info[$info[0]])) {
                    $tmp = & $index_info[$info[0]];
                } else {
                    $tmp = [];
                }
                $seed_info[$type][$info[1]] =  $tmp;
            }
            if (isset($index_info[self::INDEXING_PLUGINS])) {
                $seed_info['indexing_plugins']['plugins'] =
                    $index_info[self::INDEXING_PLUGINS];
            }
            if (isset($index_info[self::INDEXING_PLUGINS_DATA])) {
                $seed_info['indexing_plugins']['plugins_data'] =
                    $index_info[self::INDEXING_PLUGINS_DATA];
            }
        }
        return $seed_info;
    }
    /**
     * Changes the crawl parameters of an existing crawl (can be while crawling)
     * Not all fields are allowed to be updated
     *
     * @param string $timestamp timestamp of the crawl to change
     * @param array $new_info the new parameters
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    public function setCrawlSeedInfo($timestamp, $new_info,
        $machine_urls = null)
    {
        if ($machine_urls != null &&
            !$this->isSingleLocalhost($machine_urls, $timestamp)) {
            $params = [$timestamp, $new_info];
            $this->execMachines("setCrawlSeedInfo",
                $machine_urls, serialize($params));
        }
        $pre_dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name .
            $timestamp;
        $dir = (file_exists($pre_dir)) ? $pre_dir : C\CRAWL_DIR . '/cache/' .
            self::double_index_base_name . $timestamp;
        $class_name = ($dir == $pre_dir) ? C\NS_LIB . "IndexArchiveBundle" :
            C\NS_LIB . "DoubleIndexBundle";
        if (file_exists($dir)) {
            $info = $class_name::getArchiveInfo($dir);
            $index_info = unserialize($info['DESCRIPTION']);
            if (isset($new_info['general']["restrict_sites_by_url"])) {
                $index_info[self::RESTRICT_SITES_BY_URL] =
                    $new_info['general']["restrict_sites_by_url"];
            }
            if (isset($new_info['general']["restrict_sites_by_url"])) {
                $index_info[self::REPEAT_TYPE] =
                    $new_info['general']["repeat_type"];
            }
            $updatable_site_info = [
                "allowed_sites" => [self::ALLOWED_SITES, 'url'],
                "disallowed_sites" => [self::DISALLOWED_SITES, 'url'],
                "seed_sites" => [self::TO_CRAWL, "url"],
                "page_rules" => [self::PAGE_RULES, 'rule'],
                "indexed_file_types" => [self::INDEXED_FILE_TYPES,
                    "extensions"],
                "active_classifiers" => [self::ACTIVE_CLASSIFIERS, 'label'],
                "active_rankers" => [self::ACTIVE_RANKERS, 'label'],
            ];
            foreach ($updatable_site_info as $type => $type_info) {
                if (isset($new_info[$type][$type_info[1]])) {
                    $index_info[$type_info[0]] =
                        $new_info[$type][$type_info[1]];
                }
            }
            if (isset($new_info['indexing_plugins']['plugins'])) {
                $index_info[self::INDEXING_PLUGINS] =
                    $new_info['indexing_plugins']['plugins'];
            }
            $info['DESCRIPTION'] = serialize($index_info);
            $class_name::setArchiveInfo($dir, $info);
        }
    }
    /**
     *
     */
    public function getChannel($timestamp)
    {
        $seed_info = $this->getCrawlSeedInfo($timestamp);
        $channel = (empty($seed_info['channel'])) ? 0 : $seed_info['channel'];
        return $channel;
    }
    /**
     * Returns an array of urls which were stored via the suggest-a-url
     * form in suggest_view.php
     *
     * @return array urls that have been suggested
     */
    public function getSuggestSites()
    {
        $suggest_file = $this->suggest_url_file;
        if (file_exists($suggest_file)) {
            $urls = file($suggest_file);
        } else {
            $urls = [];
        }
        return $urls;
    }
    /**
     * Add new distinct urls to those already saved in the suggest_url_file
     * If the supplied url is not new or the file size
     * exceeds MAX_SUGGEST_URL_FILE_SIZE then it is not added.
     *
     * @param string $url to add
     * @return string true if the url was added or already existed
     *     in the file; false otherwise
     */
    public function appendSuggestSites($url)
    {
        $suggest_file = $this->suggest_url_file;
        $suggest_size = strlen($url);
        if (file_exists($suggest_file)) {
            $suggest_size += filesize($suggest_file);
        } else {
            $this->clearSuggestSites();
        }
        if ($suggest_size < C\MAX_SUGGEST_URL_FILE_SIZE) {
            $urls = file($suggest_file);
            $urls[] = $url;
            $urls = array_unique($urls);
            $out_string = "";
            $delim = "";
            foreach ($urls as $url) {
                $trim_url = trim($url);
                if (strlen($trim_url) > 0) {
                    $out_string .= $delim . $trim_url;
                    $delim = "\n";
                }
            }
            file_put_contents($suggest_file, $out_string, LOCK_EX);
            return true;
        }
        return false;
    }
    /**
     * Resets the suggest_url_file to be the empty file
     */
    public function clearSuggestSites()
    {
        file_put_contents($this->suggest_url_file, "", LOCK_EX);
    }
    /**
     * Get a description associated with a Web Crawl or Crawl Mix
     *
     * @param int $timestamp of crawl or mix in question
     * @param array $machine_urls an array of urls of yioop queue servers
     *
     * @return array associative array containing item DESCRIPTION
     */
    public function getInfoTimestamp($timestamp, $machine_urls = null)
    {
        $is_mix = $this->isCrawlMix($timestamp);
        $info = [];
        if ($is_mix) {
            $sql = "SELECT TIMESTAMP, NAME FROM CRAWL_MIXES WHERE ".
                " TIMESTAMP=?";
            $result = $this->db->execute($sql, [$timestamp]);
            $mix =  $this->db->fetchArray($result);
            $info['TIMESTAMP'] = $timestamp;
            $info['DESCRIPTION'] = $mix['NAME'];
            $info['IS_MIX'] = true;
        } else {
            if ($machine_urls != null && is_array($machine_urls) &&
                !$this->isSingleLocalhost($machine_urls, $timestamp)) {
                $cache_file = C\CRAWL_DIR . "/cache/" . self::network_base_name.
                    $timestamp . ".txt";
                if (file_exists($cache_file)) {
                    $old_info = unserialize(file_get_contents($cache_file));
                }
                if (isset($old_info) && filemtime($cache_file)
                    + 300 > time()) {
                    return $old_info;
                }
                $info = [];
                if (isset($old_info["MACHINE_URLS"])) {
                    $info["MACHINE_URLS"] = $old_info["MACHINE_URLS"];
                    $machines_urls = array_intersect(
                        $old_info["MACHINE_URLS"], $machines_urls);
                } else {
                    $info["MACHINE_URLS"] = $machine_urls;
                }
                $info_lists = $this->execMachines("getInfoTimestamp",
                    $machines_urls, serialize($timestamp));
                $info['DESCRIPTION'] = "";
                $info["COUNT"] = 0;
                $info['VISITED_URLS_COUNT'] = 0;
                set_error_handler(null);
                foreach ($info_lists as $info_list) {
                    $a_info = @unserialize(L\webdecode(
                        $info_list[self::PAGE]));
                    if (isset($a_info['DESCRIPTION'])) {
                        $info['DESCRIPTION'] = $a_info['DESCRIPTION'];
                    }
                    if (isset($a_info['VISITED_URLS_COUNT'])) {
                        $info['VISITED_URLS_COUNT'] +=
                            $a_info['VISITED_URLS_COUNT'];
                    }
                    if (isset($a_info['COUNT'])) {
                        $info['COUNT'] += $a_info['COUNT'];
                    }
                }
                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
                file_put_contents($cache_file, serialize($info));
                return $info;
            }
            $pre_dir = C\CRAWL_DIR . '/cache/' . self::index_data_base_name .
                $timestamp;
            $dir = (file_exists($pre_dir)) ? $pre_dir : C\CRAWL_DIR .
                '/cache/' . self::double_index_base_name . $timestamp;
            $class_name = ($dir == $pre_dir) ? C\NS_LIB . "IndexArchiveBundle" :
                C\NS_LIB . "DoubleIndexBundle";
            if (file_exists($dir)) {
                $info = $class_name::getArchiveInfo($dir);
                if (isset($info['DESCRIPTION'])) {
                    $tmp = unserialize($info['DESCRIPTION']);
                    $info['DESCRIPTION'] = isset($tmp['DESCRIPTION']) ?
                        $tmp['DESCRIPTION'] : "";
                }
                $info['COUNT'] = empty($info['QUERY_COUNT']) ?
                    $info['COUNT'] : $info['QUERY_COUNT'];
                $info['VISITED_URLS_COUNT'] =
                    empty($info['QUERY_VISITED_URLS_COUNT']) ?
                    $info['VISITED_URLS_COUNT'] :
                    $info['QUERY_VISITED_URLS_COUNT'];
            }
        }
        return $info;
    }
    /**
     * Deletes the crawl with the supplied timestamp if it exists. Also
     * deletes any crawl mixes making use of this crawl
     *
     * @param string $timestamp a Unix timestamp
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    public function deleteCrawl($timestamp, $machine_urls = null)
    {
        if ($machine_urls != null &&
            !$this->isSingleLocalhost($machine_urls, $timestamp)) {
            //get rid of cache info on Name machine
            $mask = C\CRAWL_DIR . "/cache/" . self::network_crawllist_base_name.
                "*.txt";
            array_map("unlink", glob($mask));
            $delete_files = [
                C\CRAWL_DIR . "/cache/" . self::network_base_name .
                    "$timestamp.txt",
                C\CRAWL_DIR . "/cache/" . self::statistics_base_name .
                    "$timestamp.txt"
            ];
            foreach ($delete_files as $delete_file) {
                if (file_exists($delete_file)) {
                    unlink($delete_file);
                }
            }
            if (!in_array(C\NAME_SERVER, $machine_urls)) {
                array_unshift($machine_urls, C\NAME_SERVER);
            }
            //now get rid of files on queue servers
            $this->execMachines("deleteCrawl",
                $machine_urls, serialize($timestamp));
            return;
        }
        $delete_dirs = [
            C\CRAWL_DIR . '/cache/' . self::double_index_base_name . $timestamp,
            C\CRAWL_DIR . '/cache/' . self::index_data_base_name . $timestamp,
            C\CRAWL_DIR . '/schedules/' . self::index_data_base_name .
                $timestamp,
            C\CRAWL_DIR . '/schedules/' . self::schedule_data_base_name .
                $timestamp,
            C\CRAWL_DIR . '/schedules/' . self::robot_data_base_name .
                $timestamp,
            C\CRAWL_DIR . '/schedules/' . self::name_archive_iterator .
                $timestamp,
        ];
        foreach ($delete_dirs as $delete_dir) {
            if (file_exists($delete_dir)) {
                $this->db->unlinkRecursive($delete_dir, true);
            }
        }
        $save_point_files = glob(C\CRAWL_DIR.'/schedules/'.self::save_point.
            $timestamp . "*.txt");
        foreach ($save_point_files as $save_point_file) {
            @unlink($save_point_file);
        }
        $sql = "SELECT DISTINCT TIMESTAMP FROM MIX_COMPONENTS WHERE ".
            " CRAWL_TIMESTAMP='$timestamp'";
        $result = $this->db->execute($sql);
        $rows = [];
        while ($rows[] =  $this->db->fetchArray($result)) ;
        foreach ($rows as $row) {
            $this->deleteCrawlMix($row['TIMESTAMP']);
        }
        $current_timestamp = $this->getCurrentIndexDatabaseName();
        if ($current_timestamp == $timestamp) {
            $this->db->execute("DELETE FROM CURRENT_WEB_INDEX");
        }
    }
    /**
     * Used to send a message to the queue servers to start a crawl
     *
     * @param array $crawl_params has info like the time of the crawl,
     *      whether starting a new crawl or resuming an old one, etc.
     * @param array $seed_info what urls to crawl, etc as from the crawl.ini
     *      file
     * @param array $machine_urls an array of urls of yioop queue servers
     * @param int $channel if the value is greater than or equal to 0,
     *      the machine  has a queue server on it and will send message
     *      to queue_server with that channel.
     * @param int $num_fetchers number of fetchers on machine to start.
     *      This parameter and $channel are used to start the daemons
     *      running on the machines if they aren't already running
     */
    public function sendStartCrawlMessage($crawl_params, $seed_info = null,
        $machine_urls = null, $num_fetchers = 0)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $params = [$crawl_params, $seed_info];
            $crawl_time = $crawl_params[self::CRAWL_TIME];
            file_put_contents(C\CRAWL_DIR . "/schedules/" .
                "{$crawl_params[self::CHANNEL]}-network_status.txt",
                serialize($crawl_time));
            $this->execMachines("sendStartCrawlMessage",
                $machine_urls, serialize($params), 0, true);
            return true;
        }
        $channel = (empty($crawl_params[self::CHANNEL])) ? 0 :
            $crawl_params[self::CHANNEL];
        $statuses = CrawlDaemon::statuses();
        unset($statuses["index"], $statuses["MediaUpdater"]);
        if ($statuses == [] && $channel != -1) {
            if(!$this->startQueueServerFetchers($channel,
                $num_fetchers)) {
                return false;
            }
        }
        $info_string = serialize($crawl_params);
        file_put_contents(
            C\CRAWL_DIR . "/schedules/$channel-QueueServerMessages.txt",
            $info_string);
        chmod(C\CRAWL_DIR . "/schedules/$channel-QueueServerMessages.txt",
            0777);
        if ($seed_info != null) {
            $scheduler_info[self::HASH_SEEN_URLS] = [];
            $sites = $seed_info['seed_sites']['url'];
            $sites = array_filter($sites, function($site) {
                return $site[0] != "#"; //ignore comments in file
            });
            $num_sites = count($sites);
            // use high 3 bytes for integer weight, 1 byte to remember depth
            $common_start_weight = floor(((1 << 24) - 1) / $num_sites);
            foreach ($sites as $site) {
                $site_parts = preg_split("/\s+/", $site);
                if (strlen($site_parts[0]) > 0) {
                    if ($crawl_params[self::CRAWL_ORDER] ==
                        self::PAGE_IMPORTANCE) {
                        $scheduler_info[self::TO_CRAWL][] =
                            [$site_parts[0], $common_start_weight];
                    } else {
                        $scheduler_info[self::TO_CRAWL][] =
                            [$site_parts[0], 0];
                    }
                }
            }
            $scheduler_string = "\n" . L\webencode(
                gzcompress(serialize($scheduler_info)));
            file_put_contents(
                C\CRAWL_DIR."/schedules/$channel-" . self::schedule_start_name,
                $scheduler_string);
        }
        return true;
    }
    /**
     * Used to start QueueServers and Fetchers on current machine when
     * it is detected that someone tried to start a crawl but hadn't
     * started any queue servers or fetchers.
     *
     * @param int $channel channel of crawl to start
     * @param int $num_fetchers the number of fetchers on the current machine
     * @return bool whether any processes were started
     */
    function startQueueServerFetchers($channel, $num_fetchers)
    {
        $db = $this->db;
        $success = false;
        $machine_name = "NAME_SERVER";
        if (C\NAME_SERVER == C\BASE_URL && $channel == 0 &&
            $num_fetchers == 0) {
            $sql = "SELECT NAME, CHANNEL, NUM_FETCHERS FROM MACHINE" .
                " WHERE URL='" . C\BASE_URL . "' OR URL='BASE_URL'";
            $result = $db->execute($sql);
            if ($result) {
                $row = $db->fetchArray($result);
                $machine_name = $row['NAME'];
                $channel = intval($row['CHANNEL']);
                $num_fetchers = intval($row['NUM_FETCHERS']);
            }
        }
        $sql = "INSERT INTO ACTIVE_PROCESS VALUES (?, ?, ?)";
        /* This method was supposed to be called if nothing is running
           so shouldn't do anything if not starting any fetchers.
           Checking channel >= handles mirror mode. Even if multi channels
           on a given machine, use only one queue server.
         */
        if ($channel >= 0 && $num_fetchers > 0) {
            CrawlDaemon::start("QueueServer", 'none', self::INDEXER, 0);
            CrawlDaemon::start("QueueServer", 'none', self::SCHEDULER, -1);
            $db->execute($sql, [$machine_name, 0, "QueueServer"]);
            $success = true;
        }
        for ($i = 0; $i < $num_fetchers; $i++) {
            $db->execute($sql, [$machine_name, $i, "Fetcher"]);
            CrawlDaemon::start("Fetcher", "$i", "", -1);
            $success = true;
        }
        sleep(5);
        return $success;
    }
    /**
     * Used to send a message to the queue servers to stop a crawl
     * @param $channel of crawl to stop
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    public function sendStopCrawlMessage($channel, $machine_urls = null)
    {
         if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)){
            $net_stat_file = C\CRAWL_DIR . "/schedules/network_status.txt";
            if (file_exists($net_stat_file)) {
                @unlink($net_stat_file);
            }
            $this->execMachines("sendStopCrawlMessage", $machine_urls,
                $channel);
            return;
        }
        $info = [];
        $info[self::STATUS] = "STOP_CRAWL";
        $info_string = serialize($info);
        file_put_contents(C\CRAWL_DIR . "/schedules/$channel-" .
            "QueueServerMessages.txt", $info_string);
    }
    /**
     * Gets a list of all index archives of crawls that have been conducted
     *
     * @param bool $return_arc_bundles whether index bundles used for indexing
     *     arc or other archive bundles should be included in the lsit
     * @param bool $return_recrawls whether index archive bundles generated as
     *     a result of recrawling should be included in the result
     * @param array $machine_urls an array of urls of yioop queue servers
     * @param bool $cache whether to try to get/set the data to a cache file
     *
     * @return array available IndexArchiveBundle directories and
     *     their meta information this meta information includes the time of
     *     the crawl, its description, the number of pages downloaded, and the
     *     number of partitions used in storing the inverted index
     */
    public function getCrawlList($return_arc_bundles = false,
        $return_recrawls = false, $machine_urls = null, $cache = false)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $arg = ($return_arc_bundles && $return_recrawls) ? 3 :
                (($return_recrawls) ? 2 : (($return_arc_bundles) ? 1 : 0));
            $cache_file = C\CRAWL_DIR . "/cache/" .
                self::network_crawllist_base_name . "$arg.txt";
            if ($cache && file_exists($cache_file) && filemtime($cache_file)
                + 300 > time()) {
                return unserialize(file_get_contents($cache_file));
            }
            $list_strings = $this->execMachines("getCrawlList",
                $machine_urls, $arg);
            $list = $this->aggregateCrawlList($list_strings);
            if ($cache) {
                file_put_contents($cache_file, serialize($list));
            }
            return $list;
        }
        $list = [];
        $dirs = glob(C\CRAWL_DIR.'/cache/{' . self::index_data_base_name .
            ',' . self::double_index_base_name . '}*',
            GLOB_ONLYDIR | GLOB_BRACE);
        foreach ($dirs as $dir) {
            $crawl = [];
            preg_match('/(' .self::index_data_base_name .
                '|'. self::double_index_base_name .')(\d+)$/', $dir, $matches);
            $bundle_class_name = ($matches[1][0] == 'D') ?
                C\NS_LIB . "DoubleIndexBundle" :
                C\NS_LIB . "IndexArchiveBundle";
            $crawl['CRAWL_TIME'] = $matches[2];
            $info = $bundle_class_name::getArchiveInfo($dir);
            if (isset($info['DESCRIPTION'])) {
                set_error_handler(null);
                $index_info = @unserialize($info['DESCRIPTION']);
                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
            } else {
                $index_info = [];
                $index_info['DESCRIPTION'] = "ERROR!! $dir<br />" .
                    print_r($info, true);
            }
            $crawl['DESCRIPTION'] = "";
            if (!$return_recrawls &&
                isset($index_info[self::CRAWL_TYPE]) &&
                $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
                continue;
            } elseif ($return_recrawls  &&
                isset($index_info[self::CRAWL_TYPE]) &&
                $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL
                && empty($index_info[self::ARC_TYPE])) {
                $crawl['DESCRIPTION'] = "RECRAWL::";
            }
            $sched_path = C\CRAWL_DIR . '/schedules/'.
                self::schedule_data_base_name . $crawl['CRAWL_TIME'];
            $crawl['RESUMABLE'] = false;
            if (is_dir($sched_path)) {
                $sched_dir = opendir($sched_path);
                while (($name = readdir($sched_dir)) !==  false) {
                    $sub_path = "$sched_path/$name";
                    if (!is_dir($sub_path) || $name == '.' ||
                        $name == '..') {
                        continue;
                    }
                    $sub_dir = opendir($sub_path);
                    $i = 0;
                    while (($sub_name = readdir($sub_dir)) !==  false &&
                        $i < 5) {
                        if ($sub_name[0] == 'A' && $sub_name[1] == 't') {
                            $crawl['RESUMABLE'] = true;
                            break 2;
                        }
                    }
                    closedir($sub_dir);
                }
                closedir($sched_dir);
            }
            if (isset($index_info['DESCRIPTION'])) {
                $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
            }
            $crawl['VISITED_URLS_COUNT'] =
                isset($info['VISITED_URLS_COUNT']) ?
                $info['VISITED_URLS_COUNT'] : 0;
            $crawl['COUNT'] = (isset($info['COUNT'])) ? $info['COUNT'] : 0;
            if (!empty($info['QUERY_COUNT'])) {
                $crawl['QUERY_VISITED_URLS_COUNT'] =
                    isset($info['QUERY_VISITED_URLS_COUNT']) ?
                    $info['QUERY_VISITED_URLS_COUNT'] : 0;
                $crawl['QUERY_COUNT'] = (isset($info['QUERY_COUNT'])) ?
                    $info['QUERY_COUNT'] : 0;
            }
            $crawl['NUM_DOCS_PER_PARTITION'] =
                (isset($info['NUM_DOCS_PER_PARTITION'])) ?
                $info['NUM_DOCS_PER_PARTITION'] : 0;
            $crawl['WRITE_PARTITION'] =
                (isset($info['WRITE_PARTITION'])) ?
                $info['WRITE_PARTITION'] : 0;
            $list[] = $crawl;
        }
        if ($return_arc_bundles) {
            $dirs = glob(C\CRAWL_DIR.'/archives/*', GLOB_ONLYDIR);
            foreach ($dirs as $dir) {
                $crawl = [];
                $crawl['CRAWL_TIME'] = crc32($dir);
                $crawl['DESCRIPTION'] = "ARCFILE::";
                $crawl['ARC_DIR'] = $dir;
                $ini_file = "$dir/arc_description.ini";
                if (!file_exists($ini_file)) {
                    continue;
                } else {
                    $ini = L\parse_ini_with_fallback($ini_file);
                    $crawl['ARC_TYPE'] = $ini['arc_type'];
                    $crawl['DESCRIPTION'] .= $ini['description'];
                }
                $crawl['VISITED_URLS_COUNT'] = 0;
                $crawl['COUNT'] = 0;
                $crawl['NUM_DOCS_PER_PARTITION'] = 0;
                $crawl['WRITE_PARTITION'] = 0;
                $list[] = $crawl;
            }
        }
        return $list;
    }
    /**
     * When @see getCrawlList() is used in a multi-queue server this method
     * used to integrate the crawl lists received by the different machines
     *
     * @param array $list_strings serialized crawl list data from different
     * queue servers
     * @param string $data_field field of $list_strings to use for data
     * @return array list of crawls and their meta data
     */
    public function aggregateCrawlList($list_strings, $data_field = null)
    {
        set_error_handler(null);
        $pre_list = [];
        foreach ($list_strings as $list_string) {
            $a_list = @unserialize(L\webdecode(
                $list_string[self::PAGE]));
            if ($data_field != null) {
                $a_list = $a_list[$data_field];
            }
            if (is_array($a_list)) {
                foreach ($a_list as $elt) {
                    $timestamp = $elt['CRAWL_TIME'];
                    if (!isset($pre_list[$timestamp])) {
                        $pre_list[$timestamp] = $elt;
                    } else {
                        if (isset($elt["DESCRIPTION"]) &&
                            $elt["DESCRIPTION"] != "") {
                            $pre_list[$timestamp]["DESCRIPTION"] =
                                $elt["DESCRIPTION"];
                        }
                        $pre_list[$timestamp]["VISITED_URLS_COUNT"] +=
                            $elt["VISITED_URLS_COUNT"];
                        $pre_list[$timestamp]["COUNT"] +=
                            $elt["COUNT"];
                        if (!empty($elt["QUERY_COUNT"])) {
                            $pre_list[$timestamp]["QUERY_VISITED_URLS_COUNT"] +=
                                $elt["QUERY_VISITED_URLS_COUNT"];
                            $pre_list[$timestamp]["QUERY_COUNT"] +=
                                $elt["QUERY_COUNT"];
                        }
                        if (isset($elt['RESUMABLE'])) {
                            $pre_list[$timestamp]['RESUMABLE'] |=
                                $elt['RESUMABLE'];
                        }
                    }
                }
            }
        }
        $list = array_values($pre_list);
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        return $list;
    }
    /**
     * Determines if the length of time since any of the fetchers has spoken
     * with any of the queue servers has exceeded CRAWL_TIME_OUT. If so,
     * typically the caller of this method would do something such as officially
     * stop the crawl.
     *
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return bool whether the current crawl is stalled or not
     */
    public function crawlStalled($machine_urls = null)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $outputs = $this->execMachines("crawlStalled", $machine_urls);
            return $this->aggregateStalled($outputs);
        }
        $stat_prefix = C\CRAWL_DIR . "/schedules/";
        $stat_prefix_len = strlen($stat_prefix);
        $stat_suffix = "-crawl_status.txt";
        $stat_suffix_len = strlen($stat_suffix);
        $stat_files = glob("$stat_suffix*$stat_suffix");
        $statuses = [];
        foreach ($stat_files as $stat_file) {
            $channel = substr($stat_file, $stat_prefix_len, -$stat_suffix_len);
            $crawl_status = unserialize(file_get_contents($stat_file));
            $non_repeating = (empty($crawl_status["REPEAT_TYPE"]) ||
                $crawl_status["REPEAT_TYPE"] < 0);
            /* assume if status not updated for CRAWL_TIME_OUT
               crawl not active (do check for both scheduler and indexer) */
            if ($non_repeating && filemtime($stat_file) +
                    C\CRAWL_TIME_OUT < time()) {
                $statuses[$channel] = true;
            } else {
                $schedule_stat_file = C\CRAWL_DIR .
                    "/schedules/$channel-schedule_status.txt";
                $schedule_status_exists = file_exists($schedule_stat_file);
                if ($schedule_status_exists && $non_repeating &&
                    filemtime($schedule_stat_file) + C\CRAWL_TIME_OUT < time()) {
                    $statuses[$channel] = true;
                } else {
                    $statuses[$channel] = false;
                }
            }
        }
        return $statuses;
    }
    /**
     * When @see crawlStalled() is used in a multi-queue server this method
     * used to integrate the stalled information received by the different
     * machines
     *
     * @param array $stall_statuses contains web encoded serialized data one
     * one field of which has the boolean data concerning stalled statis
     *
     * @param string $data_field field of $stall_statuses to use for data
     *     if null then each element of $stall_statuses is a wen encoded
     *     serialized boolean
     * @return array
     */
    public function aggregateStalled($stall_statuses, $data_field = null)
    {
        set_error_handler(null);
        $out_stalled = [];
        foreach ($stall_statuses as $status) {
            $stall_status = @unserialize(L\webdecode($status[self::PAGE]));
            if ($data_field !== null) {
                $stall_status = $stall_status[$data_field];
            }
            foreach ($stall_status  as $channel => $channel_status) {
                if (isset($out_stalled[$channel])) {
                    $out_stalled[$channel] = $channel_status &&
                        $out_stalled[$channel];
                } else {
                    $out_stalled[$channel] = $channel_status;
                }
            }
        }
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        return $out_stalled;
    }
    /**
     * Returns data about current crawl such as DESCRIPTION, TIMESTAMP,
     * peak memory of various processes, most recent fetcher, most recent
     * urls, urls seen, urls visited, etc.
     *
     * @param array $machine_urls an array of urls of yioop queue servers
     *     on which the crawl is being conducted
     * @return array associative array of the said data
     */
    public function crawlStatus($machine_urls = null)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $status_strings = $this->execMachines("crawlStatus", $machine_urls);
            return $this->aggregateStatuses($status_strings);
        }
        $stat_prefix = C\CRAWL_DIR . "/schedules/";
        $stat_prefix_len = strlen($stat_prefix);
        $stat_suffix = "-crawl_status.txt";
        $stat_suffix_len = strlen($stat_suffix);
        $stat_files = glob("$stat_prefix*$stat_suffix");
        $data = [];
        $statuses = [];
        foreach ($stat_files as $stat_file) {
            $channel = substr($stat_file, $stat_prefix_len, -$stat_suffix_len);
            $data[$channel] = [];
            $crawl_status =
                @unserialize(file_get_contents($stat_file));
            if (empty($crawl_status)) {
                unlink($stat_file);
                continue;
            }
            $schedule_status_file = C\CRAWL_DIR .
                "/schedules/$channel-schedule_status.txt";
            $schedule_status_exists = file_exists($schedule_status_file);
            if ($schedule_status_exists) {
                $schedule_status = @unserialize(file_get_contents(
                    $schedule_status_file));
                if (isset($schedule_status[self::TYPE]) &&
                    $schedule_status[self::TYPE] == self::SCHEDULER) {
                    $data[$channel]['SCHEDULER_PEAK_MEMORY'] =
                        isset($schedule_status[self::MEMORY_USAGE]) ?
                        $schedule_status[self::MEMORY_USAGE] : 0;
                }
            }
            $data[$channel] = (isset($crawl_status)
                && is_array($crawl_status)) ?
                array_merge($data[$channel], $crawl_status) : $data[$channel];
            if (isset($data[$channel]['VISITED_COUNT_HISTORY']) &&
                count($data[$channel]['VISITED_COUNT_HISTORY']) > 1) {
                $recent = array_shift($data[$channel]['VISITED_COUNT_HISTORY']);
                $data[$channel]["MOST_RECENT_TIMESTAMP"] = $recent[0];
                $oldest = array_pop($data[$channel]['VISITED_COUNT_HISTORY']);
                unset($data[$channel]['VISITED_COUNT_HISTORY']);
                $change_in_time_hours = floatval(time() - $oldest[0]) /
                    floatval(C\ONE_HOUR);
                $change_in_urls = $recent[1] - $oldest[1];
                $data[$channel]['VISITED_URLS_COUNT_PER_HOUR'] =
                    ($change_in_time_hours > 0) ?
                    $change_in_urls/$change_in_time_hours : 0;
            } else {
                $data[$channel]['VISITED_URLS_COUNT_PER_HOUR'] = 0;
            }
        }
        return $data;
    }
    /**
     * When @see crawlStatus() is used in a multi-queue server this method
     * used to integrate the status information received by the different
     * machines
     *
     * @param array $status_strings
     * @param string $data_field field of $status_strings to use for data
     * @return array associative array of DESCRIPTION, TIMESTAMP,
     * peak memory of various processes, most recent fetcher, most recent
     * urls, urls seen, urls visited, etc.
     */
    public function aggregateStatuses($status_strings, $data_field = null)
    {
        $status = [];
        $init_channel_status = [
            'WEBAPP_PEAK_MEMORY' => 0,
            'FETCHER_PEAK_MEMORY' => 0,
            'QUEUE_PEAK_MEMORY' => 0,
            'SCHEDULER_PEAK_MEMORY' => 0,
            'COUNT' => 0,
            'VISITED_URLS_COUNT' => 0,
            'VISITED_URLS_COUNT_PER_HOUR' => 0,
            'MOST_RECENT_TIMESTAMP' => 0,
            'DESCRIPTION' => "",
            'MOST_RECENT_FETCHER' => "",
            'MOST_RECENT_URLS_SEEN' => [],
            'CRAWL_TIME' => 0
        ];
        set_error_handler(null);
        foreach ($status_strings as $status_string) {
            $a_status = @unserialize(L\webdecode(
                    $status_string[self::PAGE]));
            if ($data_field != null) {
                $a_status = $a_status[$data_field];
            }
            $count_fields = ["COUNT", "VISITED_URLS_COUNT_PER_HOUR",
                "VISITED_URLS_COUNT"];
            if (empty($a_status)) {
                return [];
            }
            foreach ($a_status as $channel => $a_status_data) {
                if (empty($status[$channel])) {
                    $status[$channel] = $init_channel_status;
                }
                foreach ($count_fields as $field) {
                    if (isset($a_status_data[$field])) {
                        $status[$channel][$field] += $a_status_data[$field];
                    }
                }
                if (isset($a_status_data["CRAWL_TIME"]) &&
                    $a_status_data["CRAWL_TIME"] >=
                    $status[$channel]['CRAWL_TIME']) {
                    $status[$channel]['CRAWL_TIME'] =
                        $a_status_data["CRAWL_TIME"];
                    $text_fields = ["DESCRIPTION", "MOST_RECENT_FETCHER"];
                    foreach ($text_fields as $field) {
                        if (isset($a_status_data[$field])) {
                            if ($status[$channel][$field] == "" ||
                                in_array($status[$channel][$field],
                                    ["BEGIN_CRAWL", "RESUME_CRAWL"])) {
                                $status[$channel][$field] =
                                    $a_status_data[$field];
                            }
                        }
                    }
                }
                if (isset($a_status_data["MOST_RECENT_TIMESTAMP"]) &&
                    $status[$channel]["MOST_RECENT_TIMESTAMP"] <=
                        $a_status_data["MOST_RECENT_TIMESTAMP"]) {
                    $status[$channel]["MOST_RECENT_TIMESTAMP"] =
                        $a_status_data["MOST_RECENT_TIMESTAMP"];
                    if (isset($a_status_data['MOST_RECENT_URLS_SEEN'])) {
                        $status[$channel]['MOST_RECENT_URLS_SEEN'] =
                            $a_status_data['MOST_RECENT_URLS_SEEN'];
                    }
                }
                $memory_fields = ["WEBAPP_PEAK_MEMORY", "FETCHER_PEAK_MEMORY",
                    "QUEUE_PEAK_MEMORY", "SCHEDULER_PEAK_MEMORY"];
                foreach ($memory_fields as $field) {
                    $status[$channel][$field] =
                        (!isset($a_status[$field])) ? 0 :
                        max($status[$channel][$field], $a_status[$field]);
                }
            }
        }
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        return $status;
    }
    /**
     * This method is used to reduce the number of network requests
     * needed by the crawlStatus method of admin_controller. It returns
     * an array containing the results of the @see crawlStalled
     * @see crawlStatus and @see getCrawlList methods
     *
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return array containing three components one for each of the three
     *     kinds of results listed above
     */
    public function combinedCrawlInfo($machine_urls = null)
    {
        if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
            $combined_strings =
                $this->execMachines("combinedCrawlInfo", $machine_urls);
            $combined = [];
            $combined[] = $this->aggregateStalled($combined_strings,
                0);
            $combined[] = $this->aggregateStatuses($combined_strings,
                1);
            $combined[] = $this->aggregateCrawlList($combined_strings,
                2);
            return $combined;
        }
        $combined = [];
        $combined[] = $this->crawlStalled();
        $combined[] = $this->crawlStatus();
        $combined[] = $this->getCrawlList(false, true);
        return $combined;
    }
    /**
     * Add the provided urls to the schedule directory of URLs that will
     * be crawled
     *
     * @param string $timestamp Unix timestamp of crawl to add to schedule of
     * @param array $inject_urls urls to be added to the schedule of
     *     the active crawl
     * @param array $machine_urls an array of urls of yioop queue servers
     */
    public function injectUrlsCurrentCrawl($timestamp, $inject_urls,
        $machine_urls = null)
    {
        if ($machine_urls != null &&
            !$this->isSingleLocalhost($machine_urls, $timestamp)) {
            $this->execMachines("injectUrlsCurrentCrawl", $machine_urls,
                serialize(array($timestamp, $inject_urls)));
            return;
        }

        $dir = C\CRAWL_DIR."/schedules/".
            self::schedule_data_base_name. $timestamp;
        if (!file_exists($dir)) {
            mkdir($dir);
            chmod($dir, 0777);
        }
        $day = floor($timestamp/C\ONE_DAY) - 1;
            /* want before all other schedules,
               execute next */
        $dir .= "/$day";
        if (!file_exists($dir)) {
            mkdir($dir);
            chmod($dir, 0777);
        }
        $count = count($inject_urls);
        if ($count > 0) {
            $now = time();
            $schedule_data = [];
            $schedule_data[self::SCHEDULE_TIME] =
                $timestamp;
            $schedule_data[self::TO_CRAWL] = [];
            for ($i = 0; $i < $count; $i++) {
                $url = $inject_urls[$i];
                $hash = L\crawlHash($now.$url);
                $schedule_data[self::TO_CRAWL][] =
                    [$url, 1, $hash];
            }
            $data_string = L\webencode(
                gzcompress(serialize($schedule_data)));
            $data_hash = L\crawlHash($data_string);
            file_put_contents($dir."/At1From127-0-0-1".
                "WithHash$data_hash.txt", $data_string);
            return true;
        }
        return false;
    }
    /**
     * Computes for each word in an array of words a count of the total number
     * of times it occurs in this crawl model's default index.
     *
     * @param array $words words to find the counts for
     * @param array $machine_urls machines to invoke this command on
     * @return array associative array of word => counts
     */
     public function countWords($words, $machine_urls = null)
     {
         if ($machine_urls != null && !$this->isSingleLocalhost($machine_urls)) {
             $count_strings = $this->execMachines("countWords", $machine_urls,
                serialize(array($words, $this->index_name)));
             $word_counts = [];
             foreach ($count_strings as $count_string) {
                 $a_word_counts = unserialize(L\webdecode(
                        $count_string[self::PAGE]));
                 if (is_array($a_word_counts)) {
                     foreach ($a_word_counts as $word => $count) {
                         $word_counts[$word] = (isset($word_counts[$word])) ?
                            $word_counts[$word] + $count : $count;
                     }
                 }
             }
             return $word_counts;
         }
         $index_archive = IndexManager::getIndex($this->index_name);
         $hashes = [];
         $lookup = [];
         foreach ($words as $word) {
             $tmp = L\crawlHash($word);
             $hashes[] = $tmp;
             $lookup[$tmp] = $word;
         }
         $word_key_counts =
            $index_archive->countWordKeys($hashes);
         $phrases = [];
         $word_counts = [];
         if (is_array($word_key_counts) && count($word_key_counts) > 0) {
             foreach ($word_key_counts as $word_key =>$count) {
                 $word_counts[$lookup[$word_key]] = $count;
             }
         }
         return $word_counts;
     }
}

ViewGit