<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Used for crawlLog and crawlHash
 */
require_once __DIR__ . '/Utility.php';
/**
 * A DoubleIndexBundle encapsulates and provides methods for two
 * IndexDocumentBundles used to store a repeating crawl. One of these bundles
 * is used to handle current search queries, while the other is used to store
 * an ongoing crawl; once the crawl time has been reached, the roles of the
 * two bundles are swapped.
 *
 * @author Chris Pollett
 */
class DoubleIndexBundle implements CrawlConstants
{
    /**
     * How frequently the live and ongoing archives should be swapped,
     * in seconds
     * @var int
     */
    public $repeat_frequency;
    /**
     * Last time the live and ongoing archives were switched
     * @var int
     */
    public $repeat_time;
    /**
     * The number of times the live and ongoing archives have swapped
     * @var int
     */
    public $swap_count;
    /**
     * The internal IndexDocumentBundle which is active
     * @var IndexDocumentBundle
     */
    public $active_archive;
    /**
     * The number (0 or 1) of the internal IndexDocumentBundle which is
     * active
     * @var int
     */
    public $active_archive_num;
    /**
     * A short text name for this DoubleIndexBundle
     * @var string
     */
    public $description;
    /**
     * Number of docs before a new generation is started for an
     * IndexDocumentBundle in this DoubleIndexBundle
     * @var int
     */
    public $num_docs_per_partition;
    /**
     * Folder name used to store this bundle. Declared explicitly as
     * dynamic properties are deprecated as of PHP 8.2.
     * @var string
     */
    public $dir_name;
    /**
     * Makes or initializes a DoubleIndexBundle with the provided parameters
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether to open archive only for reading
     *      or reading and writing
     * @param string $description a text name/serialized info about this
     *      IndexDocumentBundle
     * @param int $num_docs_per_partition the number of pages to be stored
     *      in a single shard
     * @param int $repeat_frequency how often the crawl should be redone in
     *      seconds (has no effect if $read_only_archive is true)
     */
    public function __construct($dir_name, $read_only_archive = true,
        $description = null,
        $num_docs_per_partition = C\NUM_DOCS_PER_PARTITION,
        $repeat_frequency = 3600)
    {
        $this->dir_name = $dir_name;
        $this->num_docs_per_partition = $num_docs_per_partition;
        $is_dir = is_dir($this->dir_name);
        if (!$is_dir && !$read_only_archive) {
            mkdir($this->dir_name);
            $this->active_archive = new IndexDocumentBundle(
                $dir_name . "/bundle0", false, null,
                $num_docs_per_partition);
            /* constructing the second bundle creates its on-disk folder
               structure; the object itself is not needed here */
            new IndexDocumentBundle($dir_name . "/bundle1", false, null,
                $num_docs_per_partition);
        } else if (!$is_dir) {
            // read-only open of a non-existent bundle; nothing to set up
            return;
        }
        $read_status = false;
        if (file_exists($this->dir_name . "/status.txt")) {
            $status = unserialize(
                file_get_contents($this->dir_name . "/status.txt"));
            $read_status = true;
        }
        // prefer values recorded in status.txt over constructor defaults
        $this->repeat_frequency = (empty($status["repeat_frequency"])) ?
            $repeat_frequency : $status["repeat_frequency"];
        $this->repeat_time = (empty($status["repeat_time"])) ? time() :
            $status["repeat_time"];
        $this->swap_count = (empty($status["swap_count"])) ? 0 :
            $status["swap_count"];
        // even swap counts use bundle0, odd swap counts bundle1
        $this->active_archive_num = $this->swap_count % 2;
        $this->description = (empty($status["DESCRIPTION"])) ?
            $description : $status["DESCRIPTION"];
        if (!$read_status && !$read_only_archive) {
            $status = ["repeat_frequency" => $this->repeat_frequency,
                "repeat_time" => $this->repeat_time,
                "swap_count" => $this->swap_count,
                "DESCRIPTION" => $this->description
            ];
            file_put_contents($this->dir_name . "/status.txt",
                serialize($status));
        }
        if (empty($this->active_archive)) {
            $this->active_archive = new IndexDocumentBundle($dir_name .
                "/bundle" . $this->active_archive_num, $read_only_archive,
                null, $num_docs_per_partition);
        }
    }
    /**
     * Switches which of the two bundles is the one new index data will
     * be written to. Before switching, closes the old bundle properly.
     */
    public function swapActiveBundle()
    {
        // flush any pending postings; forceSave if the dictionary
        // update did not already persist them
        if (!$this->active_archive->updateDictionary()) {
            $this->active_archive->forceSave();
        }
        $this->swap_count++;
        $this->active_archive_num = $this->swap_count % 2;
        $bundle_name = $this->dir_name . "/bundle" .
            $this->active_archive_num;
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
        $db = new $db_class();
        // wipe the previously live bundle so the new crawl starts fresh
        $db->unlinkRecursive($bundle_name, true);
        $this->active_archive = new IndexDocumentBundle($bundle_name,
            false, null, $this->num_docs_per_partition);
        $this->repeat_time = time();
        $status = ["repeat_frequency" => $this->repeat_frequency,
            "repeat_time" => $this->repeat_time,
            "swap_count" => $this->swap_count,
            "DESCRIPTION" => $this->description
        ];
        file_put_contents($this->dir_name . "/status.txt",
            serialize($status));
    }
    /**
     * Used when a crawl stops to perform final dictionary operations
     * to produce a working stand-alone index.
     */
    public function stopIndexing()
    {
        $this->forceSave();
        /* we haven't swapped yet, so we want to serve results from the
           bundle that exists, i.e., the not active one */
        $status = ["repeat_frequency" => $this->repeat_frequency,
            "repeat_time" => $this->repeat_time,
            "swap_count" => $this->swap_count,
            "DESCRIPTION" => $this->description
        ];
        file_put_contents($this->dir_name . "/status.txt",
            serialize($status));
    }
    /**
     * Checks if the amount of time since the two IndexDocumentBundles in
     * this DoubleIndexBundle have had their roles swapped has exceeded
     * the swap time for this bundle.
     *
     * @return bool true if the swap time has been exceeded
     */
    public function swapTimeReached()
    {
        return ($this->repeat_time + $this->repeat_frequency < time());
    }
    /**
     * Returns a document summary from the active archive associated with
     * the supplied key
     *
     * @param string $doc_key key (usually based on url of where document
     *      came from) associated with document want summary of
     * @return array desired summary
     */
    public function getSummary($doc_key)
    {
        return $this->active_archive->getSummary($doc_key);
    }
    /**
     * Returns a full page cache (usually the web page downloaded as opposed
     * to a summary of the web page) associated with a supplied key.
     *
     * @param string $doc_key key (usually based on url of where document
     *      came from) associated with document want cache of
     * @return array desired cache
     */
    public function getCachePage($doc_key)
    {
        return $this->active_archive->getCachePage($doc_key);
    }
    /**
     * Adds the array of $pages to the active IndexDocumentBundle.
     *
     * @param array &$pages data to store
     * @param int $visited_urls_count number to add to the count of visited
     *      urls (visited urls is a smaller number than the total count of
     *      objects stored in the index)
     */
    public function addPages(&$pages, $visited_urls_count)
    {
        return $this->active_archive->addPages($pages,
            $visited_urls_count);
    }
    /**
     * Checks if there is enough data in the active partition of the active
     * archive to warrant storing in the dictionary; if so, builds an
     * inverted index for the active partition of the active archive and
     * adds the postings to the dictionary.
     *
     * @param string $taking_too_long_touch name of file to touch if
     *      checking the update takes longer than LOG_TIMEOUT. To prevent
     *      a crawl from stopping because nothing is happening the
     *      file usually supplied is C\SCHEDULES_DIR . "/{$this->channel}-" .
     *      self::crawl_status_file
     */
    public function updateDictionary($taking_too_long_touch = null)
    {
        return $this->active_archive->updateDictionary(
            $taking_too_long_touch);
    }
    /**
     * Given a $site array of information about a web page/document, uses
     * CrawlConstant::URL and CrawlConstant::HASH fields to compute a
     * unique doc id for the array.
     *
     * @param array $site site to compute doc_id for
     */
    public function computeDocId($site)
    {
        return $this->active_archive->computeDocId($site);
    }
    /**
     * Forces the current shard to be saved
     */
    public function forceSave()
    {
        $this->active_archive->forceSave();
    }
    /**
     * The start schedule is the first schedule a queue server makes
     * when a crawl is just started. To facilitate switching between
     * IndexDocumentBundles when doing a crawl with a DoubleIndexBundle,
     * this start schedule is stored in the DoubleIndexBundle; when the
     * IndexDocumentBundles' roles (query and crawl) are swapped,
     * the DoubleIndexBundle copy is used to start the crawl from the
     * beginning again. This method copies the start schedule from the
     * schedule folder to the DoubleIndexBundle at the start of a crawl
     * for later use to do this swapping.
     *
     * @param string $dir_name folder in the bundle where the schedule
     *      should be stored
     * @param int $channel channel that is being used to do the current
     *      double index crawl. Typical yioop instance might have several
     *      ongoing crawls each with a different channel
     */
    public static function setStartSchedule($dir_name, $channel)
    {
        $start_schedule = C\SCHEDULES_DIR . "/$channel-" .
            self::schedule_start_name;
        if (file_exists($dir_name) && is_dir($dir_name) &&
            file_exists($start_schedule)) {
            copy($start_schedule, $dir_name . "/" .
                self::schedule_start_name);
        }
    }
    /**
     * The start schedule is the first schedule a queue server makes
     * when a crawl is just started. To facilitate switching between
     * IndexDocumentBundles when doing a crawl with a DoubleIndexBundle,
     * this start schedule is stored in the DoubleIndexBundle; when the
     * IndexDocumentBundles' roles (query and crawl) are swapped,
     * this method copies the start schedule from the DoubleIndexBundle
     * to the schedule folder to restart the crawl.
     *
     * @param string $dir_name folder in the bundle where the schedule
     *      is stored
     * @param int $channel channel that is being used to do the current
     *      double index crawl. Typical yioop instance might have several
     *      ongoing crawls each with a different channel
     */
    public static function getStartSchedule($dir_name, $channel)
    {
        $start_schedule = C\SCHEDULES_DIR . "/$channel-" .
            self::schedule_start_name;
        if (file_exists($dir_name) && is_dir($dir_name)) {
            copy($dir_name . "/" . self::schedule_start_name,
                $start_schedule);
        }
    }
    /**
     * Gets information about a DoubleIndexBundle out of its status.txt
     * file
     *
     * @param string $dir_name folder name of the DoubleIndexBundle to get
     *      info for
     * @return array containing the name (description) of the
     *      DoubleIndexBundle, the number of items stored in it, and the
     *      number of WebArchive file partitions it uses
     */
    public static function getArchiveInfo($dir_name)
    {
        $info = unserialize(file_get_contents($dir_name . "/status.txt"));
        $swap_count = intval($info['swap_count']);
        $active = $swap_count % 2;
        $inactive = 1 - $active;
        $bundle_name = $dir_name . "/bundle$active";
        $bundle_class_name = C\NS_LIB . "IndexDocumentBundle";
        // legacy bundles (with a summaries subfolder) use the old class
        if (file_exists($bundle_name . "/summaries")) {
            $bundle_class_name = C\NS_LIB . "IndexArchiveBundle";
        }
        $count_info = $bundle_class_name::getArchiveInfo($bundle_name);
        $info['COUNT'] = $count_info['COUNT'] +
            ($count_info['ACTIVE_COUNT'] ?? 0);
        $info['VISITED_URLS_COUNT'] = $count_info['VISITED_URLS_COUNT'];
        $bundle_name = $dir_name . "/bundle$inactive";
        $count_info = $bundle_class_name::getArchiveInfo($bundle_name);
        $info['QUERY_COUNT'] = $count_info['COUNT'] +
            ($count_info['ACTIVE_COUNT'] ?? 0);
        $info['QUERY_VISITED_URLS_COUNT'] =
            $count_info['VISITED_URLS_COUNT'];
        return $info;
    }
    /**
     * Sets the archive info struct for the index archive and web archive
     * bundles associated with this double index bundle. This struct has
     * fields like: DESCRIPTION (serialized store of global parameters of
     * the crawl like seed sites, timestamp, etc), COUNT (num urls seen +
     * pages seen stored for the index archive in use for crawling),
     * VISITED_URLS_COUNT (number of pages seen for the index archive in
     * use for crawling), QUERY_COUNT (num urls seen + pages seen stored
     * for the index archive in use for querying, not crawling),
     * QUERY_VISITED_URLS_COUNT (number of pages seen for the
     * index archive in use for querying, not crawling),
     * NUM_DOCS_PER_PARTITION (how many doc/web archive in bundle).
     *
     * @param string $dir_name folder with archive bundle
     * @param array $info struct with above fields
     */
    public static function setArchiveInfo($dir_name, $info)
    {
        file_put_contents($dir_name . "/status.txt", serialize($info));
    }
    /**
     * Returns the last time the archive info of the bundle was modified.
     *
     * @param string $dir_name folder with archive bundle
     */
    public static function getParamModifiedTime($dir_name)
    {
        $info = unserialize(file_get_contents($dir_name . "/status.txt"));
        $swap_count = intval($info['swap_count']);
        $active = $swap_count % 2;
        $bundle_name = $dir_name . "/bundle" . $active;
        $count_time = IndexDocumentBundle::getParamModifiedTime(
            $bundle_name);
        clearstatcache();
        return max(filemtime($dir_name . "/status.txt"), $count_time);
    }
}