<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Used for crawlLog and crawlHash
 */
require_once __DIR__ . '/Utility.php';
/**
 * A DoubleIndexBundle encapsulates and provides methods for two
 * IndexDocumentBundles used to store a repeating crawl. One of these bundles
 * is used to handle current search queries, while the other is used to store
 * an ongoing crawl; once the crawl time has been reached, the roles of the
 * two bundles are swapped.
 *
 * @author Chris Pollett
 */
class DoubleIndexBundle implements CrawlConstants
{
    /**
     * How frequently the live and ongoing archives should be swapped,
     * in seconds
     * @var int
     */
    public $repeat_frequency;
    /**
     * Last time the live and ongoing archives were switched
     * @var int
     */
    public $repeat_time;
    /**
     * The number of times the live and ongoing archives have swapped
     * @var int
     */
    public $swap_count;
    /**
     * The internal IndexDocumentBundle which is active
     * @var IndexDocumentBundle
     */
    public $active_archive;
    /**
     * The number (0 or 1) of the internal IndexDocumentBundle which is
     * active
     * @var int
     */
    public $active_archive_num;
    /**
     * A short text name for this DoubleIndexBundle
     * @var string
     */
    public $description;
    /**
     * Number of docs before a new generation is started for an
     * IndexDocumentBundle in this DoubleIndexBundle
     * @var int
     */
    public $num_docs_per_partition;
    /**
     * Folder name used to store this bundle. Declared explicitly as
     * dynamic properties are deprecated as of PHP 8.2.
     * @var string
     */
    public $dir_name;
    /**
     * Makes or initializes a DoubleIndexBundle with the provided parameters
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether to open archive only for reading
     *      or reading and writing
     * @param string $description a text name/serialized info about this
     *      IndexDocumentBundle
     * @param int $num_docs_per_partition the number of pages to be stored
     *      in a single shard
     * @param int $repeat_frequency how often the crawl should be redone in
     *      seconds (has no effect if $read_only_archive is true)
     */
    public function __construct($dir_name, $read_only_archive = true,
        $description = null,
        $num_docs_per_partition = C\NUM_DOCS_PER_PARTITION,
        $repeat_frequency = 3600)
    {
        $this->dir_name = $dir_name;
        $this->num_docs_per_partition = $num_docs_per_partition;
        $is_dir = is_dir($this->dir_name);
        if (!$is_dir && !$read_only_archive) {
            mkdir($this->dir_name);
            $this->active_archive = new IndexDocumentBundle(
                $dir_name . "/bundle0", false, null,
                $num_docs_per_partition);
            /* constructing the second bundle creates its on-disk folder
               structure; the object itself is not needed here */
            new IndexDocumentBundle($dir_name . "/bundle1", false, null,
                $num_docs_per_partition);
        } else if (!$is_dir) {
            // read-only open of a non-existent bundle; nothing to set up
            return;
        }
        $read_status = false;
        if (file_exists($this->dir_name . "/status.txt")) {
            $status = unserialize(
                file_get_contents($this->dir_name . "/status.txt"));
            $read_status = true;
        }
        // prefer values recorded in status.txt over constructor defaults
        $this->repeat_frequency = (empty($status["repeat_frequency"])) ?
            $repeat_frequency : $status["repeat_frequency"];
        $this->repeat_time = (empty($status["repeat_time"])) ? time() :
            $status["repeat_time"];
        $this->swap_count = (empty($status["swap_count"])) ? 0 :
            $status["swap_count"];
        // even swap counts use bundle0, odd swap counts bundle1
        $this->active_archive_num = $this->swap_count % 2;
        $this->description = (empty($status["DESCRIPTION"])) ?
            $description : $status["DESCRIPTION"];
        if (!$read_status && !$read_only_archive) {
            $status = ["repeat_frequency" => $this->repeat_frequency,
                "repeat_time" => $this->repeat_time,
                "swap_count" => $this->swap_count,
                "DESCRIPTION" => $this->description
            ];
            file_put_contents($this->dir_name . "/status.txt",
                serialize($status));
        }
        if (empty($this->active_archive)) {
            $this->active_archive = new IndexDocumentBundle($dir_name .
                "/bundle" . $this->active_archive_num, $read_only_archive,
                null, $num_docs_per_partition);
        }
    }
    /**
     * Switches which of the two bundles is the one new index data will
     * be written to. Before switching, closes the old bundle properly.
     */
    public function swapActiveBundle()
    {
        // flush any pending postings; forceSave if the dictionary
        // update did not already persist them
        if (!$this->active_archive->updateDictionary()) {
            $this->active_archive->forceSave();
        }
        $this->swap_count++;
        $this->active_archive_num = $this->swap_count % 2;
        $bundle_name = $this->dir_name . "/bundle" .
            $this->active_archive_num;
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
        $db = new $db_class();
        // wipe the previously live bundle so the new crawl starts fresh
        $db->unlinkRecursive($bundle_name, true);
        $this->active_archive = new IndexDocumentBundle($bundle_name,
            false, null, $this->num_docs_per_partition);
        $this->repeat_time = time();
        $status = ["repeat_frequency" => $this->repeat_frequency,
            "repeat_time" => $this->repeat_time,
            "swap_count" => $this->swap_count,
            "DESCRIPTION" => $this->description
        ];
        file_put_contents($this->dir_name . "/status.txt",
            serialize($status));
    }
    /**
     * Used when a crawl stops to perform final dictionary operations
     * to produce a working stand-alone index.
     */
    public function stopIndexing()
    {
        $this->forceSave();
        /* we haven't swapped yet, so we want to serve results from the
           bundle that exists, i.e., the not active one */
        $status = ["repeat_frequency" => $this->repeat_frequency,
            "repeat_time" => $this->repeat_time,
            "swap_count" => $this->swap_count,
            "DESCRIPTION" => $this->description
        ];
        file_put_contents($this->dir_name . "/status.txt",
            serialize($status));
    }
    /**
     * Checks if the amount of time since the two IndexDocumentBundles in
     * this DoubleIndexBundle have had their roles swapped has exceeded
     * the swap time for this bundle.
     *
     * @return bool true if the swap time has been exceeded
     */
    public function swapTimeReached()
    {
        return ($this->repeat_time + $this->repeat_frequency < time());
    }
    /**
     * Returns a document summary from the active archive associated with
     * the supplied key
     *
     * @param string $doc_key key (usually based on url of where document
     *      came from) associated with document want summary of
     * @return array desired summary
     */
    public function getSummary($doc_key)
    {
        return $this->active_archive->getSummary($doc_key);
    }
    /**
     * Returns a full page cache (usually the web page downloaded as opposed
     * to a summary of the web page) associated with a supplied key.
     *
     * @param string $doc_key key (usually based on url of where document
     *      came from) associated with document want cache of
     * @return array desired cache
     */
    public function getCachePage($doc_key)
    {
        return $this->active_archive->getCachePage($doc_key);
    }
    /**
     * Adds the array of $pages to the active IndexDocumentBundle.
     *
     * @param array &$pages data to store
     * @param int $visited_urls_count number to add to the count of visited
     *      urls (visited urls is a smaller number than the total count of
     *      objects stored in the index)
     */
    public function addPages(&$pages, $visited_urls_count)
    {
        return $this->active_archive->addPages($pages,
            $visited_urls_count);
    }
    /**
     * Checks if there is enough data in the active partition of the active
     * archive to warrant storing in the dictionary; if so, builds an
     * inverted index for the active partition of the active archive and
     * adds the postings to the dictionary.
     *
     * @param string $taking_too_long_touch name of file to touch if
     *      checking the update takes longer than LOG_TIMEOUT. To prevent
     *      a crawl from stopping because nothing is happening the
     *      file usually supplied is C\SCHEDULES_DIR . "/{$this->channel}-" .
     *      self::crawl_status_file
     */
    public function updateDictionary($taking_too_long_touch = null)
    {
        return $this->active_archive->updateDictionary(
            $taking_too_long_touch);
    }
    /**
     * Given a $site array of information about a web page/document, uses
     * CrawlConstant::URL and CrawlConstant::HASH fields to compute a
     * unique doc id for the array.
     *
     * @param array $site site to compute doc_id for
     */
    public function computeDocId($site)
    {
        return $this->active_archive->computeDocId($site);
    }
    /**
     * Forces the current shard to be saved
     */
    public function forceSave()
    {
        $this->active_archive->forceSave();
    }
    /**
     * The start schedule is the first schedule a queue server makes
     * when a crawl is just started. To facilitate switching between
     * IndexDocumentBundles when doing a crawl with a DoubleIndexBundle,
     * this start schedule is stored in the DoubleIndexBundle; when the
     * IndexDocumentBundles' roles (query and crawl) are swapped,
     * the DoubleIndexBundle copy is used to start the crawl from the
     * beginning again. This method copies the start schedule from the
     * schedule folder to the DoubleIndexBundle at the start of a crawl
     * for later use to do this swapping.
     *
     * @param string $dir_name folder in the bundle where the schedule
     *      should be stored
     * @param int $channel channel that is being used to do the current
     *      double index crawl. Typical yioop instance might have several
     *      ongoing crawls each with a different channel
     */
    public static function setStartSchedule($dir_name, $channel)
    {
        $start_schedule = C\SCHEDULES_DIR . "/$channel-" .
            self::schedule_start_name;
        if (file_exists($dir_name) && is_dir($dir_name) &&
            file_exists($start_schedule)) {
            copy($start_schedule, $dir_name . "/" .
                self::schedule_start_name);
        }
    }
    /**
     * The start schedule is the first schedule a queue server makes
     * when a crawl is just started. To facilitate switching between
     * IndexDocumentBundles when doing a crawl with a DoubleIndexBundle,
     * this start schedule is stored in the DoubleIndexBundle; when the
     * IndexDocumentBundles' roles (query and crawl) are swapped,
     * this method copies the start schedule from the DoubleIndexBundle
     * to the schedule folder to restart the crawl.
     *
     * @param string $dir_name folder in the bundle where the schedule
     *      is stored
     * @param int $channel channel that is being used to do the current
     *      double index crawl. Typical yioop instance might have several
     *      ongoing crawls each with a different channel
     */
    public static function getStartSchedule($dir_name, $channel)
    {
        $start_schedule = C\SCHEDULES_DIR . "/$channel-" .
            self::schedule_start_name;
        if (file_exists($dir_name) && is_dir($dir_name)) {
            copy($dir_name . "/" . self::schedule_start_name,
                $start_schedule);
        }
    }
    /**
     * Gets information about a DoubleIndexBundle out of its status.txt
     * file
     *
     * @param string $dir_name folder name of the DoubleIndexBundle to get
     *      info for
     * @return array containing the name (description) of the
     *      DoubleIndexBundle, the number of items stored in it, and the
     *      number of WebArchive file partitions it uses
     */
    public static function getArchiveInfo($dir_name)
    {
        $info = unserialize(file_get_contents($dir_name . "/status.txt"));
        $swap_count = intval($info['swap_count']);
        $active = $swap_count % 2;
        $inactive = 1 - $active;
        $bundle_name = $dir_name . "/bundle$active";
        $bundle_class_name = C\NS_LIB . "IndexDocumentBundle";
        // legacy bundles (with a summaries subfolder) use the old class
        if (file_exists($bundle_name . "/summaries")) {
            $bundle_class_name = C\NS_LIB . "IndexArchiveBundle";
        }
        $count_info = $bundle_class_name::getArchiveInfo($bundle_name);
        $info['COUNT'] = $count_info['COUNT'] +
            ($count_info['ACTIVE_COUNT'] ?? 0);
        $info['VISITED_URLS_COUNT'] = $count_info['VISITED_URLS_COUNT'];
        $bundle_name = $dir_name . "/bundle$inactive";
        $count_info = $bundle_class_name::getArchiveInfo($bundle_name);
        $info['QUERY_COUNT'] = $count_info['COUNT'] +
            ($count_info['ACTIVE_COUNT'] ?? 0);
        $info['QUERY_VISITED_URLS_COUNT'] =
            $count_info['VISITED_URLS_COUNT'];
        return $info;
    }
    /**
     * Sets the archive info struct for the index archive and web archive
     * bundles associated with this double index bundle. This struct has
     * fields like: DESCRIPTION (serialized store of global parameters of
     * the crawl like seed sites, timestamp, etc), COUNT (num urls seen +
     * pages seen stored for the index archive in use for crawling),
     * VISITED_URLS_COUNT (number of pages seen for the index archive in
     * use for crawling), QUERY_COUNT (num urls seen + pages seen stored
     * for the index archive in use for querying, not crawling),
     * QUERY_VISITED_URLS_COUNT (number of pages seen for the
     * index archive in use for querying, not crawling),
     * NUM_DOCS_PER_PARTITION (how many doc/web archive in bundle).
     *
     * @param string $dir_name folder with archive bundle
     * @param array $info struct with above fields
     */
    public static function setArchiveInfo($dir_name, $info)
    {
        file_put_contents($dir_name . "/status.txt", serialize($info));
    }
    /**
     * Returns the last time the archive info of the bundle was modified.
     *
     * @param string $dir_name folder with archive bundle
     */
    public static function getParamModifiedTime($dir_name)
    {
        $info = unserialize(file_get_contents($dir_name . "/status.txt"));
        $swap_count = intval($info['swap_count']);
        $active = $swap_count % 2;
        $bundle_name = $dir_name . "/bundle" . $active;
        $count_time = IndexDocumentBundle::getParamModifiedTime(
            $bundle_name);
        clearstatcache();
        return max(filemtime($dir_name . "/status.txt"), $count_time);
    }
}