viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2020
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Used for crawlLog, crawlHash, and garbageCollect
 */
require_once __DIR__ . '/Utility.php';
/**
 * Encapsulates a set of web page summaries and an inverted word-index of terms
 * from these summaries which allow one to search for summaries containing a
 * particular word.
 *
 * The basic file structures for an IndexArchiveBundle are:
 * <ol>
 * <li>A WebArchiveBundle for web page summaries (stored in the summaries
 * subfolder).</li>
 * <li>A IndexDictionary containing all the words stored in the bundle
 * (stored in the dictionary subfolder).
 * Each word entry in the dictionary contains starting and ending
 * offsets for documents containing that word for some particular IndexShard
 * generation.</li>
 * <li>A set of index shard generations. These generations
 * have names index0, index1,... and live in the posting_doc_shards
 * subfolder. A shard has word entries, word doc entries
 * and document entries. For more information see the index shard
 * documentation.
 * </li>
 * <li>
 * The file generation.txt keeps track of what is the current generation.
 * A given generation is advanced once adding more documents would exceed
 * num_docs_per_generation, once its shard file grows beyond
 * FORCE_ADVANCE_SIZE, or once memory usage gets too close to the
 * configured memory limit. After which the next generation begins.
 * </li>
 * </ol>
 *
 * @author Chris Pollett
 */
class IndexArchiveBundle implements CrawlConstants
{
    /**
     * Folder name to use for this IndexArchiveBundle
     * @var string
     */
    public $dir_name;
    /**
     * A short text name for this IndexArchiveBundle
     * @var string
     */
    public $description;
    /**
     * Number of partitions in the summaries WebArchiveBundle
     * @var int
     */
    public $num_partitions_summaries;
    /**
     * Structure containing info about the generations of this bundle.
     * Fields used by this class are: ACTIVE (index of the last, writable
     * generation), CURRENT (index of the generation the current shard
     * refers to), LAST_DICTIONARY_SHARD (last generation whose words were
     * added to the dictionary), and DISK_BASED (whether the current shard
     * should be left on disk rather than loaded completely).
     * @var array
     */
    public $generation_info;
    /**
     * Number of docs before a new generation is started
     * @var int
     */
    public $num_docs_per_generation;
    /**
     * WebArchiveBundle for web page summaries
     * @var object
     */
    public $summaries;
    /**
     * IndexDictionary for all shards in the IndexArchiveBundle
     * This contains entries of the form (word, num_shards with word,
     * posting list info 0th shard containing the word,
     * posting list info 1st shard containing the word, ...)
     * @var object
     */
    public $dictionary;
    /**
     * Index Shard for current generation inverted word index
     * @var object
     */
    public $current_shard;
    /**
     * What version of index archive bundle this is
     * @var int
     */
    public $version;
    /**
     * Threshold beyond which we don't load old index shard when
     * restarting and instead just advance to a new shard
     */
    const NO_LOAD_SIZE = 50000000;
    /**
     * Threshold index shard beyond which we force the generation to advance
     */
    const FORCE_ADVANCE_SIZE = 120000000;
    /**
     * Makes or initializes an IndexArchiveBundle with the provided parameters
     *
     * If the folder does not exist and the bundle is opened read only,
     * initialization stops early and the object is left without summaries,
     * dictionary, or generation info.
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether to open archive only for reading
     *  or reading and writing
     * @param string $description a text name/serialized info about this
     *  IndexArchiveBundle
     * @param int $num_docs_per_generation the number of pages to be stored
     *  in a single shard
     */
    public function __construct($dir_name, $read_only_archive = true,
        $description = null,
        $num_docs_per_generation = C\NUM_DOCS_PER_GENERATION)
    {
        $this->dir_name = $dir_name;
        $is_dir = is_dir($this->dir_name);
        if (!$is_dir && !$read_only_archive) {
            mkdir($this->dir_name);
            mkdir($this->dir_name . "/posting_doc_shards");
        } else if (!$is_dir) {
            /* the value returned from a constructor is discarded by new,
               so this only stops any further initialization
             */
            return false;
        }
        if (file_exists($this->dir_name . "/generation.txt")) {
            $this->generation_info = unserialize(
                file_get_contents($this->dir_name . "/generation.txt"));
        } else if (!$read_only_archive) {
            $this->generation_info['ACTIVE'] = 0;
            $this->generation_info['LAST_DICTIONARY_SHARD'] = -1;
            file_put_contents($this->dir_name . "/generation.txt",
                serialize($this->generation_info));
        }
        $this->summaries = new WebArchiveBundle($dir_name . "/summaries",
            $read_only_archive, -1, $description);
        if (!$read_only_archive) {
            $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT");
        }
        $this->description = $this->summaries->description;
        if (isset($this->summaries->version)) {
            $this->version = $this->summaries->version;
        }
        $this->num_docs_per_generation = $num_docs_per_generation;
        $this->dictionary = new IndexDictionary($this->dir_name .
            "/dictionary", $this);
    }
    /**
     * Add the array of $pages to the summaries WebArchiveBundle pages being
     * stored in the partition $generation and the field used
     * to store the resulting offsets given by $offset_field.
     *
     * @param int $generation field used to select partition
     * @param string $offset_field field used to record offsets after storing
     * @param array &$pages data to store
     * @param int $visited_urls_count number to add to the count of visited urls
     *      (visited urls is a smaller number than the total count of objects
     *      stored in the index).
     */
    public function addPages($generation, $offset_field, &$pages,
        $visited_urls_count)
    {
        $this->summaries->setWritePartition($generation);
        $this->summaries->addPages($offset_field, $pages);
        $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
    }
    /**
     * Adds the provided mini inverted index data to the IndexArchiveBundle
     * Expects initGenerationToAdd to be called before, so generation is correct
     *
     * @param object $index_shard a mini inverted index of word_key=>doc data
     *      to add to this IndexArchiveBundle
     */
    public function addIndexData($index_shard)
    {
        crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
        $start_time = microtime(true);
        $this->getActiveShard()->appendIndexShard($index_shard);
        crawlLog("Append Index Shard: Memory usage:" . memory_get_usage() .
            " Time: ".(changeInMicrotime($start_time)));
    }
    /**
     * Determines based on its size, if index_shard should be added to
     * the active generation or in a new generation should be started.
     * If so, a new generation is started, the old generation is saved, and
     * the dictionary of the old shard is copied to the bundles dictionary
     * and a log-merge performed if needed
     *
     * @param int $add_num_docs number of docs in the shard about to be added
     * @param object $callback object with join function to be
     *     called if process is taking too long
     * @param bool $blocking whether there is an ongoing merge tiers operation
     *      occurring, if so don't do anything and return -1
     * @return int the active generation after the check and possible change has
     *      been performed, or -1 if a switch was needed but $blocking was set
     */
    public function initGenerationToAdd($add_num_docs, $callback = null,
        $blocking = false)
    {
        $active_shard = $this->getActiveShard();
        $active_file_name = $this->dir_name. "/posting_doc_shards/index" .
            $this->generation_info['ACTIVE'];
        $current_num_docs = (empty($active_shard->num_docs)) ? 0 :
            $active_shard->num_docs;
        crawlLog("Current index shard has " . $current_num_docs .
            " documents.");
        $memory_limit = metricToInt(ini_get("memory_limit"));
        $before_usage = memory_get_usage();
        crawlLog("Indexer Memory limit is " . $memory_limit . ". Usage is ".
            $before_usage);
        $too_many_docs = $current_num_docs + $add_num_docs >
            $this->num_docs_per_generation;
        $shard_size_too_big = (file_exists($active_file_name) &&
            filesize($active_file_name) > self::FORCE_ADVANCE_SIZE);
        /* A memory_limit of -1 means PHP imposes no limit. Without the
           positivity guard the threshold below would not be positive, so
           the test would always succeed and every call would force a
           shard switch.
         */
        $too_close_to_memory_limit = $memory_limit > 0 &&
            $before_usage > C\MEMORY_FILL_FACTOR * $memory_limit;
        if ($too_many_docs || $shard_size_too_big ||
            $too_close_to_memory_limit) {
            if ($blocking == true) {
                return -1;
            }
            crawlLog("Switching Index Shard...");
            if ($too_many_docs) {
                crawlLog("...because too many docs in shard ".
                    "($current_num_docs).");
            }
            if ($shard_size_too_big) {
                crawlLog("...because shard size uses too much memory (".
                    filesize($active_file_name) . ").");
            }
            if ($too_close_to_memory_limit) {
                crawlLog("...because too close to overall memory limit (".
                    $before_usage . ").");
            }
            $switch_time = microtime(true);
            // Save current shard dictionary to main dictionary
            $this->forceSave();
            $this->addAdvanceGeneration($callback);
            $num_freed = garbageCollect();
            crawlLog("Indexer force running garbage collector after generation".
                " advance. This freed " . $num_freed . " bytes.");
            $after_usage = memory_get_usage();
            crawlLog("Indexer after switch memory usage: $after_usage");
            crawlLog("Switch Index Shard time:".
                changeInMicrotime($switch_time));
        }
        return $this->generation_info['ACTIVE'];
    }
    /**
     * Starts a new generation, the dictionary of the old shard is copied to
     * the bundles dictionary and a log-merge performed if needed. This
     * function may be called by initGenerationToAdd as well as when resuming
     * a crawl rather than loading the periodic index of save of a too large
     * shard.
     *
     * @param object $callback object with join function to be
     *     called if process is taking too long
     */
    public function addAdvanceGeneration($callback = null)
    {
        $this->addCurrentShardDictionary($callback);
        //this saves space, but makes it harder to recover from indexing errors
        if (C\nsdefined("RESAVE_SHARDS_WITHOUT_DICTIONARY") &&
            C\RESAVE_SHARDS_WITHOUT_DICTIONARY) {
            crawlLog("Resaving active shard without prefix and dictionary.");
            $this->current_shard->saveWithoutDictionary(true);
            // only report completion when a resave was actually carried out
            crawlLog("..Done resaving active shard.");
        }
        //Set up new shard
        $this->generation_info['ACTIVE']++;
        $this->generation_info['CURRENT'] = $this->generation_info['ACTIVE'];
        $current_index_shard_file = $this->dir_name.
            "/posting_doc_shards/index" . $this->generation_info['ACTIVE'];
        $this->current_shard = new IndexShard(
            $current_index_shard_file, $this->generation_info['ACTIVE'],
            $this->num_docs_per_generation);
        file_put_contents($this->dir_name . "/generation.txt",
            serialize($this->generation_info));
    }
    /**
     * Adds the words from this shard to the dictionary
     * @param object $callback object with join function to be
     *     called if process is taking too long
     */
    public function addCurrentShardDictionary($callback = null)
    {
        $current_index_shard_file = $this->dir_name.
            "/posting_doc_shards/index" . $this->generation_info['ACTIVE'];
        /* want to do the copying of dictionary as files to conserve memory
           in case merge tiers after adding to dictionary
         */
        $this->current_shard = new IndexShard(
            $current_index_shard_file, $this->generation_info['ACTIVE'],
            $this->num_docs_per_generation, true);
        $this->dictionary->addShardDictionary($this->current_shard, $callback);
        $this->generation_info['LAST_DICTIONARY_SHARD'] =
            $this->generation_info['ACTIVE'];
    }
    /**
     * Sets the current shard to be the active shard (the active shard is
     * what we call the last (highest indexed) shard in the bundle. Then
     * returns a reference to this shard
     * @return object last shard in the bundle
     */
    public function getActiveShard()
    {
        if ($this->setCurrentShard($this->generation_info['ACTIVE'])) {
            return $this->getCurrentShard();
        } else if (!isset($this->current_shard) ) {
            /* setCurrentShard returned false, so CURRENT was already set
               but no shard object has been created for it yet
             */
            $current_index_shard_file = $this->dir_name.
                "/posting_doc_shards/index".
                $this->generation_info['CURRENT'];
            $this->current_shard = new IndexShard($current_index_shard_file,
                $this->generation_info['CURRENT'],
                $this->num_docs_per_generation);
        }
        return $this->current_shard;
    }
    /**
     * Returns the shard which is currently being used to read word-document
     * data from the bundle. If one wants to write data to the bundle use
     * getActiveShard() instead. The point of this method is to allow
     * for lazy reading of the file associated with the shard.
     *
     * @param bool $force_read whether to force no advance generation and
     *      merge dictionary side effects
     * @return object the currently being index shard
     */
    public function getCurrentShard($force_read = false)
    {
        if (!isset($this->current_shard)) {
            if (!isset($this->generation_info['CURRENT'])) {
                $this->generation_info['CURRENT'] =
                    $this->generation_info['ACTIVE'];
            }
            $current_index_shard_file = $this->dir_name .
                "/posting_doc_shards/index".
                $this->generation_info['CURRENT'];
            if (file_exists($current_index_shard_file)) {
                if (!empty($this->generation_info['DISK_BASED'])) {
                    $this->current_shard = new IndexShard(
                        $current_index_shard_file,
                        $this->generation_info['CURRENT'],
                        $this->num_docs_per_generation, true);
                    $this->current_shard->getShardHeader($force_read);
                    $this->current_shard->read_only_from_disk = true;
                } else {
                    if (!$force_read && filesize($current_index_shard_file) >
                        self::NO_LOAD_SIZE) {
                        /* shard too big to load: advance instead, which
                           also sets $this->current_shard to the new shard
                         */
                        $this->addAdvanceGeneration();
                    } else {
                        $this->current_shard =
                            IndexShard::load($current_index_shard_file);
                    }
                }
            } else {
                $this->current_shard = new IndexShard($current_index_shard_file,
                    $this->generation_info['CURRENT'],
                    $this->num_docs_per_generation);
            }
        }
        return $this->current_shard;
    }
    /**
     * Sets the current shard to be the $i th shard in the index bundle.
     *
     * @param $i which shard to set the current shard to be
     * @param $disk_based whether to read the whole shard in before using or
     *      leave it on disk except for pages need
     * @return bool false if $i is already the current shard or is beyond
     *      the active shard (in which case the current shard is left
     *      unchanged); true if the current shard was switched to $i
     */
    public function setCurrentShard($i, $disk_based = false)
    {
        $this->generation_info['DISK_BASED'] = $disk_based;
        if (isset($this->generation_info['CURRENT']) &&
            isset($this->generation_info['ACTIVE']) &&
            ($i == $this->generation_info['CURRENT'] ||
            $i > $this->generation_info['ACTIVE'])) {
            return false;
        } else {
            $this->generation_info['CURRENT'] = $i;
            // dropped so that getCurrentShard lazily creates/loads shard $i
            unset($this->current_shard);
            return true;
        }
    }
    /**
     * Gets the page out of the summaries WebArchiveBundle with the given
     * offset and generation
     *
     * @param int $offset byte offset in partition of desired page
     * @param int $generation which generation WebArchive to look up in
     *     defaults to the same number as the current shard
     * @return array desired page
     */
    public function getPage($offset, $generation = -1)
    {
        if ($generation == -1 ) {
            /* CURRENT is only set once a shard has been selected; fall back
               to ACTIVE in the same way getCurrentShard does
             */
            $generation = isset($this->generation_info['CURRENT']) ?
                $this->generation_info['CURRENT'] :
                $this->generation_info['ACTIVE'];
        }
        return $this->summaries->getPage($offset, $generation);
    }
    /**
     * Forces the current shard to be saved
     */
    public function forceSave()
    {
        $this->getActiveShard()->save(false, true);
    }
    /**
     * Used when a crawl stops to perform final dictionary operations
     * to produce a working stand-alone index.
     */
    public function stopIndexingBundle()
    {
        $this->forceSave();
        $this->addAdvanceGeneration();
        $this->dictionary->mergeAllTiers();
    }
    /**
     * Computes the number of occurrences of each of the supplied list of
     * word_keys
     *
     * @param array $word_keys keys to compute counts for
     * @return array|null associative array of key => count values, or null
     *      if $word_keys is not a non-empty array
     */
    public function countWordKeys($word_keys)
    {
        $words_array = [];
        if (!is_array($word_keys) || count($word_keys) < 1) {
            return null;
        }
        foreach ($word_keys as $word_key) {
            $tmp = $this->dictionary->getWordInfo($word_key);
            if ($tmp === false) {
                $words_array[$word_key] = 0;
            } else {
                $count = 0;
                // component 3 of each dictionary entry is summed as its count
                foreach ($tmp as $entry) {
                    $count += $entry[3];
                }
                $words_array[$word_key] = $count;
            }
        }
        return $words_array;
    }
    /**
     * Gets the description, count of summaries, and number of partitions of the
     * summaries store in the supplied directory. If the file
     * arc_description.txt exists, this is viewed as a dummy index archive for
     * the sole purpose of allowing conversions of downloaded data such as arc
     * files into Yioop! format.
     *
     * @param string $dir_name path to a directory containing a summaries
     *      WebArchiveBundle
     * @return array summary of the given archive
     */
    public static function getArchiveInfo($dir_name)
    {
        if (file_exists($dir_name . "/arc_description.txt")) {
            $crawl = [];
            $info = [];
            // only the first 256 bytes of the file are used as description
            $crawl['DESCRIPTION'] = substr(
                file_get_contents($dir_name . "/arc_description.txt"), 0, 256);
            $crawl['ARCFILE'] = true;
            $info['VISITED_URLS_COUNT'] = 0;
            $info['COUNT'] = 0;
            $info['NUM_DOCS_PER_PARTITION'] = 0;
            $info['WRITE_PARTITION'] = 0;
            $info['DESCRIPTION'] = serialize($crawl);
            return $info;
        }
        if (file_exists($dir_name . "/description.txt")) {
            $info = WebArchiveBundle::getArchiveInfo($dir_name);
            if (isset($info['DESCRIPTION'])) {
                return $info;
            }
        }
        return WebArchiveBundle::getArchiveInfo($dir_name . "/summaries");
    }
    /**
     * Sets the archive info struct for the web archive bundle associated with
     * this bundle. This struct has fields like: DESCRIPTION
     * (serialized store of global parameters of the crawl like seed sites,
     * timestamp, etc), COUNT (num urls seen + pages seen stored),
     * VISITED_URLS_COUNT (number of pages seen while crawling),
     * NUM_DOCS_PER_PARTITION (how many doc/web archive in bundle).
     *
     * @param string $dir_name folder with archive bundle
     * @param array $info struct with above fields
     */
    public static function setArchiveInfo($dir_name, $info)
    {
        WebArchiveBundle::setArchiveInfo($dir_name . "/summaries", $info);
    }
    /**
     * Returns the last time the archive info of the bundle was modified.
     *
     * @param string $dir_name folder with archive bundle
     * @return mixed whatever WebArchiveBundle::getParamModifiedTime returns
     *      for the summaries subfolder of $dir_name
     */
    public static function getParamModifiedTime($dir_name)
    {
        return WebArchiveBundle::getParamModifiedTime($dir_name . "/summaries");
    }
}