viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2019
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Used for crawlLog and crawlHash
 */
require_once __DIR__.'/Utility.php';
/**
 * Encapsulates a set of web page summaries and an inverted word-index of terms
 * from these summaries which allow one to search for summaries containing a
 * particular word.
 *
 * The basic file structures for an IndexArchiveBundle are:
 * <ol>
 * <li>A WebArchiveBundle for web page summaries.</li>
 * <li>A IndexDictionary containing all the words stored in the bundle.
 * Each word entry in the dictionary contains starting and ending
 * offsets for documents containing that word for some particular IndexShard
 * generation.</li>
 * <li>A set of index shard generations. These generations
 * have names index0, index1,... A shard has word entries, word doc entries
 * and document entries. For more information see the index shard
 * documentation.
 * </li>
 * <li>
 * The file generation.txt keeps track of what is the current generation.
 * A given generation can hold up to $num_docs_per_generation documents
 * (see initGenerationToAdd), after which the next generation begins.
 * </li>
 * </ol>
 *
 *
 * @author Chris Pollett
 */
class IndexArchiveBundle implements CrawlConstants
{
    /**
     * Folder name to use for this IndexArchiveBundle
     * @var string
     */
    public $dir_name;
    /**
     * A short text name for this IndexArchiveBundle
     * @var string
     */
    public $description;
    /**
     * Number of partitions in the summaries WebArchiveBundle
     * @var int
     */
    public $num_partitions_summaries;
    /**
     * Structure with info about the generations of this bundle. The keys
     * used by this class are: ACTIVE (index of the last, writable
     * generation), CURRENT (index of the generation currently selected
     * for reading), and DISK_BASED (whether the current shard should be
     * left on disk rather than fully loaded). This array is what gets
     * serialized to generation.txt.
     * @var array
     */
    public $generation_info;
    /**
     * Number of docs before a new generation is started
     * @var int
     */
    public $num_docs_per_generation;
    /**
     * WebArchiveBundle for web page summaries
     * @var object
     */
    public $summaries;
    /**
     * IndexDictionary for all shards in the IndexArchiveBundle
     * This contains entries of the form (word, num_shards with word,
     * posting list info 0th shard containing the word,
     * posting list info 1st shard containing the word, ...)
     * @var object
     */
    public $dictionary;
    /**
     * Index Shard for current generation inverted word index
     * @var object
     */
    public $current_shard;
    /**
     * What version of index archive bundle this is
     * @var int
     */
    public $version;
    /**
     * Threshold (in bytes of shard file size) beyond which we don't load
     * an old index shard when restarting and instead just advance to a
     * new shard
     */
    const NO_LOAD_SIZE = 50000000;
    /**
     * Makes or initializes an IndexArchiveBundle with the provided parameters
     *
     * If the folder does not exist and the bundle is opened read-only, the
     * constructor bails out early and the object is left uninitialized
     * (no summaries, dictionary, or generation info are set up).
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether to open archive only for reading
     *  or reading and writing
     * @param string $description a text name/serialized info about this
     *  IndexArchiveBundle
     * @param int $num_docs_per_generation the number of pages to be stored
     *  in a single shard
     */
    public function __construct($dir_name, $read_only_archive = true,
        $description = null,
        $num_docs_per_generation = C\NUM_DOCS_PER_GENERATION)
    {
        $this->dir_name = $dir_name;
        $is_dir = is_dir($this->dir_name);
        if (!$is_dir && !$read_only_archive) {
            mkdir($this->dir_name);
            mkdir($this->dir_name . "/posting_doc_shards");
        } else if (!$is_dir) {
            // nothing to read from; return value is ignored by `new`
            return false;
        }
        if (file_exists($this->dir_name . "/generation.txt")) {
            /* generation.txt is written by this class (see below and
               addAdvanceGeneration), so it is treated as trusted input
             */
            $this->generation_info = unserialize(
                file_get_contents($this->dir_name . "/generation.txt"));
        } else if (!$read_only_archive) {
            $this->generation_info['ACTIVE'] = 0;
            file_put_contents($this->dir_name . "/generation.txt",
                serialize($this->generation_info));
        }
        $this->summaries = new WebArchiveBundle($dir_name . "/summaries",
            $read_only_archive, -1, $description);
        if (!$read_only_archive) {
            $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT");
        }
        // the summaries bundle is the source of truth for the description
        $this->description = $this->summaries->description;
        if (isset($this->summaries->version)) {
            $this->version = $this->summaries->version;
        }
        $this->num_docs_per_generation = $num_docs_per_generation;
        $this->dictionary = new IndexDictionary($this->dir_name .
            "/dictionary", $this);
    }
    /**
     * Add the array of $pages to the summaries WebArchiveBundle pages being
     * stored in the partition $generation and the field used
     * to store the resulting offsets given by $offset_field.
     *
     * @param int $generation field used to select partition
     * @param string $offset_field field used to record offsets after storing
     * @param array& $pages data to store
     * @param int $visited_urls_count number to add to the count of visited urls
     *  (visited urls is a smaller number than the total count of objects
     *  stored in the index).
     */
    public function addPages($generation, $offset_field, &$pages,
        $visited_urls_count)
    {
        $this->summaries->setWritePartition($generation);
        $this->summaries->addPages($offset_field, $pages);
        $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
    }
    /**
     * Adds the provided mini inverted index data to the IndexArchiveBundle
     * Expects initGenerationToAdd to be called before, so generation is correct
     *
     * @param object $index_shard a mini inverted index of word_key=>doc data
     *  to add to this IndexArchiveBundle
     */
    public function addIndexData($index_shard)
    {
        crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
        $start_time = microtime(true);
        $this->getActiveShard()->appendIndexShard($index_shard);
        crawlLog("Append Index Shard: Memory usage:".memory_get_usage() .
          " Time: ".(changeInMicrotime($start_time)));
    }
    /**
     * Determines based on its size, if index_shard should be added to
     * the active generation or in a new generation should be started.
     * If so, a new generation is started, the old generation is saved, and
     * the dictionary of the old shard is copied to the bundles dictionary
     * and a log-merge performed if needed
     *
     * A new generation is started when either the active shard would exceed
     * $num_docs_per_generation documents, or current memory usage is above
     * 55% of PHP's memory_limit. The memory test is skipped when no
     * positive limit is known (memory_limit of -1 means unlimited).
     *
     * @param int $add_num_docs number of docs in the shard about to be added
     * @param object $callback object with join function to be
     *     called if process is taking too long
     * @param bool $blocking whether there is an ongoing merge tiers operation
     *      occurring, if so don't do anything and return -1
     * @return int the active generation after the check and possible change has
     *     been performed
     */
    public function initGenerationToAdd($add_num_docs, $callback = null,
        $blocking = false)
    {
        $current_num_docs = $this->getActiveShard()->num_docs;
        crawlLog("Current index shard has ".$current_num_docs." documents.");
        $memory_limit = metricToInt(ini_get("memory_limit"));
        crawlLog("Memory Indexer limit is ".$memory_limit.". Usage is ".
            memory_get_usage());
        $too_many_docs = ($current_num_docs + $add_num_docs >
            $this->num_docs_per_generation);
        /* PHP reports an unlimited memory_limit as -1 and ini_get gives
           false for an unknown option; in both cases the converted limit
           is presumably non-positive, and comparing usage against
           0.55 * limit would then always succeed and force a shard switch
           on every call. Only apply the test to a real, positive limit.
         */
        $low_on_memory = ($memory_limit > 0 &&
            (0.55 * $memory_limit) < memory_get_usage());
        if ($too_many_docs || $low_on_memory) {
            if ($blocking == true) {
                return -1;
            }
            crawlLog("Switching Index Shard...");
            $switch_time = microtime(true);
            // Save current shard dictionary to main dictionary
            $this->forceSave();
            $this->addAdvanceGeneration($callback);
            crawlLog("Switch Index Shard time:".
                changeInMicrotime($switch_time));
        }
        return $this->generation_info['ACTIVE'];
    }
    /**
     * Starts a new generation,  the dictionary of the old shard is copied to
     * the bundles dictionary and a log-merge performed if needed. This
     * function may be called by initGenerationToAdd as well as when resuming
     * a crawl rather than loading the periodic index of save of a too large
     * shard.
     *
     * @param object $callback object with join function to be
     *     called if process is taking too long
     */
    public function addAdvanceGeneration($callback = null)
    {
        $this->addCurrentShardDictionary($callback);
        crawlLog("Resaving active shard without prefix and dictionary.");
        $this->current_shard->saveWithoutDictionary(true);
        crawlLog("..Done resaving active shard.");
        //Set up new shard
        $this->generation_info['ACTIVE']++;
        $this->generation_info['CURRENT'] = $this->generation_info['ACTIVE'];
        $current_index_shard_file = $this->dir_name.
            "/posting_doc_shards/index". $this->generation_info['ACTIVE'];
        $this->current_shard = new IndexShard(
            $current_index_shard_file, $this->generation_info['ACTIVE'],
                $this->num_docs_per_generation);
        // persist the new ACTIVE/CURRENT values so a restart resumes here
        file_put_contents($this->dir_name . "/generation.txt",
            serialize($this->generation_info));
    }
    /**
     * Adds the words from this shard to the dictionary
     * @param object $callback object with join function to be
     *     called if process is taking too long
     */
    public function addCurrentShardDictionary($callback = null)
    {
        $current_index_shard_file = $this->dir_name.
            "/posting_doc_shards/index" . $this->generation_info['ACTIVE'];
        /* want to do the copying of dictionary as files to conserve memory
           in case merge tiers after adding to dictionary
        */
        $this->current_shard = new IndexShard(
            $current_index_shard_file, $this->generation_info['ACTIVE'],
            $this->num_docs_per_generation, true);
        $this->dictionary->addShardDictionary($this->current_shard, $callback);
    }
    /**
     * Sets the current shard to be the active shard (the active shard is
     * what we call the last (highest indexed) shard in the bundle. Then
     * returns a reference to this shard
     * @return object last shard in the bundle
     */
    public function getActiveShard()
    {
        if ($this->setCurrentShard($this->generation_info['ACTIVE'])) {
            // current generation changed, so (lazily) read that shard
            return $this->getCurrentShard();
        } else if (!isset($this->current_shard) ) {
            /* generation did not change but no shard object exists yet:
               make a fresh IndexShard object for the current generation
             */
            $current_index_shard_file = $this->dir_name.
                "/posting_doc_shards/index".
                $this->generation_info['CURRENT'];
            $this->current_shard = new IndexShard($current_index_shard_file,
                $this->generation_info['CURRENT'],
                $this->num_docs_per_generation);
        }
        return $this->current_shard;
    }
    /**
     * Returns the shard which is currently being used to read word-document
     * data from the bundle. If one wants to write data to the bundle use
     * getActiveShard() instead. The point of this method is to allow
     * for lazy reading of the file associated with the shard.
     *
     * @param bool $force_read whether to force no advance generation and
     *      merge dictionary side effects
     * @return object the currently being index shard
     */
    public function getCurrentShard($force_read = false)
    {
        if (!isset($this->current_shard)) {
            if (!isset($this->generation_info['CURRENT'])) {
                $this->generation_info['CURRENT'] =
                    $this->generation_info['ACTIVE'];
            }
            $current_index_shard_file = $this->dir_name .
                "/posting_doc_shards/index".
                $this->generation_info['CURRENT'];
            if (file_exists($current_index_shard_file)) {
                if (!empty($this->generation_info['DISK_BASED'])) {
                    // leave shard on disk; only its header is read here
                    $this->current_shard = new IndexShard(
                        $current_index_shard_file,
                        $this->generation_info['CURRENT'],
                        $this->num_docs_per_generation, true);
                    $this->current_shard->getShardHeader($force_read);
                    $this->current_shard->read_only_from_disk = true;
                } else {
                    if (!$force_read && filesize($current_index_shard_file) >
                        self::NO_LOAD_SIZE) {
                        /* shard file too big to load: fold it into the
                           dictionary and start a new generation instead
                         */
                        $this->addAdvanceGeneration();
                    } else {
                        $this->current_shard =
                            IndexShard::load($current_index_shard_file);
                    }
                }
            } else {
                // no file on disk yet for this generation
                $this->current_shard = new IndexShard($current_index_shard_file,
                    $this->generation_info['CURRENT'],
                    $this->num_docs_per_generation);
            }
        }
        return $this->current_shard;
    }
    /**
     * Sets the current shard to be the $i th shard in the index bundle.
     *
     * If $i is already the current generation, or $i is larger than the
     * active generation, nothing is changed (other than recording
     * $disk_based). Otherwise, the current generation becomes $i and any
     * loaded shard object is discarded so it will be lazily re-read.
     *
     * @param $i which shard to set the current shard to be
     * @param $disk_based whether to read the whole shard in before using or
     *      leave it on disk except for pages need and use memcache
     * @return bool true if the current generation was changed to $i,
     *      false otherwise
     */
    public function setCurrentShard($i, $disk_based = false)
    {
        $this->generation_info['DISK_BASED'] = $disk_based;
        if (isset($this->generation_info['CURRENT']) &&
            isset($this->generation_info['ACTIVE']) &&
            ($i == $this->generation_info['CURRENT'] ||
            $i > $this->generation_info['ACTIVE'])) {
            return false;
        } else {
            $this->generation_info['CURRENT'] = $i;
            unset($this->current_shard);
            return true;
        }
    }
    /**
     * Gets the page out of the summaries WebArchiveBundle with the given
     * offset and generation
     *
     * @param int $offset byte offset in partition of desired page
     * @param int $generation which generation WebArchive to look up in
     *     defaults to the same number as the current shard
     * @return array desired page
     */
    public function getPage($offset, $generation = -1)
    {
        if ($generation == -1 ) {
            $generation = $this->generation_info['CURRENT'];
        }
        return $this->summaries->getPage($offset, $generation);
    }
    /**
     * Forces the current shard to be saved
     */
    public function forceSave()
    {
        $this->getActiveShard()->save(false, true);
    }
    /**
     * Computes the number of occurrences of each of the supplied list of
     * word_keys
     *
     * @param array $word_keys keys to compute counts for
     * @return array associative array of key => count values; null is
     *      returned instead if $word_keys is not a non-empty array
     */
    public function countWordKeys($word_keys)
    {
        $words_array = [];
        if (!is_array($word_keys) || count($word_keys) < 1) {
            return null;
        }
        foreach ($word_keys as $word_key) {
            $tmp = $this->dictionary->getWordInfo($word_key);
            if ($tmp === false) {
                $words_array[$word_key] = 0;
            } else {
                // sum field 3 of each entry returned for the word
                $count = 0;
                foreach ($tmp as $entry) {
                    $count += $entry[3];
                }
                $words_array[$word_key] = $count;
            }
        }
        return $words_array;
    }
    /**
     * Gets the description, count of summaries, and number of partitions of the
     * summaries store in the supplied directory. If the file
     * arc_description.txt exists, this is viewed as a dummy index archive for
     * the sole purpose of allowing conversions of downloaded data such as arc
     * files into Yioop! format.
     *
     * @param string $dir_name path to a directory containing a summaries
     *      WebArchiveBundle
     * @return array summary of the given archive
     */
    public static function getArchiveInfo($dir_name)
    {
        if (file_exists($dir_name."/arc_description.txt")) {
            $crawl = [];
            $info = [];
            // only the first 256 bytes of the file are used as description
            $crawl['DESCRIPTION'] = substr(
                file_get_contents($dir_name."/arc_description.txt"), 0, 256);
            $crawl['ARCFILE'] = true;
            $info['VISITED_URLS_COUNT'] = 0;
            $info['COUNT'] = 0;
            $info['NUM_DOCS_PER_PARTITION'] = 0;
            $info['WRITE_PARTITION'] = 0;
            $info['DESCRIPTION'] = serialize($crawl);
            return $info;
        }
        if (file_exists($dir_name . "/description.txt")) {
            $info = WebArchiveBundle::getArchiveInfo($dir_name);
            if (isset($info['DESCRIPTION'])) {
                return $info;
            }
        }
        return WebArchiveBundle::getArchiveInfo($dir_name . "/summaries");
    }
    /**
     * Sets the archive info (DESCRIPTION, COUNT,
     * NUM_DOCS_PER_PARTITION) for the web archive bundle associated with
     * this bundle. As DESCRIPTION is used to store info about the info
     * bundle this sets the global properties of the info bundle as well.
     *
     * @param string $dir_name folder with archive bundle
     * @param array $info struct with above fields
     */
    public static function setArchiveInfo($dir_name, $info)
    {
        WebArchiveBundle::setArchiveInfo($dir_name . "/summaries", $info);
    }
    /**
     * Returns the last time the archive info of the bundle was modified.
     *
     * @param string $dir_name folder with archive bundle
     * @return mixed whatever WebArchiveBundle::getParamModifiedTime gives
     *      for the bundle's summaries folder (presumably a timestamp)
     */
    public static function getParamModifiedTime($dir_name)
    {
        return WebArchiveBundle::getParamModifiedTime($dir_name . "/summaries");
    }
}