viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * For crawlHash
 */
require_once __DIR__ . "/Utility.php";
/**
 * Class used to manage open IndexArchiveBundle's while performing
 * a query. Ensures an easy place to obtain references to these bundles
 * and ensures only one object per bundle is instantiated in a Singleton-esque
 * way.
 *
 * @author Chris Pollett
 */
class IndexManager implements CrawlConstants
{
    /**
     * Open IndexArchiveBundle's managed by this manager
     * @var array
     */
    public static $indexes = [];
    /**
     * List of entries of the form name of bundle => time when cached
     * @var array
     */
    public static $index_times = [];
    /**
     * Max number of IndexArchiveBundles that can be cached
     */
    const INDEX_CACHE_SIZE = 1000;
    /**
     * Returns a reference to the managed copy of an IndexArchiveBundle object
     * with a given timestamp or feed (for handling media feeds)
     *
     * The name is resolved in this order: the feed index ("feed" or
     * FEED_CRAWL_TIME), a numeric timestamp with an existing single index
     * folder in the cache directory, a numeric timestamp of a double index,
     * and finally a path to a bundle folder. A cached object is reused
     * unless it is the feed index, the script runs from the command line,
     * and the cached copy is older than C\MIN_QUERY_CACHE_TIME seconds.
     *
     * @param string $index_name timestamp of desired IndexArchiveBundle
     * @return object the desired IndexArchiveBundle reference, or false if
     *  $index_name does not resolve to an existing folder
     */
    public static function getIndex($index_name)
    {
        $index_name = trim($index_name ?? ""); //trim to fix postgres quirkiness
        $cache_dir = C\CACHE_DIR . '/';
        $index_archive_name = "";
        $is_direct_path = false;
        if ($index_name == "feed" || $index_name == self::FEED_CRAWL_TIME) {
            $index_archive_name = self::feed_index_data_base_name;
            $index_name = "feed";
        } else if (is_numeric($index_name) && file_exists($cache_dir .
            self::index_data_base_name . $index_name)) {
            $index_archive_name = self::index_data_base_name . $index_name;
        } else {
            if (is_numeric($index_name)) {
                $index_name = $cache_dir . self::double_index_base_name .
                    $index_name;
            }
            if (!file_exists($index_name)) {
                return false;
            }
            $is_direct_path = true;
        }
        /* Consult the cache before constructing anything so that at most
           one bundle object per name gets instantiated.
         */
        if (!empty(self::$indexes[$index_name])) {
            $is_stale = !empty(self::$index_times[$index_name]) &&
                ($index_name == 'feed' && php_sapi_name() == 'cli') &&
                (time() - self::$index_times[$index_name]) >
                C\MIN_QUERY_CACHE_TIME;
            if (!$is_stale) {
                return self::$indexes[$index_name];
            }
        }
        if ($is_direct_path) {
            $bundle_dir = $index_name;
            if (file_exists("$index_name/bundle0")) {
                //repeating (double) index: serve the non-active archive
                $serve_archive = self::servedArchive(
                    "$index_name/status.txt");
                $bundle_dir .= "/bundle$serve_archive";
                $is_old = file_exists($index_name . "/bundle0/summaries");
            } else {
                $is_old = file_exists($index_name . "/summaries");
            }
        } else if (file_exists($cache_dir . $index_archive_name)) {
            $bundle_dir = $cache_dir . $index_archive_name;
            $is_old = file_exists($bundle_dir . "/summaries");
        } else {
            /* No single index folder with this name, so fall back to the
               double index of the same name. A trailing -digit on the name
               selects which of its archives to serve.
             */
            $use_name = $index_name;
            $serve_archive = -1;
            if (preg_match("/\-\d$/", $index_name)) {
                $serve_archive = substr($index_name, -1);
                $use_name = substr($index_name, 0, -2);
            }
            $double_index_dir = $cache_dir . self::double_index_base_name .
                $use_name;
            if ($serve_archive < 0) {
                $serve_archive = self::servedArchive(
                    $double_index_dir . "/status.txt");
            }
            /* The format check has to look in the double index folder;
               the single index folder is known not to exist here.
             */
            $is_old = file_exists($double_index_dir . "/bundle0/summaries")
                || file_exists($double_index_dir . "/bundle1/summaries");
            $bundle_dir = $double_index_dir . "/bundle$serve_archive";
        }
        $bundle_class_name = C\NS_LIB .
            (($is_old) ? "IndexArchiveBundle" : "IndexDocumentBundle");
        $index = new $bundle_class_name($bundle_dir);
        self::$indexes[$index_name] = $index;
        if ($is_old) {
            $index->setCurrentShard(0, true);
        }
        self::$index_times[$index_name] = time();
        self::pruneCache($index_name);
        return $index;
    }
    /**
     * Determines which of the two archives of a double index should be
     * served, based on the swap_count stored in the index's status file.
     * The archive returned is the one that is not the active one.
     *
     * @param string $status_file path to the status.txt of a double index
     * @return int 0 or 1; 0 if the status file is missing
     */
    private static function servedArchive($status_file)
    {
        if (!file_exists($status_file)) {
            return 0;
        }
        $status = unserialize(file_get_contents($status_file));
        $active_archive = (empty($status["swap_count"])) ? 1 :
            $status["swap_count"] % 2;
        return 1 - $active_archive;
    }
    /**
     * If more than INDEX_CACHE_SIZE indexes are cached, discards the
     * oldest third of them, judged by the time they were cached.
     *
     * @param string $keep_name cache key that must survive pruning even
     *  if its timestamp ties with the entries being discarded
     */
    private static function pruneCache($keep_name)
    {
        if (count(self::$indexes) <= self::INDEX_CACHE_SIZE) {
            return;
        }
        $times = array_values(self::$index_times);
        sort($times);
        $oldest_third = $times[intdiv(count($times), 3)];
        foreach (self::$index_times as $name => $time) {
            //numeric keys come back as ints, so compare as strings
            if ($time <= $oldest_third && (string)$name !== $keep_name) {
                unset(self::$index_times[$name], self::$indexes[$name]);
            }
        }
    }
    /**
     * Clears the static variables in which caches of read in indexes
     * and dictionary info is stored.
     */
    public static function clearCache()
    {
        self::$indexes = [];
        self::$index_times = [];
    }
    /**
     * Returns the version of the index, so that Yioop can determine
     * how to do word lookup. The only major change to the format was
     * when word_id's went from 8 to 20 bytes which happened around Unix
     * time 1369754208.
     *
     * @param string $index_name unix timestamp of index; a leading '-' is
     *  ignored. A non-numeric value is treated as a path to a bundle folder
     * @return int 0 - if the original format for Yioop indexes; 1 -if 20 byte
     *  word_id format; otherwise the version recorded by the index itself,
     *  or C\DEFAULT_CRAWL_FORMAT if it records none
     */
    public static function getVersion($index_name)
    {
        $index_name = (string) $index_name;
        $index_name = (empty($index_name) || $index_name[0] != '-') ?
            $index_name : substr($index_name, 1);
        $index_name_int = intval($index_name);
        if (!is_numeric($index_name)) {
            $description_file = $index_name . "/summaries/description.txt";
            if (file_exists($description_file)) {
                $description = unserialize(
                    file_get_contents($description_file));
                if (!empty($description['DESCRIPTION'])) {
                    $description = unserialize($description['DESCRIPTION']);
                }
                if (!empty($description[self::CRAWL_TIME])) {
                    $crawl_time = intval($description[self::CRAWL_TIME]);
                    if ($crawl_time < C\VERSION_0_TIMESTAMP &&
                        $crawl_time != self::FEED_CRAWL_TIME) {
                        return 0;
                    }
                }
            }
        } else if ($index_name_int != self::FEED_CRAWL_TIME &&
            $index_name_int < C\VERSION_0_TIMESTAMP) {
            return 0;
        }
        $tmp_index = self::getIndex($index_name);
        if (isset($tmp_index->version)) {
            return $tmp_index->version;
        } else if (isset($tmp_index->archive_info['VERSION'])) {
            return $tmp_index->archive_info['VERSION'];
        }
        return C\DEFAULT_CRAWL_FORMAT;
    }
    /**
     * Gets an array of posting list positions for each shard in the
     * bundle $index_name for the word id $term_id
     *
     * If the bundle object has its own getWordInfo method, the lookup is
     * delegated to it and its result returned as is. Otherwise the bundle's
     * dictionary is consulted and, when the active generation is not yet
     * covered by the dictionary, the active shard file is consulted too.
     *
     * @param string $index_name bundle to look for $term_id in
     * @param string $term_id id of phrase or word to look up in bundle
     *      dictionary
     * @param int $threshold after the number of results exceeds this amount
     *      stop looking for more dictionary entries.
     * @param int $start_generation what generation in the index to start
     *      finding occurrence of phrase from
     * @param int $num_distinct_generations from $start_generation how
     *      many generation to search forward to
     * @param bool $with_remaining_total whether to total number of
     *      postings found as well or not
     * @return array either [total, sequence of four tuples]
     *      or sequence of four tuples:
     *      (index_shard generation, posting_list_offset, length, exact id
     *      that match $term_id)
     */
    public static function getWordInfo($index_name, $term_id, $threshold = -1,
        $start_generation = -1, $num_distinct_generations = -1,
        $with_remaining_total = false)
    {
        $index = self::getIndex($index_name);
        $pre_info = [];
        if (!empty($index) && method_exists($index, "getWordInfo")) {
            $start_generation = ($start_generation < 0) ? 0 :
                $start_generation;
            return $index->getWordInfo($term_id, $threshold,
                $start_generation, $num_distinct_generations,
                $with_remaining_total);
        } else if (!empty($index->dictionary)) {
            $pre_info = $index->dictionary->getWordInfo($term_id, true,
                $threshold, $start_generation, $num_distinct_generations,
                true);
        }
        $last_desired_generation = $start_generation +
            $num_distinct_generations;
        if (isset($index->generation_info['ACTIVE'])) {
            $active_generation = $index->generation_info['ACTIVE'];
            if ((empty($index->generation_info['LAST_DICTIONARY_SHARD']) ||
                $index->generation_info['LAST_DICTIONARY_SHARD'] <
                $active_generation) &&
                ($active_generation < $last_desired_generation ||
                $last_desired_generation < 0)) {
                $active_shard_file = $index->dir_name .
                    "/posting_doc_shards/index" . $active_generation;
                if (file_exists($active_shard_file)) {
                    if (!empty($index->non_merged_shard) &&
                        !empty($index->non_merged_generation) &&
                        $index->non_merged_generation ==
                        $active_generation) {
                        //reuse the shard object remembered on the bundle
                        $active_shard = $index->non_merged_shard;
                    } else {
                        $active_shard = new IndexShard($active_shard_file, 0,
                            C\NUM_DOCS_PER_PARTITION, true);
                        $index->non_merged_shard = $active_shard;
                        $index->non_merged_generation = $active_generation;
                    }
                    $active_info = $active_shard->getWordInfo($term_id,
                        true);
                    if (is_array($active_info)) {
                        if (empty($pre_info)) {
                            $pre_info[0] = 0;
                            $pre_info[1] = [];
                        }
                        $pre_info[1][] = [$active_generation,
                            $active_info[0], $active_info[1],
                            $active_info[2], $active_info[3]];
                        $pre_info[0] += $active_info[2];
                    }
                }
            }
        }
        if (!empty($pre_info[1])) {
            list($total, $info) = $pre_info;
        } else {
            $total = 0;
            $info = [];
        }
        return ($with_remaining_total) ? [$total, $info] : $info;
    }
    /**
     * Returns the number of document that a given term or phrase appears in
     * in the given index where we discount later generation -- those with
     * lower document rank more
     *
     * Each generation's document count is divided by the larger of
     * (generation + 1) and the 1-based position of that generation in the
     * lookup results. Non-zero results are remembered in a per-request
     * static cache.
     *
     * @param string $term what to look up in the indexes dictionary
     *  no mask is used for this look up
     * @param string $index_name index to look up term or phrase in
     * @return float discounted number of documents (0.0 if term not found)
     */
    public static function discountedNumDocsTerm($term, $index_name)
    {
        static $num_docs_cache = [];
        if (isset($num_docs_cache[$index_name][$term])) {
            return $num_docs_cache[$index_name][$term];
        }
        $version = self::getVersion($index_name);
        $term_id = ($version > 2) ? canonicalTerm($term) :
            crawlHashWord($term, true);
        $word_info = self::getWordInfo($index_name, $term_id, -1, 0,
            C\NUM_DISTINCT_GENERATIONS);
        if ($version >= 3 && !empty($word_info)) {
            //a result without a ROWS entry is treated as no results
            $word_info = $word_info['ROWS'] ?? [];
        }
        if (empty($word_info)) {
            return 0.0;
        }
        $total = 0.0;
        $i = 1;
        foreach ($word_info as $generation_info) {
            if ($version < 3) {
                list($generation, , , $num_docs) = $generation_info;
            } else {
                $generation = $generation_info['PARTITION'];
                $num_docs = $generation_info['NUM_DOCS'];
            }
            $discount = max($generation + 1, $i++);
            $total += $num_docs / $discount;
        }
        //keep the memo table bounded both across and within indexes
        if (count($num_docs_cache) > 1000) {
            $num_docs_cache = [];
        }
        if (!empty($num_docs_cache[$index_name]) &&
            count($num_docs_cache[$index_name]) > 10000) {
            $num_docs_cache[$index_name] = [];
        }
        $num_docs_cache[$index_name][$term] = $total;
        return $total;
    }
}