viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Last commit for src/library/IndexManager.php: 2af2b1d2598762884bdeb561402a1cf6ca9acaaa

Take 2 on last

Chris Pollett [2024-01-23 00:Jan:rd]
Take 2 on last
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 * Copyright (C) 2009 - 2024  Chris Pollett
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 * @author Chris Pollett
 * @license GPL3
 * @link
 * @copyright 2009 - 2024
 * @filesource
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\models\ParallelModel;

 * For crawlHash
require_once __DIR__ . "/Utility.php";
 * Class used to manage open IndexArchiveBundles while performing
 * a query. Provides a single place to obtain references to these bundles
 * and ensures only one object per bundle is instantiated, in a
 * Singleton-esque way.
 * @author Chris Pollett
class IndexManager implements CrawlConstants
     * Open IndexArchiveBundle's managed by this manager
     * @var array
    public static $indexes = [];
     * List of entries of the form name of bundle => time when cached
     * @var array
    public static $index_times = [];
     * List of entries of the form name of url => doc_map info when cached
     * @var array
    public static $urls_cache = [];
     * Max number of IndexArchiveBundles that can be cached
    const INDEX_CACHE_SIZE = 1000;
     * Max number of URLs to be cached for most recent version of a page lookup
    const URLS_CACHE_SIZE = 1000;
     * Max number of Word Info items that can be cached
    const INFO_CACHE_SIZE = 1000;
     * Returns a reference to the managed copy of an IndexArchiveBundle object
     * with a given timestamp or feed (for handling media feeds)
     * @param string $index_name timestamp of desired IndexArchiveBundle
     * @return object the desired IndexArchiveBundle reference
    public static function getIndex($index_name)
        $index_name = trim($index_name ?? ""); //trim to fix postgres quirkiness
        $cache_dir = C\CACHE_DIR . '/';
        $index_archive_full_path = $cache_dir . self::index_data_base_name .
        $handled = false;
        if ($index_name == "feed" || $index_name == self::FEED_CRAWL_TIME) {
            $index_archive_name = self::feed_index_data_base_name;
            $index_name = "feed";
            $handled = true;
        } else if (is_numeric($index_name) &&
            file_exists($index_archive_full_path)) {
            $index_archive_name = self::index_data_base_name . $index_name;
            $handled = true;
        if (!$handled && is_numeric($index_name) ) {
            $index_name = $cache_dir . self::double_index_base_name .
        if (!$handled && file_exists($index_name) ) {
            $is_repeating = file_exists("$index_name/bundle0");
            $serve_archive = "0";
            if ($is_repeating) {
                $status_file = "$index_name/status.txt";
                if (file_exists($status_file)) {
                    $status = unserialize(file_get_contents($status_file));
                    $active_archive = (empty($status["swap_count"])) ? 1 :
                        $status["swap_count"] % 2;
                    $serve_archive = 1 - $active_archive;
            $sub_folder = ($is_repeating) ? "/bundle$serve_archive" : "";
            $is_old = ($is_repeating) ? (file_exists($index_name.
                "/bundle0/summaries")) : (file_exists($index_name .
            $bundle_class_name = ($is_old) ? C\NS_LIB . "IndexArchiveBundle"
                : C\NS_LIB . "IndexDocumentBundle";
            $tmp = new $bundle_class_name($index_name . $sub_folder);
        } else if (!$handled) {
            return false;
        if (empty(self::$indexes[$index_name]) ||
            (!empty(self::$index_times[$index_name]) &&
            ($index_name == 'feed' && php_sapi_name() == 'cli') &&
            (time() - self::$index_times[$index_name])
            > C\MIN_QUERY_CACHE_TIME) ) {
            if (!isset($tmp)) {
                $index_filename = $cache_dir . $index_archive_name;
                if (file_exists($index_filename)) {
                    $is_old = (file_exists($index_filename . "/summaries"));
                    $bundle_class_name = ($is_old) ? C\NS_LIB .
                        "IndexArchiveBundle" : C\NS_LIB . "IndexDocumentBundle";
                    $tmp = new $bundle_class_name($cache_dir .
                    if (!$tmp) {
                        return false;
                } else {
                    $tmp = false;
                    $use_name = $index_name;
                    $serve_archive = -1;
                    if (preg_match("/\-\d$/", $index_name)) {
                        $serve_archive = substr($index_name, -1);
                        $use_name = substr($index_name, 0, -2);
                    $index_archive_name = self::double_index_base_name .
                    $status_file = $cache_dir . $index_archive_name .
                    if ($serve_archive < 0 && file_exists($status_file)) {
                        $status = unserialize(file_get_contents($status_file));
                        $active_archive = (empty($status["swap_count"])) ? 1 :
                            $status["swap_count"] % 2;
                        $serve_archive = 1 - $active_archive;
                    $is_old = (file_exists($index_filename .
                        "/bundle0/summaries") ||
                        file_exists($index_filename . "/bundle1/summaries"));
                    $bundle_class_name = ($is_old) ?
                        C\NS_LIB . "IndexArchiveBundle"
                        : C\NS_LIB . "IndexDocumentBundle";
                    $tmp = new $bundle_class_name($cache_dir .
                        $index_archive_name . "/bundle$serve_archive");
                    if (!$tmp) {
                        $serve_archive = ($serve_archive == 0) ? 1 : 0;
                        $tmp = new $bundle_class_name($cache_dir .
                            $index_archive_name . "/bundle$serve_archive");
                    if (!$tmp) {
                        return false;
            self::$indexes[$index_name] = $tmp;
            if ($is_old) {
                self::$indexes[$index_name]->setCurrentShard(0, true);
            self::$index_times[$index_name] = time();
               If too many cached discard oldest 1/3 of cached indices
            if (count(self::$indexes) > self::INDEX_CACHE_SIZE) {
                $times = array_values(self::$index_times);
                $oldest_third = $times[floor(count($times)/3)];
                foreach (self::$index_times as $name => $time) {
                    if ($time <= $oldest_third) {
                        unset(self::$index_times[$name], self::$indexes[$name]);
        return self::$indexes[$index_name];
     *  Clears the static variables in which the cache of read-in indexes
     *  and the times at which they were cached are stored.
    public static function clearCache()
        self::$indexes = [];
        self::$index_times = [];
     * Returns the version of the index, so that Yioop can determine
     * how to do word lookup. The only major change to the format was
     * when word_id's went from 8 to 20 bytes, which happened around Unix
     * time 1369754208.
     * @param string $index_name unix timestamp of index
     * @return int 0 - if the original format for Yioop indexes; 1 - if 20 byte
     *     word_id format
    public static function getVersion($index_name)
        $index_name = (string) $index_name;
        $index_name = (empty($index_name) || $index_name[0] != '-') ?
            $index_name : substr($index_name, 1);
        $index_name_int = intval($index_name);
        if (!is_numeric($index_name)) {
            $description_file = $index_name . "/summaries/description.txt";
            if (file_exists($description_file)) {
                $description = unserialize(
                if (!empty($description['DESCRIPTION'])) {
                    $description = unserialize($description['DESCRIPTION']);
                if (!empty($description[self::CRAWL_TIME])) {
                    if (intval($description[self::CRAWL_TIME]) <
                        C\VERSION_0_TIMESTAMP &&
                        intval($description[self::CRAWL_TIME]) !=
                        self::FEED_CRAWL_TIME) {
                        return 0;
        } else if ($index_name_int != self::FEED_CRAWL_TIME &&
            $index_name_int < C\VERSION_0_TIMESTAMP) {
            return 0;
        $tmp_index = self::getIndex($index_name);
        if (isset($tmp_index->version)) {
            return $tmp_index->version;
        } else if (isset($tmp_index->archive_info['VERSION'])) {
            return $tmp_index->archive_info['VERSION'];
        return C\DEFAULT_CRAWL_FORMAT;
     * Gets an array of posting list positions for each shard in the
     * bundle $index_name for the word id $term_id
     * @param string $index_name bundle to look for $term_id in
     * @param string $term_id id of phrase or word to look up in bundle
     *     dictionary
     * @param int $threshold after the number of results exceeds this amount
     *     stop looking for more dictionary entries.
     * @param int $start_generation what generation in the index to start
     *      finding occurrence of phrase from
     * @param int $num_distinct_generations from $start_generation how
     *      many generation to search forward to
     * @param bool $with_remaining_total whether to also return the total
     *      number of postings found
     * @return array either [total, sequence of four tuples]
     *      or sequence of four tuples:
     *      (index_shard generation, posting_list_offset, length, exact id
     *      that matches $term_id)
    public static function getWordInfo($index_name, $term_id, $threshold = -1,
        $start_generation = -1, $num_distinct_generations = -1,
        $with_remaining_total = false)
        static $info_cache = [];
        $lookup_hash = crawlHash($index_name . $term_id . $threshold .
            $start_generation . $num_distinct_generations .
        if (isset($info_cache[$lookup_hash])) {
            $tmp = $info_cache[$lookup_hash];
            $info_cache[$lookup_hash] = $tmp;
            return $tmp;
        $index = self::getIndex($index_name);
        $start_generation = ($start_generation < 0) ? 0 : $start_generation;
        $word_info = $index->getWordInfo($term_id,
            $threshold, $start_generation, $num_distinct_generations,
        $info_cache[$lookup_hash] = $word_info;
        if (count($info_cache) >= self::INFO_CACHE_SIZE) {
        return $word_info;
     * Finds posting info related to the most recent version
     * of a URL in the given index
     * @param string $url_hash hash of the URL to be looked up
     * @param string $index_name current index
     * @return array|null posting info for the latest version, or null if
     *      none was found
    public static function lookupLatestVersionPage($url_hash, $index_name)
        // Check if the url hash exists in the cache
        if (array_key_exists($url_hash, self::$urls_cache)) {
            return self::$urls_cache[$url_hash];
        $model_for_url_hash_lookup = new ParallelModel();
        $page_versions = $model_for_url_hash_lookup->
            $index_name, false, true);
        if (!empty($page_versions['ROWS']) &&
            count($page_versions['ROWS']) > 0) {
            $latest_postings_info =
                end($page_versions['ROWS'])['POSTINGS'] ?? null;
            $latest_partition =
                end($page_versions['ROWS'])['PARTITION'] ?? null;
            if (is_array($latest_postings_info) &&
                count($latest_postings_info) > 0) {
                $latest_posting = end($latest_postings_info);
                if (count(self::$urls_cache) >= self::URLS_CACHE_SIZE) {
                    self::$urls_cache = [];
                self::$urls_cache[$url_hash] = array($latest_partition,
                return self::$urls_cache[$url_hash];
        return null;