Last commit for src/library/IndexDocumentBundle.php: 88ba842636f692ac9bde972fed5a3cf6959d841b

Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

Chris Pollett [2024-02-04 02:Feb:th]
Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 * Copyright (C) 2009 - 2024  Chris Pollett
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 * @author Chris Pollett
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

 * Used for crawlLog, crawlHash, and garbageCollect
require_once __DIR__ . '/Utility.php';
 * Encapsulates a set of web page documents and an inverted word-index of terms
 * from these documents which allow one to search for documents containing a
 * particular word.
 * @author Chris Pollett
class IndexDocumentBundle implements CrawlConstants
     * File name used to store within the folder of the IndexDocumentBundle
     * parameter/configuration information about the bundle
    const ARCHIVE_INFO_FILE = "archive_info.txt";
     * The version of this IndexDocumentBundle. The lowest format number is
     * 3.0 as prior inverted index/document stores used the IndexArchiveBundle
     * format
    const DEFAULT_VERSION = "4.0";
     * Default values for the configuration parameters of an
     * IndexDocumentBundle
        "VERSION" => self::DEFAULT_VERSION
     * Subfolder of IndexDocumentBundle to store the btree with
     * term => posting list information (i.e., the inverted index)
    const DICTIONARY_FOLDER = "Dictionary";
     * DocIds are made of three parts: hash of url, hash of document, hash
     * of url hostname. Each of these hashes is  DOCID_PART_LEN long
    const DOCID_PART_LEN = 8;
     * Length of DocIds used by this IndexDocumentBundle
    const DOCID_LEN = 24;
     * Length of TermIds used by this IndexDocumentBundle
    const TERMID_LEN = 16;
     * Length of terms' bloom filter string in bytes
    const TERMSFILTER_LEN = 125;
     * Number of terms from a doc to store in term filter (this
     * would typically be the top terms according to some metric)
    const NUM_TERMS_FILTER = 300;
     * Length of terms' bloom filter string in bits
    const TERMSFILTER_BITS_LEN = 1000;
     * Number of hash functions to use while constructing the
     * terms' bloom filter string
     * Partition i in an IndexDocumentBundle has a subfolder i
     * within self::POSITIONS_DOC_MAP_FOLDER. Within this subfolder i,
     * self::DOC_MAP_FILENAME is the name of the file used to store the
     * document map for the partition. The document map consists of a sequence
     * of records associated with each doc_id of a document stored in the
     * partition. The first record is ["POS" => $num_words,
     * "SCORE" => floatval($global_score_for_document)]. The second record is:
     * ["POS" => $length_of_title_of_document, "SCORE" =>
     *          floatval($num_description_scores)]
     * Here a description score is a score for the importance of a section
     * of a document. Subsequent records list [POS => the length of the jth
     * section of the document, SCORE => its score].
    const DOC_MAP_FILENAME = "doc_map";
     * Folder used to store the partition data of this IndexDocumentBundle
     * These will consist of .txt.gz files for each partition which are used
     * to store summaries of documents and actual documents (web pages) and
     * .ix files which are used to store doc_id and the associated offsets to
     * their summary and actual document within the .txt.gz file
    const DOCUMENTS_FOLDER = "Documents";
     * Name of the last entries file used to help compute difference lists
     * for doc_map_index, and position list offsets used in postings for the
     * partition. This file is also used to track the total number of
     * occurrences of a term in a partition
    const LAST_ENTRIES_FILENAME = "last_entries";
     * The filename of a file that is used to keep track of the integer that
     * says what is the next partition with documents that can be added to
     * this IndexDocumentBundle's dictionary. I.e., It should be that
     * next_partition <= save_partition
    const NEXT_PARTITION_FILE = "next_partition.txt";
     * Names for the files which appear within a partition sub-folder
     * Name of the file within a partition's positions_doc_maps folder used
     * to contain the partition's position list for all terms in partition.
    const POSITIONS_FILENAME = "positions";
     * Name of the file within a partition's positions_doc_maps folder with
     * posting information for all terms in that partition. This consists of
     * key value pairs term_id => posting records for all documents with that
     * term.
    const POSTINGS_FILENAME = "postings";
     * How many bytes of postings to buffer before writing when running
     * addPartitionPostingsDictionary
    const POSTINGS_BUFFER_SIZE = 1000000;
     * Maximum number of posting slices to cache
    const MAX_POSTING_CACHE_ITEMS = 100;
     * Holds property value pairs concerning the configuration of the
     * current IndexDocumentBundle
     * @var array
    public $archive_info;
     * Folder name to use for this IndexDocumentBundle
     * @var string
    public $dir_name;
     * A short text name for this IndexDocumentBundle
     * @var string
    public $description;
     * Index of the next partition whose postings are to be added to the
     * dictionary (see NEXT_PARTITION_FILE)
     * @var int
    public $next_partition_to_add;
     * Reference to the LSMTree used to store term => array of partition
     * posting list info
     * @var LSMTree
    public $dictionary;
     * PartitionDocumentBundle for web page documents
     * @var PartitionDocumentBundle
    public $documents;
     * Associative array of docid=>doc_record pairs
     * @var array
    public $doc_map;
     * Used to read and write data to the $doc_map array
     * @var PackedTableTools
    public $doc_map_tools;
     * Used to keep track of the previous values of
     * posting quantities so difference lists can be computed. For example,
     * previous $doc_map_index, previous position list offset. It also tracks
     * the total number of occurrences of a term within a partition.
     * @var array
    public $last_entries;
     * Used to read and write data to the $last_entries array
     * @var PackedTableTools
    public $last_entries_tools;
     * Map from int -> three character unpack string used to unpack posting info
     * @var array
    public $unpack_map;
     * Array of string lengths each of $unpack_map's codes consumes
     * @var array
    public $unpack_len_map;
     * A string consisting of a concatenated sequence of
     * term position information for each document in turn and within this for
     * each term in that document.
     * @var string
    public $positions;
     *  Associative array $term_id => posting list
     *  records for that term in the partition.
     * @var array
    public $postings;
     * Used to read and write data to the $postings array
     * @var PackedTableTools
    public $postings_tools;
     * Keeps track of the number of documents present in the current partition
     * @var int
    public $doc_map_counter;
     * Holds the total time needed to extract phrases (sequences of adjacent
     * words) from site descriptions for a partition
     * @var int
    public $extract_phrase_time;
     * Makes or initializes an IndexDocumentBundle with the provided parameters
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether to open archive only for reading
     *  or reading and writing
     * @param string $description a text name/serialized info about this
     *  IndexDocumentBundle
     * @param int $num_docs_per_partition the number of documents to be stored
     *  in a single partition
    public function __construct($dir_name, $read_only_archive = true,
        $description = null, $num_docs_per_partition =
        C\NUM_DOCS_PER_PARTITION, $overflow_threshold = C\OVERFLOW_THRESHOLD)
        // NOTE(review): $num_docs_per_partition and $overflow_threshold are
        // not read by any line visible in this view -- confirm where they
        // are consumed.
        // Remember the folder this bundle lives in.
        $this->dir_name = $dir_name;
        $is_dir = is_dir($this->dir_name);
        // A missing folder is only tolerated when opening for writing; a
        // read-only open of a missing folder stops here.
        // NOTE(review): the body of the writable branch is not visible in
        // this view (presumably it creates the folder -- confirm). Also note
        // that PHP ignores a constructor's return value, so `return false`
        // only ends initialisation early; `new` still yields an object.
        if (!$is_dir && !$read_only_archive) {
        } else if (!$is_dir) {
            return false;
        $archive_info_path = $this->dir_name . "/" . self::ARCHIVE_INFO_FILE;
        // Begin with the default parameters, recording a non-empty
        // description in them.
        $this->archive_info = self::DEFAULT_PARAMETERS;
        if (!empty($description)) {
            $this->archive_info["DESCRIPTION"] = $description;
        // If an archive info file exists, the unserialized data replaces
        // the whole parameter array built above (the file_get_contents
        // argument is not visible here; presumably $archive_info_path).
        $just_got_info = false;
        if (file_exists($archive_info_path)) {
            $this->archive_info = unserialize(file_get_contents(
            $just_got_info = true;
        // ??= only fills in a compressor class name when none is set, so
        // values loaded from disk win over these defaults.
        $this->archive_info['RECORD_COMPRESSOR'] ??=
            C\NS_COMPRESSORS . "NonCompressor";
        $record_compressor = $this->archive_info['RECORD_COMPRESSOR'];
        $this->archive_info['BLOB_COMPRESSOR'] ??=
            C\NS_COMPRESSORS . "GzipCompressor";
        $blob_compressor = $this->archive_info['BLOB_COMPRESSOR'];
        // NOTE(review): the body of this branch (writable bundle whose info
        // was not loaded from disk) is not visible in this view.
        if (!$read_only_archive && !$just_got_info) {
        // Restore the next partition to add from its tracking file when that
        // file exists; a writable bundle without the file starts at 0. In
        // the remaining case (read-only, no file) the property is not
        // assigned by the lines visible here.
        $next_partition_path = $this->dir_name . "/".
        if (file_exists($next_partition_path)) {
            $this->next_partition_to_add = intval(
        } else if (!$read_only_archive) {
            $this->next_partition_to_add = 0;
        // Document store kept in the DOCUMENTS_FOLDER subfolder, with SUMMARY
        // and PAGE declared as "SERIAL" columns (remaining constructor
        // arguments are not visible in this view).
        $this->documents = new PartitionDocumentBundle($dir_name . "/" .
            self::DOCUMENTS_FOLDER, ["PRIMARY KEY" => [self::DOC_ID,
            self::SUMMARY => "SERIAL", self::PAGE => "SERIAL"],
            $record_compressor, $blob_compressor,
        // When the bundle is writable the document store's index cache size
        // is set to 1.
        if (!$read_only_archive) {
            $this->documents->index_cache_size = 1;
        // Table layout for doc map rows: DOCID_LEN byte key, int POS and
        // float SCORE columns.
        $this->doc_map_tools = new PackedTableTools([
            "PRIMARY KEY" => ["DOC_KEYS", self::DOCID_LEN], "POS" => "INT",
            "SCORE" => "FLOAT"], $record_compressor);
        // Table layout for posting rows: TERMID_LEN byte term key followed
        // by four int columns.
        $this->postings_tools = new PackedTableTools([
            "PRIMARY KEY" => ["TERM", self::TERMID_LEN],
            "DOC_MAP_INDEX" => "INT", "FREQUENCY" => "INT",
            "POSITIONS_OFFSET" => "INT", "POSITIONS_LEN" => "INT"],
        // Precompute the 4^4 = 256 unpack() format strings, and the number
        // of bytes each consumes, for every combination of field widths of
        // the four posting columns. Codes are unpack()'s C (1 byte), n (2),
        // N (4) and J (8); because entries are appended in loop order, the
        // entry for widths ($i, $j, $k, $m) sits at index
        // 64 * $i + 16 * $j + 4 * $k + $m.
        $unpack_codes = [0 => "C", 1 => "n", 2=> "N", 3 => "J"];
        $len_codes = [0 => 1, 1 => 2, 2 => 4, 3 => 8];
        for ($i = 0; $i < 4; $i++) {
            for ($j = 0; $j < 4; $j++) {
                for ($k = 0; $k < 4; $k++) {
                    for ($m = 0; $m < 4; $m++) {
                        $this->unpack_map[] =
                            $unpack_codes[$i] . "DOC_MAP_INDEX/" .
                            $unpack_codes[$j] . "FREQUENCY/" .
                            $unpack_codes[$k] . "POSITIONS_OFFSET/" .
                            $unpack_codes[$m] . "POSITIONS_LEN";
                        $this->unpack_len_map[] = $len_codes[$i] +
                            $len_codes[$j] + $len_codes[$k] + $len_codes[$m];
        // Table layout for last entries rows; the literal key length 16
        // equals self::TERMID_LEN.
        $this->last_entries_tools = new PackedTableTools([
            "PRIMARY KEY" => ["TERM", 16], "LAST_INDEX" => "INT",
            "LAST_OFFSET" => "INT", "NUM_OCCURRENCES" => "INT"],
        // NOTE(review): the body of this branch is not visible in this view.
        if (!$read_only_archive) {
        // Global dictionary stored in the DICTIONARY_FOLDER subfolder, keyed
        // by a 16 byte term id.
        $this->dictionary = new LSMTree($this->dir_name . "/" .
            self::DICTIONARY_FOLDER, ["PRIMARY KEY" => ["TERM", 16],
            "PARTITION" => "INT", "NUM_DOCS" => "INT",
            "POSTINGS_LEN" => "INT"]);
     * Add the array of $pages to the documents PartitionDocumentBundle
     * @param array $pages data to store
     * @param int $visited_urls_count number to add to the count of visited urls
     *     (visited urls is a smaller number than the total count of objects
     *     stored in the index).
     * @return bool success or failure of adding the pages
    public function addPages($pages, $visited_urls_count)
        // NOTE(review): $visited_urls_count is not read by the lines visible
        // in this view -- confirm whether it is used elsewhere in the body.
        crawlLog("Indexer adding pages to document bundle...");
        // Hand the pages to the document store; its result is returned
        // to the caller unchanged.
        $success = $this->documents->put($pages);
        return $success;
     * For every partition between next partition and save partition, adds
     * the posting list information to the dictionary LSMTree. At the
     * end of this process next partition and save partition should be the same
     * @param string $taking_too_long_touch a filename of a file to touch
     *  so its last modified time becomes the current time. In a typical
     *  Yioop crawl this is done for the CrawlConstants::crawl_status_file
     *  file to prevent  Yioop's web interface from stopping the crawl because
     *  it has seen no recent  progress activity on a crawl.
     * @param bool $till_equal if set to true, will keep adding each partition
     *  up till the save partition; if set to false, only adds one partition
     * @param string $rebuild_or_remerge either the string "rebuild",
     *  "rebuild_some_number", or "remerge". If it is "rebuild", it will both
     *  recompute partition inverted indexes then build a global dictionary
     *  from these. If it is "remerge", if a partition inverted index
     *  exists it is directly merged into a new global dictionary without
     *  recomputing it. "rebuild_some_number" just rebuilds partition
     *  some_number's inverted index, but does not merge it into the
     *  global dictionary.
     * @return bool whether at least one partition was processed by this call
    public function updateDictionary($taking_too_long_touch = null,
        $till_equal = true, $rebuild_or_remerge = "rebuild")
        // A mode containing "rebuild_<digits>" selects that single partition
        // as the starting point and is tracked internally as
        // "partition_only"; otherwise start from the bundle's own
        // next partition to add.
        if (preg_match("/rebuild\_(\d+)/", $rebuild_or_remerge, $matches)) {
            $next_partition = intval($matches[1]);
            $rebuild_or_remerge = "partition_only";
            crawlLog("Rebuilding just partition $next_partition!");
        } else {
            $next_partition = $this->next_partition_to_add;
        $save_partition = $this->documents->parameters["SAVE_PARTITION"];
        $current_num_docs = $this->documents->parameters['ACTIVE_COUNT'];
        // NOTE(review): the right-hand side of this assignment is not
        // visible in this view.
        $max_items_per_partition =
        if ($rebuild_or_remerge == "rebuild") {
            crawlLog("Current save partition has $current_num_docs documents.");
        crawlLog("Max documents per partition: $max_items_per_partition.");
        $memory_limit = metricToInt(ini_get("memory_limit"));
        $before_usage = memory_get_usage();
        crawlLog("Indexer Memory  limit is $memory_limit. Usage is " .
        // $advanced_partition <= $till_equal is a comparison of booleans: it
        // is only false once a partition has been advanced (true) while
        // $till_equal is false, so with $till_equal false the loop body runs
        // at most once; with $till_equal true the loop is bounded only by
        // $next_partition < $save_partition.
        $advanced_partition = false;
        while ($next_partition < $save_partition && $advanced_partition <=
            $till_equal) {
            if ($rebuild_or_remerge == "rebuild") {
                crawlLog("Indexer adding Partition to dictionary...");
                crawlLog("...because save partition changed");
            $switch_time = microtime(true);
            // Save current shard dictionary to main dictionary
            // When remerging, the partition's inverted index is treated as
            // present only if every file named in PARTITION_FILENAMES exists
            // in the partition's base folder.
            $have_inverted_index_files = false;
            if ($rebuild_or_remerge == "remerge") {
                $base_folder = $this->getPartitionBaseFolder($next_partition);
                $have_inverted_index_files = true;
                foreach (self::PARTITION_FILENAMES as $filename) {
                    $component_filename = $base_folder . "/" . $filename;
                    if (!file_exists($component_filename)) {
                        $have_inverted_index_files = false;
            // NOTE(review): the else branch body is not visible in this view
            // (presumably the partition's inverted index is built there --
            // confirm).
            if ($have_inverted_index_files) {
                crawlLog("...Partition $next_partition has all of its inverted".
                    " index files, not recomputing, just merging.");
            } else {
            $num_freed = garbageCollect();
            // In every mode except "partition_only" a call taking
            // ($next_partition, $taking_too_long_touch) is made here; the
            // start of that call is not visible in this view.
            if ($rebuild_or_remerge != "partition_only") {
                    $next_partition, $taking_too_long_touch);
            crawlLog("Indexer force running garbage collector after partition".
                 " advance. This freed $num_freed bytes.");
            $after_usage = memory_get_usage();
                "Indexer after partition changed memory usage: $after_usage");
            crawlLog("Switch Partition time:".
            // Progress is written to NEXT_PARTITION_FILE and to a sibling
            // file with the "-advanced" suffix. NOTE(review): where
            // $next_partition is incremented is not visible in this view.
            file_put_contents($this->dir_name . "/". self::NEXT_PARTITION_FILE,
            file_put_contents($this->dir_name . "/". self::NEXT_PARTITION_FILE
                . "-advanced", $next_partition);
            $advanced_partition = true;
        // Remember where to resume and report whether the loop body ran.
        $this->next_partition_to_add = $next_partition;
        return $advanced_partition;
     * Adds the previously constructed inverted index $partition to the inverted
     * index of the whole bundle
     * @param int $partition which partition's inverted index to add, by
     *  default (a negative value) the partition just before the current save
     *  partition
     * @param string $taking_too_long_touch a filename of a file to touch
     *  so its last modified time becomes the current time. In a typical
     *  Yioop crawl this is done for the CrawlConstants::crawl_status_file
     *  file to prevent Yioop's web interface from stopping the crawl because
     *  it has seen no recent  progress activity on a crawl.
    public function addPartitionPostingsDictionary($partition = -1,
        $taking_too_long_touch = null)
        $save_partition = $this->documents->parameters["SAVE_PARTITION"];
        // A negative $partition selects the partition just before the save
        // partition; if the save partition is 0 (or less) there is no such
        // partition and the call fails.
        if ($partition < 0 ) {
            if ($save_partition <= 0) {
                return false;
            $partition = $save_partition - 1;
        $base_folder = $this->getPartitionBaseFolder($partition);
        $postings_tools = $this->postings_tools;
        // Drop the per-partition structures held on this object (the rest
        // of the unset() argument list is not visible in this view).
        unset($this->postings, $this->doc_map, $this->positions,
        $last_entries_tools = $this->last_entries_tools;
        $dictionary = $this->dictionary;
        $postings_filename = $base_folder . "/" . self::POSTINGS_FILENAME;
        $last_entries_filename = $base_folder . "/" .
        // Both the postings file and the last entries file must exist.
        if (!file_exists($postings_filename)) {
            crawlLog("Postings file for partition $partition does not exist");
            return false;
        if (!file_exists($last_entries_filename)) {
                "Last entries file for partition $partition does not exist");
            return false;
        crawlLog("Start Adding Partition Posting Info to Dictionary");
        $start_time = microtime(true);
        // The postings are scanned below as one string (remaining load()
        // arguments are not visible in this view).
        $postings_string = $postings_tools->load($postings_filename,
        $posting_files_len = strlen($postings_string);
        //add a marker for the end of the file as a string
        $key_len = $this->postings_tools->key_len;
        $this->last_entries = $last_entries_tools->load($last_entries_filename);
        // Rows are delimited by "\xFF" bytes, so the row count is the number
        // of delimiters plus one.
        $num_postings = substr_count($postings_string, "\xFF") + 1;
        $last_marker = 0;
        // NOTE(review): $slot is not read by the lines visible in this view.
        $slot = ($dictionary->occupiedTier(0)) ? "B" : "A";
        for ($i = 0; $i < $num_postings; $i++) {
            // Cut out the next row: from $last_marker up to the next "\xFF",
            // or to the end of the string when no delimiter remains (a null
            // length makes substr() return the remainder).
            $cur_marker = strpos($postings_string, "\xFF", $last_marker);
            $diff = ($cur_marker === false) ? null :
                $cur_marker - $last_marker;
            $pre_row = substr($postings_string, $last_marker, $diff);
            // Offset within $postings_string at which this term's encoded
            // postings begin, i.e. just past the term key.
            $postings_offset = $last_marker + $key_len;
            $last_marker = $cur_marker + 1;
            // A row is the $key_len byte term key followed by the encoded
            // postings; the length stored is that of the encoded form.
            $term = substr($pre_row, 0, $key_len);
            $encode_row = substr($pre_row, $key_len);
            $postings_len = strlen($encode_row);
            $row = decode255($encode_row);
            // When the timeout logger reports and a touch file was given,
            // refresh that file's modification time if it exists.
            if (crawlTimeoutLog("..Indexer Still processing partition ".
                "$partition. Have completed $i postings of $num_postings.") &&
                $taking_too_long_touch) {
                if (file_exists($taking_too_long_touch)) {
                    touch($taking_too_long_touch, time());
            // The first vByte-encoded value of the decoded row is used as
            // the number of documents containing the term.
            $start = 0;
            $num_docs_term = vByteDecode($row, $start);
            // The occurrence count comes from the term's last entries
            // record when one is found, and is 0 otherwise (the assignment
            // to $last_entry_row is truncated in this view).
            $num_occurrences_term = 0;
            $last_entry = $last_entries_tools->find($this->last_entries, $term);
            if (!empty($last_entry)) {
                $last_entry_row =
                $num_occurrences_term = $last_entry_row[0]["NUM_OCCURRENCES"];
            // Record in the dictionary where this term's postings live
            // within this partition.
            $dictionary->put(["TERM" => $term, "PARTITION" => $partition,
                "NUM_DOCS" => $num_docs_term,
                "NUM_OCCURRENCES"  => $num_occurrences_term,
                "POSTINGS_OFFSET" => $postings_offset,
                "POSTINGS_LEN" => $postings_len]);
        // NOTE(review): no return statement is visible on the success path,
        // whereas the failure paths above return false.
        crawlLog("...Finished Adding Partition Posting Info to " .
            "Dictionary: " . changeInMicrotime($start_time));
     * Gets the file path corresponding to the partition with index $partition
     * @param int $partition desired partition index
     * @return string file path to where this partition's index data is stored
     *  (Not the original documents which are stored in the
     *  PartitionDocumentBundle)
    public function getPartitionBaseFolder($partition)
        // The folder is whatever the document store reports for this
        // partition; no path is computed here.
        return $this->documents->getPartitionFolder($partition);
     * Given the $doc_id of a document and a $partition to look for it in
     * returns the document summary info if present and [] otherwise.
     * @param string $doc_id of document to look up
     * @param int $partition to look for document in
     * @return array desired summary or [] if look up failed
    public function getSummary($doc_id, $partition)
        // Ask the document store for only the SUMMARY column of the record.
        $row = $this->documents->get($doc_id, $partition, [self::SUMMARY]);
        // ?? yields [] both when the column is missing or null and when the
        // lookup itself did not produce an array.
        return $row[self::SUMMARY] ?? [];
     * Given the $doc_id of a document and a $partition to look for it in
     * returns the cached page of the document if present and [] otherwise
     * @param string $doc_id of document to look up
     * @param int $partition to look for document in
     * @return array desired page cache or [] if look up failed
    public function getCachePage($doc_id, $partition)
        // Ask the document store for only the PAGE column of the record.
        $row = $this->documents->get($doc_id, $partition, [self::PAGE]);
        // ?? yields [] both when the column is missing or null and when the
        // lookup itself did not produce an array.
        return $row[self::PAGE] ?? [];
     * Builds an inverted index shard for a documents PartitionDocumentBundle
     * partition.
     * @param int $partition to build index for
     * @param string $taking_too_long_touch a filename of a file to touch
     *  so its last modified time becomes the current time. In a typical
     *  Yioop crawl this is done for the CrawlConstants::crawl_status_file
     *  file to prevent Yioop's web interface from stopping the crawl because
     *  it has seen no recent  progress activity on a crawl.
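     * @param bool $just_stats if true, the method returns statistics about
     *  the partition before the partition's index files are saved
     * @return mixed whether job executed to completion (true or false) if
     *  !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS,
     *  and TERM_STATISTICS (the latter having term frequency info)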
    public function buildInvertedIndexPartition($partition = -1,
        $taking_too_long_touch = null, $just_stats = false)
        // Start of the overall build timer reported in the final log message.
        $start_time = microtime(true);
        crawlLog("  Indexer start building inverted index ...  Current Memory:".
        // A negative $partition means: use the document store's current
        // save partition.
        if ($partition < 0) {
            $partition = $this->documents->parameters["SAVE_PARTITION"];
            "Indexer Building index inverted index for partition $partition");
        $base_folder = $this->getPartitionBaseFolder($partition);
        /* set up $doc_map_filename, $postings_filename, $postings_filename,
           $positions_filename, etc
        // Unless only statistics are wanted, each name in PARTITION_FILENAMES
        // defines a local variable "<name>_filename" (via a variable
        // variable) holding that file's path in the partition folder; these
        // are the variables used when saving at the end of this method.
        // NOTE(review): the body of the file_exists() branch is not visible
        // in this view.
        if (!$just_stats) {
            foreach (self::PARTITION_FILENAMES as $filename) {
                $component_filename = $base_folder . "/" . $filename;
                if (file_exists($component_filename)) {
                $component = $filename . "_filename";
                $$component = $component_filename;
        // Local aliases for the packed table helpers used when saving.
        $doc_map_tools = $this->doc_map_tools;
        $postings_tools = $this->postings_tools;
        $last_entries_tools = $this->last_entries_tools;
        // Reset the structures on this object that accumulate the
        // partition's index while it is being built.
        $this->doc_map = "";
        $this->doc_map_counter = 0;
        $this->postings = [];
        $this->last_entries = [];
        $this->positions = "";
        crawlLog("Indexer Preparing Index Map...");
        // $index_map is iterated below as hash_url => url_info entries.
        $index_map = $this->prepareIndexMap($partition);
        crawlLog("Done Prepare Index Map. Number of documents in mapped ".
            "partition:" . count($index_map));
        // Counters and per-phase timers used in the progress and summary
        // log messages. NOTE(review): $cnt and $non_aux_doc_cnt are not
        // modified by the lines visible in this view.
        $cnt = 0;
        $non_aux_doc_cnt = 0;
        $link_cnt = 0;
        $num_partition = count($index_map);
        $doc_field = self::DOC_ID;
        $score_field = self::SCORE;
        $aux_docs_field = self::AUX_DOCS;
        $get_summaries_time = 0;
        $aux_get_summaries_time = 0;
        $safe_score_time = 0;
        $safe_meta_score_time = 0;
        $invert_pages_time = 0;
        $invert_metas_time = 0;
        $invert_links_time = 0;
        $this->extract_phrase_time = 0;
        foreach ($index_map as $hash_url => $url_info) {
            // Fetch the summary of the entry's main document when the entry
            // carries a doc id (handling of an empty or non-array result is
            // not visible in this view).
            $site = [];
            if (!empty($url_info[$doc_field])) {
                $start_get_summaries = microtime(true);
                $site = $this->getSummary($url_info[$doc_field], $partition);
                $get_summaries_time += changeInMicrotime($start_get_summaries);
                if (empty($site) || !is_array($site)) {
            /* if $site still empty here then current group'd urls didn't have a
               document (downloaded webpage) amongst themselves
            // ??= initialises the limit from the configured maximum the
            // first time through; for a site with a description the limit is
            // then raised via max() (second argument truncated in this view).
            $max_description_len ??= C\MAX_DESCRIPTION_LEN;
            $max_description_len = (empty($site[self::DESCRIPTION])) ?
                $max_description_len : max($max_description_len,
            // The AUX_DOCS value 'metas_only' selects the metas-only
            // handling below.
            $metas_only = ($url_info[$aux_docs_field] == 'metas_only');
            $aux_description = "";
            $tmp_description = $site[self::DESCRIPTION] ?? "";
            // For a link record the TITLE field is used as the url;
            // otherwise the URL field with '|' percent-encoded.
            if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                $site_url = $site[self::TITLE];
            } else {
                $site_url = str_replace('|', "%7C", $site[self::URL] ?? "");
            // Metas-only case: set the IS_SAFE flag from the safe search
            // score of the description and url, mark the site JUST_METAS and
            // invert it. NOTE(review): how this branch ends (e.g. whether it
            // skips the rest of the iteration) is not visible in this view.
            if ($metas_only) {
                $start_safe_meta_time = microtime(true);
                if (PhraseParser::computeSafeSearchScore($tmp_description,
                        $site_url) < PhraseParser::SAFE_PHRASE_THRESHOLD) {
                    $site[self::IS_SAFE] = true;
                    $url_info[self::IS_SAFE] = true;
                } else {
                    $site[self::IS_SAFE] = false;
                    $url_info[self::IS_SAFE] = false;
                $safe_meta_score_time +=
                $site[self::JUST_METAS] = true;
                $start_invert_metas = microtime(true);
                $site_url = $this->invertOneSite($site, $url_info, $link_cnt);
                $invert_metas_time +=
               Index pages that were hashed together or links to page
               before page itself.
            // AUX_DOCS is split on "\xFF"; each piece is decoded with
            // decode255() and used as a doc id to look up an auxiliary
            // summary.
            $pre_aux_docs = explode("\xFF", $url_info[$aux_docs_field]);
            $aux_sites = [];
            foreach ($pre_aux_docs as $pre_aux_doc) {
                $aux_doc = decode255($pre_aux_doc);
                $start_get_summaries = microtime(true);
                $aux_site = $this->getSummary($aux_doc, $partition);
                $aux_get_summaries_time +=
                if (empty($aux_site) || !is_array($aux_site)) {
                    $aux_site = []; // make sure empty
                $aux_site[self::JUST_METAS] = true;
                // Auxiliary descriptions are appended only while the
                // combined length stays below $max_description_len (the
                // appended expression is truncated in this view).
                if (!empty($aux_site[self::DESCRIPTION])) {
                    if (strlen($aux_description) +
                        strlen($aux_site[self::DESCRIPTION]) <
                        $max_description_len) {
                        $aux_description .= " .. " .
                $aux_sites[] = $aux_site;
            // $aux_site here is the value left behind by the loop above.
            if (empty($site) && !empty($aux_site) && is_array($aux_site)) {
                //use one left aux_site for site if site empty
                $site = $aux_site;
                $site[self::DESCRIPTION] = "";
            $site[self::DESCRIPTION] ??= "";
            $site[self::DESCRIPTION] .= $aux_description;
            // Set the IS_SAFE flag from the safe search score of the
            // combined description and url.
            $start_safe_time = microtime(true);
            if (PhraseParser::computeSafeSearchScore($site[self::DESCRIPTION],
                $site_url) < PhraseParser::SAFE_PHRASE_THRESHOLD) {
                $site[self::IS_SAFE] = true;
                $url_info[self::IS_SAFE] = true;
            } else {
                $site[self::IS_SAFE] = false;
                $url_info[self::IS_SAFE] = false;
            $safe_score_time +=
            // Invert the main site first, then each auxiliary site.
            $start_invert_page = microtime(true);
            $site_url = $this->invertOneSite($site, $url_info, $link_cnt);
            $invert_pages_time += changeInMicrotime($start_invert_page);
            foreach ($aux_sites as $aux_site) {
                $start_invert_links = microtime(true);
                $site_url = $this->invertOneSite($aux_site, $url_info,
                $invert_links_time += changeInMicrotime($start_invert_links);
            // Progress report: when the last inverted site returned a url,
            // the timeout logger reports and a touch file was given, refresh
            // that file's modification time if it exists.
            $memory_usage = memory_get_usage();
            $link_to = (isset($site[self::TYPE]) &&
                $site[self::TYPE] == "link") ? "LINK TO:" : "";
            $time_string = makeTimestamp();
            if ($site_url &&
                crawlTimeoutLog("..Indexer Still building inverted index ".
                    "for partition $partition \n" .
                    "$time_string ....Current Indexer Memory Usage is %s.\n" .
                    $time_string .
                    " ....Indexer has processed %s of %s documents.\n" .
                    $time_string .
                    " ....Total links or docs processed by Indexer is %s.\n" .
                    "$time_string ....Last url Indexer processed was %s.",
                $memory_usage, $non_aux_doc_cnt, $num_partition,
                $non_aux_doc_cnt + $cnt, $link_to . $site_url) &&
                $taking_too_long_touch) {
                if (file_exists($taking_too_long_touch)) {
                    touch($taking_too_long_touch, time());
        // Statistics-only mode: return counts before any file is saved. For
        // each term the first vByte-encoded value of its postings is
        // reported as the term's number of records.
        if ($just_stats) {
            $term_stats = [];
            foreach ($this->postings as $term => $postings) {
                $stat_pos = 0;
                $num_records = vByteDecode($postings, $stat_pos);
                $term_stats[$term] = $num_records;
            $statistics = [
                "NUM_DOCS" => $this->doc_map_counter,
                "NUM_LINKS" => $link_cnt,
                "TERM_STATISTICS" => $term_stats
            return $statistics;
        // Save the doc map, postings and last entries through their packed
        // table helpers and write the positions string as-is.
        $start_save_times = microtime(true);
        $doc_map_tools->save($doc_map_filename, $this->doc_map);
        $postings_tools->save($postings_filename, $this->postings);
        $last_entries_tools->save($last_entries_filename, $this->last_entries);
        file_put_contents($positions_filename, $this->positions);
        $final_save_time = changeInMicrotime($start_save_times);
        // Log the total build time with a breakdown of the phase timers.
        $time_string = makeTimestamp();
        crawlLog("  Indexer build inverted index time ".
            changeInMicrotime($start_time) .
            "\n$time_string  ..Component times:" .
            "\n$time_string  ....Get page summaries time: $get_summaries_time" .
            "\n$time_string  ....Get link summaries time: " .
                $aux_get_summaries_time .
            "\n$time_string  ....Compute Safe Page time: $safe_score_time" .
            "\n$time_string  ....Compute Safe Meta time: $safe_meta_score_time".
            "\n$time_string  ....Invert pages time: $invert_pages_time" .
            "\n$time_string  ....Invert meta pages time: $invert_metas_time" .
            "\n$time_string  ....Invert links time: $invert_links_time" .
            "\n$time_string  ....Final file saves time: $final_save_time" .
            "\n$time_string  ----" .
            "\n$time_string  ....Of Invert times, time in " .
                "extractPhrasesInLists:". $this->extract_phrase_time);
        return true;
     * Used to create inverted index for one site and add its information to
     * the current partition.
     * @param array $site site to invert
     * @param array $url_info collection of url and hashes of documents which
     *   map to the same document
     * @param int &$link_cnt current count of number of links discovered so far
     * @return string $site_url canonical url for site
    public function invertOneSite($site, $url_info, &$link_cnt)
        // start a timer so that slow inversions can be logged at the end
        $interim_time = microtime(true);
        // records without a HASH, or whose robot metas contain JUSTFOLLOW,
        // are not inverted
        if (!isset($site[self::HASH]) ||
            (isset($site[self::ROBOT_METAS]) &&
            in_array("JUSTFOLLOW", $site[self::ROBOT_METAS]))) {
            return "";
        //this case  might occur on a recrawl
        if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
            $is_link = true;
            // for a link record the url is read from the TITLE field
            $site_url = $site[self::TITLE];
            $host =  UrlParser::getHost($site_url);
            // HASH is split on '|'; component 5, when present, is used as
            // the link origin, otherwise the url itself is used
            $link_parts = explode('|', $site[self::HASH]);
            if (isset($link_parts[5])) {
                $link_origin = $link_parts[5];
            } else {
                $link_origin = $site_url;
            // NOTE(review): this replaces the $url_info argument for the
            // rest of the method -- confirm that is intended
            $url_info = [];
            if (!empty($site[self::LANG])) {
                $url_info[self::LANG] = $site[self::LANG];
            $meta_ids = PhraseParser::calculateLinkMetas($site_url,
                $host, $site[self::DESCRIPTION], $link_origin,
            $link_to = "LINK TO:";
        } else {
            $is_link = false;
            // '|' is escaped in document urls
            $site_url = str_replace('|', "%7C", $site[self::URL]);
            $meta_ids =  PhraseParser::calculateMetas($site);
            $link_to = "";
        $word_lists = [];
        $host_keywords_end_pos = 0;
        $title_end_pos = 0;
        $path_keywords_end_pos = 0;
        $triplet_lists = [];
            self::JUST_METAS check to avoid getting sitemaps in results
            for popular words
        $lang = null;
        if (!isset($site[self::JUST_METAS])) {
            $host_words = UrlParser::getWordsInHostUrl($site_url);
            $path_words = UrlParser::getWordsLastPathPartUrl(
            if ($is_link) {
                $phrase_string = $site[self::DESCRIPTION];
            } else {
                if (isset($site[self::LANG])) {
                    if (isset($this->programming_language_extension[
                        $site[self::LANG]])) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        /* r6t was chosen as short enough not to be
                           changed by chargramming, but rare enough
                           that can be used as a useful splitter
                        $phrase_string = $host_words . " r6t ".
                            $site[self::TITLE] . " r6t ". $path_words .
                            " r6t ". $site[self::DESCRIPTION];
                } else {
                    $phrase_string = $host_words . " r6t " .
                        $site[self::TITLE] . " r6t " . $path_words .
                        " r6t ". $site[self::DESCRIPTION];
            /* at this point we have already extracted meta words,
               we attempt to compute the lang here as a value different
               from empty or mul for the purposes of stemming chargramming.
               (helps with extracting words from images because the image
                itself might have had few words to guess the language
                but when combined with inlinks it does)
            if (empty($site[self::LANG]) || $site[self::LANG] == "mul") {
                $lang = guessLocaleFromString(
            } else {
                $lang = $site[self::LANG];
            $word_and_qa_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            // accumulate the time reported by extractPhrasesInLists
            if (!isset($this->extract_phrase_time)) {
                $this->extract_phrase_time = 0;
            $this->extract_phrase_time +=
                $word_and_qa_lists['TIMES']['TOTAL_TIME'] ?? 0;
            $word_lists = $word_and_qa_lists['WORD_LIST'];
            // the positions recorded for the r6t splitter are used as the
            // end positions of the host, title and path portions
            if (!empty($word_lists["r6t"][2])) {
                if ($path_keywords_end_pos < 255) {
                    $host_keywords_end_pos = $word_lists["r6t"][0];
                    $title_end_pos = $word_lists["r6t"][1];
                $path_keywords_end_pos = $word_lists["r6t"][2];
            } else if (!empty($word_lists["r6t"])) {
                $path_keywords_end_pos = $word_lists["r6t"][
                    count($word_lists["r6t"]) - 1];
        $description_scores =
            (empty($site[self::DESCRIPTION_SCORES])) ? [] :
        $user_ranks =
            (empty($site[self::USER_RANKS])) ? [] :
        // number of words is the total number of recorded positions
        $num_words = 0;
        foreach ($word_lists as $word => $position_list)
            $num_words += count($position_list);
        // the doc id comes from DOC_ID, or else the first AUX_DOCS entry
        // NOTE(review): in the link branch above $url_info was replaced by
        // a fresh array, so neither key looks to be set for links and the
        // method would return here -- confirm this is intended
        $doc_id = ($url_info[self::DOC_ID] ??
            ($url_info[self::AUX_DOCS][0] ?? ""));
        if (empty($doc_id)) {
            return "";
        // at most NUM_TERMS_FILTER terms are passed to storeTerms
        $terms = array_keys($word_lists);
        if (count($terms) > self::NUM_TERMS_FILTER) {
            $terms = array_slice($terms, 0, self::NUM_TERMS_FILTER);
        $terms_filter = $this->storeTerms($terms);
        $this->addScoresDocMap($doc_id, $num_words,
            $url_info[self::SCORE], $host_keywords_end_pos, $title_end_pos,
            $path_keywords_end_pos, $description_scores,
            $user_ranks, $terms_filter);
        $this->addTermPostingLists(0, $word_lists, $meta_ids,
        // log any site whose inversion took more than five seconds
        $interim_elapse = changeInMicrotime($interim_time);
        if ($interim_elapse > 5) {
            crawlLog("..Indexer Inverting " . $link_to . $site_url .
            "...took > 5s.");
        return $site_url;
     * Given a $site array of information about a web page/document. Use
     * CrawlConstant::URL and CrawlConstant::HASH fields to compute a
     * unique doc id for the array.
     * @param array $site site to compute doc_id for
     * @return string the computed doc_id
    public static function computeDocId($site)
        $doc_id = false;
        if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
            // for link records the doc id is read from the HTTP_CODE field
            $doc_id = $site[self::HTTP_CODE];
        } else {
            // type byte: chr(0) unless the record is flagged as a video
            // (64), or its TYPE starts with "text" (56) or "imag" (48)
            $letter_code = chr(0);
            $main_type = (!empty($site[self::TYPE])) ?
                substr($site[self::TYPE], 0, 4) : "binary";
            if (!empty($site[self::IS_VIDEO])) {
                $letter_code = chr(64);
            } else if ($main_type == "text" ) {
                $letter_code = chr(56);
            } else if ($main_type == "imag") {
                $letter_code = chr(48);
            $hash = $site[self::HASH];
            // for text documents with a title, the first four bytes of the
            // hash are replaced by the start of the hash of the trimmed title
            if ($letter_code == chr(56) && !empty($site[self::TITLE])) {
                $trim_title = trim($site[self::TITLE]);
                $hash = substr(crawlHash($trim_title, true), 0, 4) .
                    substr($hash, 4);
            $site_url = str_replace('|', "%7C", $site[self::URL]);
            $host = UrlParser::getHost($site_url);
            $cld = UrlParser::getCompanyLevelDomain($site_url);
            // 128 is added when the url is the root page of the company
            // level domain or of www. followed by that domain
            if (in_array($site_url, ["https://$cld/",
                "https://www.$cld/", "http://$cld/", "http://www.$cld/"])) {
                $letter_code = chr(ord($letter_code) + 128);
            // 4 is added when the company level domain contains "wikipedia"
            if (strpos($cld, "wikipedia") !== false) {
                $letter_code = chr(ord($letter_code) + 4);
            $num_slashes = substr_count(substr($site[self::URL], strlen($host)),
             * Discount any trailing slashes in the URL.
            if ($num_slashes > 0 && substr($site[self::URL], -1) == '/') {
             * The first two bits hold the number of / values (for the
             * NUM_SLASHES_BONUS). This value is mapped into buckets of
             * {0-1, 2-4,  5-6, 7+}, wherein all the values in a bucket get the
             * same bonus. These  buckets were decided after experimentation;
             * the fundamental idea is  that URLs for root pages/singly-nested
             * pages are usually more important than those doubly nested  or
             * quadruply nested, which are in turn  more important than those
             * quintuply nested, etc.
            if ($num_slashes >= 2 && $num_slashes < 5) {
                $letter_code = chr(ord($letter_code) + 1);
            }  else if ($num_slashes >=5  && $num_slashes < 7){
                $letter_code = chr(ord($letter_code) + 2);
            } else if ($num_slashes >=7) {
                $letter_code = chr(ord($letter_code) + 3);
            // doc id = hash of url . (possibly title-adjusted) hash .
            // type byte . hash of host with "/" appended minus its first byte
            $doc_id = crawlHash($site_url, true) . $hash .
                $letter_code . substr(crawlHash($host . "/", true), 1);
        return $doc_id;
     * Used to add a doc_id => doc_record to the current partition's
     * document map ($this->doc_map). A doc record records the number of words
     * in the document, an overall length of the document, the length of its
     * title, scores for each of the sentences included into the summary
     * for the documents, and classifier scores for each classifier that was
     * used by the crawl.
     * @param string $doc_id new document id to add a record for
     * @param int $num_words number of terms in the document associated with the
     *  doc-id
     * @param float $score overall score for the importance of this document
     * @param int $host_keywords_end_pos end of the  portion of the
     *  document summary containing terms coming from the hostname
     * @param int $title_end_pos end of the portion of the document
     *  summary containing terms in the title
     * @param int $path_keywords_end_pos length of the portion of the
     *  document summary containing terms in the url path
     * @param array $description_scores pairs of the form (length of summary
     *  portion, score for that portion)
     * @param array $user_ranks for each user defined classifier for this crawl
     *  the float score of the classifier on this document
    public function addScoresDocMap($doc_id, $num_words, $score,
        $host_keywords_end_pos, $title_end_pos, $path_keywords_end_pos,
        $description_scores, $user_ranks, $terms_filter = "")
    {
        // Builds the packed doc_map record for $doc_id and adds it to the
        // in-memory doc_map of the current partition.
        $num_description_scores = count($description_scores);
        /* host, title and path end positions share one integer; each
           earlier field is shifted left by 8 bits to make room for the next
         */
        $preface_positions =
            ((($host_keywords_end_pos << 8) + $title_end_pos) << 8) +
            $path_keywords_end_pos;
        /* row 0: (number of words, overall score)
           row 1: (packed preface positions, number of description scores)
         */
        $out_rows = [["POS" => $num_words, "SCORE" => floatval($score)],
            ["POS" => $preface_positions, "SCORE" =>
            floatval($num_description_scores)]];
        foreach ($description_scores as $position => $description_score) {
            $out_rows[] = ["POS" => $position,
                "SCORE" => floatval($description_score)];
        }
        /* each classifier rank gets its own row. The value stored has to be
           the rank itself, not the score variable of the preceding loop.
         */
        foreach ($user_ranks as $user_rank) {
            $out_rows[] = ["POS" => 0, "SCORE" => floatval($user_rank)];
        }
        $entry = $this->doc_map_tools->pack($out_rows);
        /* the doc_map entry is prepended with a string representing
         * the bloom filter of terms in the document
         */
        $entry = $terms_filter . $entry;
        $this->doc_map_tools->add($this->doc_map, $doc_id, $entry,
            PackedTableTools::ADD_MEM_TABLE);
    }
     * Creates a bloom filter string made up of the 300 most
     * important terms in the current document. This filter is used
     * later to check if a term belongs to the document.
     * @param array $terms terms in document
     * @return string term bloom filter, prepended with 't'
     * to check for backward compatibility
    public static function storeTerms($terms)
    {
        // Builds the per-document term bloom filter: every term sets one bit
        // per hash function in a zero-initialised byte string.
        $num_hashes = self::TERMSFILTER_HASHFN_COUNT;
        $num_bits = self::TERMSFILTER_BITS_LEN;
        $filter = str_repeat(chr(0), self::TERMSFILTER_LEN);
        foreach ($terms as $raw_term) {
            $term_id = canonicalTerm($raw_term);
            $seed = 0;
            while ($seed < $num_hashes) {
                // the hash function number is appended to the term as a seed
                $slot = crc32($term_id . $seed) % $num_bits;
                $byte_index = (int)($slot >> 3);
                $mask = 1 << ($slot & 7);
                $filter[$byte_index] = chr(ord($filter[$byte_index]) | $mask);
                $seed++;
            }
        }
        // the leading 't' marks the entry as carrying a terms filter
        return 't' . $filter;
    }
     * Check if the current term id exists in the term bloom filter
     * associated with the doc_map entry.
     * @param string $term to look up
     * @param string $terms_filter term bloom filter
     * @return boolean exists or not
    public static function checkTermExists($term, $terms_filter)
    {
        // Tests $term against a bloom filter byte string: the term is
        // reported present only if the bit for every hash function is set.
        $hash_functions = self::TERMSFILTER_HASHFN_COUNT;
        $size = self::TERMSFILTER_BITS_LEN;
        for ($i = 0; $i < $hash_functions; $i++) {
            $hash = crc32($term . $i) % $size;
            $byte = (int)($hash >> 3);
            /* the bit within the byte is the low three bits of the hash.
               This has to be the same computation storeTerms() uses when
               setting bits, otherwise stored terms are reported missing.
             */
            $bit = $hash & 7;
            $ascii_char = ord($terms_filter[$byte]);
            if (($ascii_char & (1 << $bit)) == 0) {
                return false;
            }
        }
        return true;
    }
     * Adds posting records associated to a document to the posting lists for
     * a partition.
     * @param int $position_offset number of header bytes that might be used
     *  before including any position data in the file that positions will
     *  eventually be stored.
     * @param array $word_lists term => positions within current document of
     *  that term for the document whose posting data we are adding
     * @param array $meta_ids meta terms associated with the document we are
     *  adding. An example, meta term might be "media:news"
     * @param int $doc_map_index which document within the partition is the one
     *  we are adding. I.e., 5 would mean there were 5 earlier documents whose
     *  postings we have already added.
    public function addTermPostingLists($position_offset, $word_lists,
        $meta_ids, $doc_map_index)
        $postings_tools = $this->postings_tools;
        $last_entries_tools = $this->last_entries_tools;
        // meta ids are treated as terms with an empty position list
        // (overwriting any word list stored under the same key)
        foreach ($meta_ids as $meta_id) {
            $word_lists[$meta_id] = [];
        foreach ($word_lists as $word => $position_list) {
            $term_id = canonicalTerm($word);
            $occurrences = count($position_list);
            if ($occurrences > 0) {
                // encoded positions are appended to $this->positions; the
                // offset recorded is where they start, shifted by
                // $position_offset
                $encoded_position_list = encodePositionList($position_list);
                $offset = $position_offset + strlen($this->positions);
                $len = strlen($encoded_position_list);
                $this->positions .= $encoded_position_list;
            } else {
                // terms without positions record offset 0 and length 0
                $offset = 0;
                $len = 0;
            $last_entry = $last_entries_tools->find($this->last_entries,
            if (empty($last_entry)) {
                list($last_index, $last_offset, $num_occurrences) = [0, 0, 0];
            } else {
                $last_entry_row = $last_entries_tools->unpack($last_entry);
                list($last_index, $last_offset, $num_occurrences) =
            // postings store differences from the term's previous posting
            $diff_doc_map_index = $doc_map_index - $last_index;
            $diff_offset = ($occurrences > 0) ?
                $offset - $last_offset : 0;
            //note:pack adds vByteEncode of num rows packed to front
            $entry = $postings_tools->pack([
                "DOC_MAP_INDEX" => $diff_doc_map_index,
                "FREQUENCY" => $occurrences, "POSITIONS_OFFSET" => $diff_offset,
                "POSITIONS_LEN" => $len]);
            /* multiple entries can be associated with the same term_id.
               term_id => vbyte_encoded_num_entries entry1 \xFF entry2 ...
            $postings_tools->add($this->postings, $term_id, $entry,
                PackedTableTools::ADD_MEM_TABLE, PackedTableTools::APPEND_MODE);
            // NOTE(review): for a posting without positions LAST_OFFSET is
            // written as 0 -- confirm a term never mixes postings with and
            // without positions, as the next delta would be taken against 0
            $add_entry = $last_entries_tools->pack(
                ["LAST_INDEX" => $doc_map_index, "LAST_OFFSET" => $offset,
                "NUM_OCCURRENCES" => $num_occurrences + $occurrences]);
            $last_entries_tools->add($this->last_entries, $term_id, $add_entry);
     * Checks if a doc_id $key is that of a host url.
     * I.e., a url as opposed to
     * @param string $key to check if doc or not
    public static function isAHostDocId($key)
    {
        // only keys of full doc id length can be host doc ids
        if (strlen($key) != self::DOCID_LEN) {
            return false;
        }
        // bytes 1 through 7 of the key must repeat as its last seven bytes
        $url_part = substr($key, 1, 7);
        $host_part = substr($key, -7);
        return $url_part == $host_part;
    }
     * Checks if a doc_id $key is that of a Company level domain (cld) or
     * www.cld.
     * I.e., a url  or as opposed to
     * @param string $key to check if doc or not
    public static function isACldDocId($key)
    {
        // a missing type byte is treated as chr(0)
        $type_byte = $key[self::DOCID_PART_LEN << 1] ?? chr(0);
        // bit value 128 of the type byte flags a company level domain page
        $cld_flag = ord($type_byte) & 128;
        return $cld_flag > 0;
    }
     * Checks if a doc_id $key is that of a Wikipedia page.
     * @param string $key to check if Wikipedia page or not
    public static function isAWikipediaPage($key)
    {
        // a missing type byte is treated as chr(0)
        $type_byte = $key[self::DOCID_PART_LEN << 1] ?? chr(0);
        // bit value 4 of the type byte flags a Wikipedia page
        $wiki_flag = ord($type_byte) & 4;
        return $wiki_flag > 0;
    }
     * Finds number of '/' in the url after the hostname represented by doc_id
     * $key.
     * @param string $key to find '/' count
    public static function findNumSlashes($key)
    {
        // a missing type byte is treated as chr(0)
        $type_byte = $key[self::DOCID_PART_LEN << 1] ?? chr(0);
        // the two lowest bits of the type byte hold the slash bucket
        $slash_bucket = ord($type_byte) & 3;
        return $slash_bucket;
    }

     * Checks if a doc_id corresponds to a particular large scale type among
     * external_link, internal_link, link (union of previous two),
     * binary, feed, image, text, video, document (union of previous five)
     * @param string $key to check if doc or not
     * @param string|array if a string then a particular type from above list
     *  to check against if array then an array of types to check against
     * @return bool true if a document
    public static function isType($key, $types)
    {
        // Decides whether the type byte of doc id $key names one of $types.
        $type_map = [
            0 => "binary",
            8 => "old_doc",
            16 => "external_link",
            24 => "feed",
            32 => "internal_link",
            40 => "old_link",
            48 => "image",
            56 => "text",
            64 => "video",
        ];
        /*
         * This map is maintained for backward compatibility, i.e.,
         * for the $key values using the previous letter code format.
         */
        $old_type_map = [
            "b" => "binary",
            "d" => "old_doc",
            "e" => "external_link",
            "f" => "feed",
            "i" => "internal_link",
            "l" => "old_link",
            "p" => "image",
            "t" => "text",
            "v" => "video",
        ];
        if (is_string($types)) {
            $types = [$types];
        }
        // expand the union types into their members
        if (in_array("link", $types)) {
            $types = array_merge($types, ["external_link", "internal_link",
                "old_link"]);
        } else if (in_array("doc", $types) || in_array("document", $types)) {
            $types = array_merge($types, ["binary", "feed", "image",
                "old_doc", "text", "video"]);
        }
        /* Read the type byte once. A key too short to have one is treated
           as chr(0), as isACldDocId(), isAWikipediaPage() and
           findNumSlashes() do. An integer 0 fallback would be converted to
           the character "0" (code 48) and so be read as an image.
         */
        $type_code = ord($key[self::DOCID_PART_LEN << 1] ?? chr(0));
        if (($type_code & 96) != 96) {
            // current format: the type is held in the bits selected by 120
            return in_array($type_map[$type_code & 120] ?? "old_link",
                $types);
        }
        // $key uses the old letter code format
        $old_letter = chr($type_code & 127);
        return in_array($old_type_map[$old_letter] ?? "old_link", $types);
    }
     * As pre-step to calculating the inverted index information for a partition
     * this method groups documents and links to documents into single objects.
     * It also does simple deduplication of documents that have the same hash.
     * It then returns an array of the grouped document data.
     * Grouping is done by giving a score to each document based on
     * (number of doc in index - order doc added). For two entries with
     * the same hash_url, a document will be chosen over a link as the
     * representative; otherwise, the one with higher score will be chosen as
     * the representative. The representative document is given the sum of
     * the scores of its constituents. A second phase where documents are
     * grouped by hash of the text body is also done. Finally, the returned
     * documents are sorted by their scores. So the order of documents from
     * this process is roughly in the order of importance.
     * @param int $partition index of partition to do deduplication for
     *  in the case that test index is empty
     * @param array $test_index is non-null only when doing testing of what
     *  this method does. In which case, it should consist of an array
     *  of $doc_id => string represent a possible record for that doc.
     *  As deduplication is done entirely based on component of the doc_id
     *  (hash_url, doc_type, hash_doc, hash_host) the string doesn't matter
     *  too much.
     * @return array groups doc_id => records associated with that doc_id
    public function prepareIndexMap($partition, $test_index = [])
        // a non-empty $test_index is used in place of the partition's index
        if (empty($test_index)) {
            $doc_index = $this->documents->loadPartitionIndex($partition, true);
        } else {
            $doc_index = $test_index;
        if (empty($doc_index)) {
            return [];
        $doc_ids = array_keys($doc_index);
        $num_ids = count($doc_ids);
        $grouped_urls = [];
        $grouped_hashes = [];
        $score = $num_ids;
        $doc_key_len = self::DOCID_PART_LEN;
        $doc_field = self::DOC_ID;
        $score_field = self::SCORE;
        $aux_docs_field = self::AUX_DOCS;
        // first phase: group doc ids by the hash of their url
        foreach ($doc_ids as $doc_id) {
            // the first two DOCID_PART_LEN sized pieces of the doc id are
            // used as the url hash and the document hash
            list($hash_url, $hash_code, ) = str_split($doc_id, $doc_key_len);
            $current_grouped_urls = $grouped_urls[$hash_url] ??
                [$aux_docs_field => "", $score_field => 0];
            $current_grouped_hashes = $grouped_hashes[$hash_code] ?? "";
            if (!$this->isType($doc_id, "link")) {
                // a non-link becomes the group's DOC_ID and its url hash is
                // recorded under the document hash
                $current_grouped_hashes .= "\xFF". encode255($hash_url);
                $current_grouped_urls[$doc_field] = $doc_id;
            } else {
                // links are appended to the group's AUX_DOCS string
                $current_grouped_urls[$aux_docs_field] .= "\xFF".
            $current_grouped_urls[$score_field] += $score;
            if (!empty($current_grouped_hashes)) {
                $grouped_hashes[$hash_code] = $current_grouped_hashes;
            $grouped_urls[$hash_url] = $current_grouped_urls;
        // second phase: merge url groups whose documents share a hash
        foreach ($grouped_hashes as $pre_same_hash_group) {
            if (strlen($pre_same_hash_group) <= 2 * $doc_key_len) {
            $max_score = 0;
            $max_url = "";
            $same_hash_group = explode("\xFF", $pre_same_hash_group);
            // find the url group with the highest score
            foreach ($same_hash_group as $pre_hash_url) {
                if (empty($pre_hash_url)) {
                $hash_url = decode255($pre_hash_url);
                $hash_score = $grouped_urls[$hash_url][$score_field];
                if ($hash_score > $max_score) {
                    $max_score = $hash_score;
                    $max_url = $hash_url;
            $max_group = $grouped_urls[$max_url];
            // the other groups add their score to the best one and have
            // their AUX_DOCS set to "metas_only"
            foreach ($same_hash_group as $pre_hash_url) {
                if (empty($pre_hash_url)) {
                $hash_url = decode255($pre_hash_url);
                if ($hash_url != $max_url) {
                    $hash_group = $grouped_urls[$hash_url];
                    $max_group[$score_field] += $hash_group[$score_field];
                    if ($max_group[$aux_docs_field] != "metas_only") {
                        $max_group[$aux_docs_field] .= "\xFF" .
                            encode255($hash_group[$doc_field]) .
                    $hash_group[$aux_docs_field] = "metas_only";
                    $grouped_urls[$hash_url] = $hash_group;
            $grouped_urls[$max_url] = $max_group;
        // order the groups from highest to lowest score, keeping their keys
        uasort($grouped_urls, function ($a, $b) use ($score_field) {
            return intval($b[$score_field] - $a[$score_field]);
        return $grouped_urls;
     * Forces the current shard to be saved
    // NOTE(review): no statements are visible for this method here --
    // confirm whether it is intentionally a no-op in this bundle type
    public function forceSave()
     * Used when a crawl stops to perform final dictionary operations
     * to produce a working stand-alone index.
    // NOTE(review): no statements are visible for this method here --
    // confirm whether it is intentionally a no-op in this bundle type
    public function stopIndexing()
     * Gets an array of posting list positions for each shard in the
     * bundle $index_name for the word id $term_id
     * @param string $term_id id of phrase or word to look up in bundle
     *     dictionary
     * @param int $threshold after the number of results exceeds this amount
     *     stop looking for more dictionary entries.
     * @param int $offset
     * @param int $num_partitions
     * @param bool $with_remaining_total whether to total number of
     *      postings found as well or not
     * @return array either [total, sequence of four tuples]
    *       or sequence of four tuples:
     *      (index_shard generation, posting_list_offset, length, exact id
     *      that match $term_id)
    public function getWordInfo($term_id, $threshold = -1,
        $offset = 0, $num_partitions = -1, $with_remaining_total = false)
        // without a dictionary nothing can be looked up
        $dictionary = $this->dictionary ?? [];
        if (!$dictionary) {
            return [];
        $result = ["ROWS" =>
            $dictionary->get($term_id, $offset, $num_partitions)];
        if (empty($result["ROWS"])) {
            $result = [];
        $max_found_partition = 0;
        $doc_count = 0;
        $occurrence_count = 0;
        $num_rows = 0;
        $threshold_met = false;
        $save_partition = $this->documents->parameters["SAVE_PARTITION"];
        if (empty($result['ROWS'])) {
            $result['ROWS'] = [];
        // total up documents and occurrences over the dictionary rows; a
        // positive $threshold caps how many documents are counted
        foreach ($result['ROWS'] as $row) {
            if ($threshold > 0 && $doc_count > $threshold) {
                $result['ROWS'] = array_slice($result['ROWS'], 0, $num_rows);
                $threshold_met = true;
            $max_found_partition = ($max_found_partition < $row['PARTITION']) ?
                $row['PARTITION'] : $max_found_partition;
            $doc_count += $row['NUM_DOCS'];
            $occurrence_count += $row['NUM_OCCURRENCES'];
        $parameters = $this->documents->parameters;
        $result['AVG_ITEMS_PER_PARTITION'] = $doc_count/max($num_rows, 1.0);
        $result['TOTAL_NUM_DOCS'] = $parameters["VISITED_URLS_COUNT"] ?? 0;
        $result['TOTAL_NUM_LINKS_AND_DOCS'] = $parameters["ACTIVE_COUNT"] +
        $result['MAX_ITEMS_PER_PARTITION'] = $parameters["MAX_ITEMS_PER_FILE"];
        $result['TOTAL_NUMBER_OF_PARTITIONS'] = $parameters["SAVE_PARTITION"]
            + 1;
        if ($threshold_met) {
            // totals are scaled up by the ratio of the partition range
            // available to the partition range actually examined
            // NOTE(review): the divisor is 0 when $max_found_partition
            // equals $offset (e.g., only partition 0 seen with $offset 0);
            // PHP 8 throws DivisionByZeroError then -- confirm unreachable
            $fraction_seen = ($save_partition - $offset) /
                ($max_found_partition - $offset);
            $result['TOTAL_COUNT'] = $fraction_seen * $doc_count;
            $result['TOTAL_OCCURRENCES'] = $fraction_seen * $occurrence_count;
            // NOTE(review): the key is spelled THESHOLD_EXCEEDED; any
            // reader of this result has to use the same spelling
            $result['THESHOLD_EXCEEDED'] = true;
            return $result;
        // threshold not met: also consult the postings file of the
        // partition numbered SAVE_PARTITION
        $base_folder = $this->getPartitionBaseFolder($save_partition);
        $postings_filename = $base_folder . "/" . self::POSTINGS_FILENAME;
        $postings_tool = $this->postings_tools;
        if (file_exists($postings_filename)) {
            $active_dictionary =
                $postings_tool::AS_STRING_MODE, true);
            $active_postings_entry =
            $active_postings = (empty($active_postings_entry)) ? [] :
        if (!empty($active_postings)) {
            // postings found there are appended as one additional row
            $row = ["PARTITION" => $save_partition,
                "NUM_DOCS" => count($active_postings),
                "POSTINGS" => $active_postings];
            $doc_count += $row["NUM_DOCS"];
            $active_occurrences = $this->deDeltaPostingsSumFrequencies(
            $row['NUM_OCCURRENCES'] = $active_occurrences;
            $occurrence_count += $active_occurrences;
            $result['ROWS'][] = $row;
        $result['TOTAL_COUNT'] = $doc_count;
        $result['TOTAL_OCCURRENCES'] = $occurrence_count;
        return $result;
     * Get the postings stored in the postings file in a partition from
     * $offset to $offset+len remove the 255 encoding.
     * @param int $partition partition to retrieve posting from
     * @param int $offset byte offset into partition/postings file to look for
     *  them
     * @param int $len length of the posting list to retrieve.
     * @return string encoded posting list data -- vbyte encoded number of
     *  postings, followed by the posting data in PackedTableTools format
    public function getPostingsString($partition, $offset, $len)
        // open postings file handles are kept between calls, keyed by
        // partition
        static $file_handles = [];
        static $memory_limit = 0;
        if (!$memory_limit) {
            $memory_limit =
        // drop the oldest handle when memory use or the number of cached
        // handles gets too high
        // NOTE(review): array_shift() renumbers integer keys from zero, so
        // if $partition values are integers the remaining handles end up
        // stored under other partition numbers -- confirm; the dropped
        // handle is also not passed to fclose() here
        if (memory_get_usage() > $memory_limit ||
            count($file_handles) > self::MAX_POSTING_CACHE_ITEMS) {
            array_shift($file_handles); /*just in case file handles causing
                    memory leak */
        if (empty($file_handles[$partition])) {
            $postings_filename = $this->getPartitionBaseFolder($partition) .
                "/" . IndexDocumentBundle::POSTINGS_FILENAME;
            if (file_exists($postings_filename)) {
                $fh = fopen($postings_filename, "r");
            } else {
                return "";
            $file_handles[$partition] = $fh;
        } else {
            $fh = $file_handles[$partition];
            unset($file_handles[$partition]); // move to front of queue
            $file_handles[$partition] = $fh;
        // read $len bytes starting at $offset and undo the 255 encoding
        // NOTE(review): fread() signals failure with false rather than
        // null, so the ?? "" fallback does not cover a failed read --
        // confirm decode255 tolerates false
        if ($fh && fseek($fh, $offset) == 0 && $len > 0) {
            $out = decode255(fread($fh, $len) ?? "");
            return $out;
        return "";
     * Given the postings as a string for a partition for a term, unpacks them
     * into an array of postings, doing de-delta of doc_map_indices and
     * de-delta of positions. Each posting represents occurrence of a term
     * in a document, so the frequency component is the number of occurrences
     * of the term in the document. This method also computes the sum of these
     * frequencies over all postings in partition.
     * @param string $postings_string compress string representation of a
     *   set of postings for a term
     * @return array a pair [array of unpacked postings, sum of frequencies
     *   of all the postings]
    public function unpackPostings($postings_string)
        // NOTE(review): an empty input yields a bare empty array rather
        // than the [items, sum] pair returned below -- confirm callers
        // handle both shapes
        if (empty($postings_string)) {
            return [];
        $unpack_map = $this->unpack_map;
        $unpack_len_map = $this->unpack_len_map;
        $current_pos = 0;
        // the string starts with the vbyte encoded number of postings
        $num_items = vByteDecode($postings_string, $current_pos);
        $items = [];
        $sum_frequencies = 0;
        $doc_map_index = 0;
        $positions_offset = 0;
        $len_posting_strings = strlen($postings_string);
        for ($i = 0; $i < $num_items; $i++) {
            if (!isset($postings_string[$current_pos])) {
                 crawlLog("Posting decode error - Start beyond posting");
                 crawlLog("..Number to decode items: " . $num_items);
                 crawlLog("..Number decoded: " . $i);
                 crawlLog("..Length posting string: " .
                crawlLog("..Current position: " . $current_pos);
                return [$items, $sum_frequencies]; // sanity check 1
            // the first byte of a posting selects both its unpack format
            // and its length in bytes
            $int_info = ord($postings_string[$current_pos]);
            $len_unpack_info = $unpack_len_map[$int_info];
            if ($current_pos + $len_unpack_info > $len_posting_strings) {
                crawlLog("Posting decode error -".
                    " Decode length longer than string");
                crawlLog(".. Decode Format Length was: " . $len_unpack_info);
                crawlLog("..Number to decode items: " . $num_items);
                crawlLog("..Length needed to decode: " .
                    ($len_unpack_info * $num_items));
                crawlLog("..Number decoded: " . $i);
                crawlLog("..Length posting string: " .
                crawlLog("..Current position: " . $current_pos);
                return [$items, $sum_frequencies]; // sanity check 2
            $pre_item = unpack($unpack_map[$int_info], $postings_string,
            if ($pre_item["FREQUENCY"] > C\MAX_DESCRIPTION_LEN) {
                crawlLog("Posting decode error! Frequency too large");
                crawlLog(".. Decode Format was: " . $unpack_map[$int_info]);
                crawlLog("..Number to decode items: " . $num_items);
                crawlLog("..Number decoded: " . $i);
                crawlLog("..Length posting string: " .
                crawlLog("..Current position: " . $current_pos);
                crawlLog("..Large Frequency Observed: ".
                    $pre_item["FREQUENCY"] .
                    " more than max description length:".
                return [$items, $sum_frequencies]; // sanity check 3
            // undo the deltas: add the running totals of the earlier
            // postings to this posting's stored differences
            $item = $pre_item;
            $item["DOC_MAP_INDEX"] += $doc_map_index;
            $item["POSITIONS_OFFSET"] += $positions_offset;
            $doc_map_index += $pre_item["DOC_MAP_INDEX"];
            $positions_offset += $pre_item["POSITIONS_OFFSET"];
            $sum_frequencies += $pre_item["FREQUENCY"];
            $current_pos += $len_unpack_info;
            $items[] = $item;
        return [$items, $sum_frequencies];
     * Within postings, DOC_MAP_INDEX and POSITIONS_OFFSET (offsets to position
     * lists) are stored as delta lists (difference over previous values). This
     * method undoes the delta encoding to restore the actual DOC_MAP_INDEX and
     * POSITIONS_OFFSET values. It also computes the sum of the frequencies of
     * items within the list of postings. This method is currently only used
     * for the active partition in an index (the one whose terms haven't yet
     * been added to the LSMtree).
     * @param array &$postings a reference to an array of posting lists for a
     *  term (this will be changed by this method)
     * @return int sum of the frequencies of term occurrences as given by the
     *  above postings
    public function deDeltaPostingsSumFrequencies(&$postings)
        // Nothing to decode: an empty or non-array argument yields a sum of 0
        // and is left untouched.
        if (empty($postings) || !is_array($postings)) {
            return 0;
        // Seed the running totals. The loop below starts at index 1, so these
        // presumably come from the first posting, whose values are already
        // absolute -- TODO confirm
        list($doc_map_index, $sum_frequencies, $positions_offset) =
        $num_postings = count($postings);
        for ($i = 1; $i < $num_postings; $i++) {
            // bound by reference so the writes below modify $postings in place
            $posting = & $postings[$i];
            list($doc_map_delta, $frequency, $positions_delta) =
            $sum_frequencies += $frequency;
            // add this posting's deltas to the running absolute values
            $doc_map_index += $doc_map_delta;
            $positions_offset += $positions_delta;
            // store the absolute values back on the posting
            $posting["DOC_MAP_INDEX"] = $doc_map_index;
            $posting["POSITIONS_OFFSET"] = $positions_offset;
        return $sum_frequencies;
     * Gets the description, count of documents, and number of partitions of the
     * documents stored in the supplied directory. If the file
     * arc_description.txt exists, this is viewed as a dummy index archive for
     * the sole purpose of allowing conversions of downloaded data such as arc
     * files into Yioop! format.
     * @param string $dir_name path to a directory containing a documents
     *      IndexDocumentBundle
     * @return array summary of the given archive
    public static function getArchiveInfo($dir_name)
        // Case 1: folder carries an arc_description.txt. Return a synthetic
        // info array with all counts zeroed rather than reading any stored
        // bundle info.
        if (file_exists($dir_name . "/arc_description.txt")) {
            $crawl = [];
            $info = [];
            // only the first 256 bytes of the description file are kept
            $crawl['DESCRIPTION'] = substr(
                file_get_contents($dir_name . "/arc_description.txt"), 0, 256);
            $crawl['ARCFILE'] = true;
            $info['VISITED_URLS_COUNT'] = 0;
            $info['COUNT'] = 0;
            $info['NUM_DOCS_PER_PARTITION'] = 0;
            $info['WRITE_PARTITION'] = 0;
            $info["VERSION"] = self::DEFAULT_VERSION;
            // DESCRIPTION holds the serialized $crawl array, not plain text
            $info['DESCRIPTION'] = serialize($crawl);
            return $info;
        // Case 2: no archive info file. Return a placeholder whose
        // NUM_DOCS_PER_PARTITION is -1 and whose DESCRIPTION is an
        // explanatory message.
        $info_path = $dir_name . "/" . self::ARCHIVE_INFO_FILE;
        if (!file_exists($info_path)) {
            $info = [];
            $info['DESCRIPTION'] =
                "Archive does not exist OR Archive description file not found";
            $info['COUNT'] = 0;
            $info['NUM_DOCS_PER_PARTITION'] = -1;
            $info["VERSION"] = self::DEFAULT_VERSION;
            return $info;
        // Case 3: read the stored info; anything that does not unserialize to
        // an array is treated as empty.
        $info = unserialize(file_get_contents($info_path));
        if (!is_array($info)) {
            $info = [];
        // parameter info kept by the PartitionDocumentBundle; a non-array
        // result is likewise treated as empty
        $table_info = PartitionDocumentBundle::getParameterInfo($dir_name . "/".
        if (!is_array($table_info)) {
            $table_info = [];
        // Keys present in both arrays are first dropped from $info, so after
        // the merge those keys carry the $table_info values.
        $info = array_diff_key($info, $table_info);
        $info = array_merge($table_info, $info);
        return $info;
     * Sets the archive info struct for the web archive bundle associated with
     * this bundle. This struct has fields like: DESCRIPTION
     * (serialized store of global parameters of the crawl like seed sites,
     * timestamp, etc).
     * @param string $dir_name folder with archive bundle
     * @param array $update_info struct with above fields
    public static function setArchiveInfo($dir_name, $update_info)
        $archive_info_path = $dir_name . "/" . self::ARCHIVE_INFO_FILE;
        // Start from the currently recorded info when the info file exists.
        // Otherwise $info is still unset here; empty() below accepts an unset
        // variable, so it is then initialized to [].
        if (file_exists($archive_info_path)) {
            $info = self::getArchiveInfo($dir_name);
        if (empty($info) || !is_array($info)) {
            $info = [];
        // $pdb_info: parameter info read from the documents folder (if any);
        // $got_pdb_info: whether a non-empty result was actually read
        $pdb_info = [];
        $got_pdb_info = false;
        if (!empty($info)) {
            $doc_folder =  $dir_name. "/" . self::DOCUMENTS_FOLDER;
            if (file_exists($doc_folder)) {
                $pdb_info = PartitionDocumentBundle::getParameterInfo(
                if (!empty($pdb_info)) {
                    $got_pdb_info = true;
                // avoid getting same data (_COUNTS) stored in two locations
                if (!empty($info) && !empty($pdb_info)) {
                    $info = array_diff_key($info, $pdb_info);
        // Apply the updates: a field already present in $pdb_info is updated
        // there and flags that the parameter info changed.
        $pdb_change = false;
        foreach ($update_info as $field => $value) {
            if (isset($pdb_info[$field])) {
                $pdb_info[$field] = $value;
                $pdb_change = true;
            $info[$field] = $value;
        // make sure a version is always recorded
        if (empty($info["VERSION"])) {
            $info["VERSION"] = self::DEFAULT_VERSION;
        file_put_contents($archive_info_path, serialize($info));
        // Rewrite the parameter info only if it was loaded and then changed.
        // $got_pdb_info can only be true in the branch that also assigned
        // $doc_folder, so $doc_folder is defined here.
        if ($got_pdb_info && $pdb_change) {
            $parameter_path = $doc_folder . "/" .
            file_put_contents($parameter_path, serialize($pdb_info),
     * Returns the last time the archive info of the bundle was modified.
     * @param string $dir_name folder with archive bundle
     * @return mixed either time if file exists or false
    public static function getParamModifiedTime($dir_name)
        // path to a file inside this bundle's documents folder
        $doc_param_path = $dir_name . "/" . self::DOCUMENTS_FOLDER . "/" .
        // report that file's modification time when it exists
        if (file_exists($doc_param_path)) {
            return filemtime($doc_param_path);
        // no such file
        return false;