viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php /** * SeekQuarry/Yioop -- * Open Source Pure PHP Search Engine, Crawler, and Indexer * * Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org * * LICENSE: * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <https://www.gnu.org/licenses/>. * * END LICENSE * * @author Chris Pollett chris@pollett.org * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ * @copyright 2009 - 2024 * @filesource */ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; /** * This class implements a Log Structured merge tree data structure suitable for * storing and retrieving sortable records * * @author Chris Pollett */ class LSMTree { /** * */ const BLOCK_FACTOR = 10000; /** * */ const RECORD_CACHE_SIZE = 100; /** * */ const DATA_FILE_PREFIX = "D"; /** * */ const FOLDER_PREFIX = "F"; /** * */ const INDEX_FILE = "index.txt"; /** * */ const MAX_FILE_SIZE = 32768; /** * */ const MAX_TIER_FILENAME = "max_tier.txt"; /** * */ const TIER_PREFIX = "Tier"; /** * @var string */ public $base_slot = "A"; /** * @var string */ public $block_factor; /** * Folder for storing the LSMTree files * @var string */ public $folder; /** * @var int */ public $max_file_size; /** * * @var Tier */ public $put_slot = null; /** * PackedTableTools used to pack/unpack key/values records * @var PackedTableTools */ public $table_tools; /** * The highest tiered index in the LSMTree * @var int */ public static $max_tier; /** * Creates/Loads LSM-Tree having specified folder and minimum_degree. * * @param string $folder is the folder for storing the B+-Tree files * @param array $format the column names, keys and types for this * LSTM object * @param int $max_file_size */ public function __construct($folder, $format, $max_file_size = self::MAX_FILE_SIZE, $block_factor = self::BLOCK_FACTOR) { if (empty($format['PRIMARY KEY'][1]) || $format['PRIMARY KEY'][1] < 0) { throw new \Exception( "LSMTree class requires fixed lengthed keys"); } $this->folder = $folder; if (!file_exists($folder)) { mkdir($folder); chmod($folder, 0777); } $this->getMaxTier(); //computes max tier if not set yet $this->table_tools = new PackedTableTools($format); $this->max_file_size = $max_file_size; $this->block_factor = $block_factor; } /** * @return string */ public function getTierFolder($tier) { return $this->folder . "/" . self::TIER_PREFIX . sprintf("%'.04d", $tier); } /** * @return bool */ public function occupiedTier($tier) { return file_exists($this->getTierFolder($tier) . "/A"); } /** * Returns the highest occupied tier of the LSMTree * @param bool $recompute whether to return cache value if exists (false) or * recompute it by examing the file system (true) * @return int the maximum tier */ public function getMaxTier($recompute = false) { if (!$recompute && isset(self::$max_tier)) { return self::$max_tier; } $max_path = $this->folder . "/" . self::MAX_TIER_FILENAME; if (!$recompute && file_exists($max_path)) { return self::$max_tier = intval(file_get_contents($max_path)); } $tier_path = $this->folder . "/" . self::TIER_PREFIX; $tiers = glob($tier_path . "*"); sort($tiers); $last_tier = (!empty($tiers) && count($tiers) > 0) ? $tiers[count($tiers) - 1] : $this->folder . "/" . self::TIER_PREFIX . "0"; $max_tier = intval(substr($last_tier, strlen($tier_path))); file_put_contents($max_path, $max_tier); return self::$max_tier = $max_tier; } /** * Within a tier select which of the two slots (A or B) to write * entry data to */ public function selectPutSlot($letter) { $this->base_slot = ($letter == "A") ? "A" : "B"; $this->put_slot = null; } /** * */ public function put($row) { if (empty($this->put_slot)) { $put_folder = $this->getTierFolder(0) . "/" . $this->base_slot; $this->put_slot = new Tier($put_folder, $this->table_tools, "w", $this->max_file_size, $this->block_factor); } $this->put_slot->put($row , false); } /** * Flushes any in-memory data into the active slot at tier 0. */ public function flush() { if (!empty($this->put_slot)) { $this->put_slot->flush(); } } /** * Merges any tiers with both slots filled in the LSTM into * a tier in a slot one level higher */ public function mergeTiers() { crawlLog("..Begin LSMTiers Merging Tiers.."); $max_tier = $this->getMaxTier(); crawlLog("....begin Max Tier is: $max_tier"); for ($i = 0; $i <= $max_tier; $i++) { $tier_folder = $this->getTierFolder($i); if (file_exists($tier_folder . "/B")) { crawlLog("....Merging Tier $i"); $this->mergeTier($i); } else { crawlLog("....Tier $i doesn't need to be merged."); } } $max_tier = $this->getMaxTier(true); crawlLog("....end Max Tier is: $max_tier"); crawlLog("..End LSMTiers Merging Tiers.."); } /** * Deletes tier $tier from the LSMTree * * @param int $tier tier to delete from LSMTree */ public function emptyTier($tier) { $tier_folder = $this->getTierFolder($tier); $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; $db = new $db_class(); $db->unlinkRecursive($tier_folder, true); } /** * Compares the keys in two LSMTree entries and returns * -1 is $entry_a < $entry_b, 0 if $entry_a == $entry_b, * and 1 if $entry_a > $entry_b * * @param string $entry_a first LSMTree entry * @param string $entry_b first LSMTree entry * @return int result of comparison as described above */ public function compare($entry_a, $entry_b) { $key_len = $this->table_tools->key_len; return strncmp($entry_a, $entry_b, $key_len); } /** * Combines two LSMTree entries with the same key value into a single * entry. The values of an entry begin with number of items stored * followed by items in the format specified by the LSMTree constructor. * So the output items and the two number of items and concatenates the * items themselves. * * @param string $entry_a first entry to combine * @param string $entry_b second entry to combine * @return string cobined entry */ public function mergeEntries($entry_a, $entry_b) { $table_tools = $this->table_tools; list($encoded_key, $values_a) = entryToKeyValues($entry_a, $table_tools); list(, $values_b) = entryToKeyValues($entry_b, $table_tools); $out_values = $table_tools->mergeRowValues($values_a, $values_b); return $encoded_key . $out_values; } /** * Merges two tier slots of the same tier $tier into a single tier slot * at tier $tier + 1. If fewer than two slots filled at a given tier * than does nothing. * * @param int $tier tier to perform merging for */ public function mergeTier($tier) { $tier_folder = $this->getTierFolder($tier); if (!file_exists($tier_folder . "/B")) { return; } if (!file_exists($tier_folder . "/A")) { rename($tier_folder . "/B", $tier_folder . "/A"); return; } $a_tier = new Tier($tier_folder . "/A", $this->table_tools); $b_tier = new Tier($tier_folder . "/B", $this->table_tools); $next_tier_folder = $this->getTierFolder($tier + 1); $target_folder = ($this->occupiedTier($tier + 1)) ? $next_tier_folder . "/B" : $next_tier_folder . "/A"; $target_tier = new Tier($target_folder, $this->table_tools, "w", $this->max_file_size, $this->block_factor); $a_row = $a_tier->next(); $b_row = $b_tier->next(); $cnt = 0; while(!empty($a_row) || !empty($b_row)) { if (empty($b_row) && !empty($a_row)) { $target_tier->put($a_row); $a_row = $a_tier->next(); } else if (empty($a_row) && !empty($b_row)) { $target_tier->put($b_row); $b_row = $b_tier->next(); } else if (!empty($a_row) && !empty($b_row)) { $cmp = $this->compare($a_row, $b_row); if ($cmp < 0) { $target_tier->put($a_row); $a_row = $a_tier->next(); } else if ($cmp > 0) { $target_tier->put($b_row); $b_row = $b_tier->next(); } else { $target_tier->put($this->mergeEntries($a_row, $b_row)); $a_row = $a_tier->next(); $b_row = $b_tier->next(); } } $cnt++; crawlTimeoutLog("......have merged $cnt items of Tier $tier."); } $target_tier->flush(); $this->emptyTier($tier); } /** * Returns the record associated with a $key as stored in the LSMTree. * If $key is not present in tree then returns null * @param string $key to look up in current LSMTree * @param int $offset starting record item associated with this key * to return * @param int $limit maximum number of record items associated with this * key to return, null == till end * @return array of items associated with this key in the LSMTree */ public function get($key, $offset = 0, $limit = null) { $max_tier = $this->getMaxTier(); $rows = []; $max_rows = $offset + ($limit ?? 0); for ($i = $max_tier; $i >= 0; $i--) { $add_rows = $this->getTier($i, $key); if (is_array($add_rows)) { // use array_merge rather than + or get wrong results here $rows = array_merge($rows, $add_rows); } if ($limit > 0 && count($rows) > $max_rows) { break; } } if ($offset > 0 || $limit > 0) { $rows = array_slice($rows, $offset, $limit); } return $rows; } /** * Returns the values associated with a key in a given Tier of the * LSMTree * * @param int $tier tier to get values from * @param string $key key to look up values for * @return array values associated with $key (unpacked) */ public function getTier($tier, $key) { $slot_folder = $this->getTierFolder($tier) . "/A"; if (!file_exists($slot_folder)) { return null; } $slot = new Tier($slot_folder, $this->table_tools); return $slot->get($key); } } /** * Splits a string containing one row of data for the LSMTree into the * key and a string for the values. * * @param string $entry string encoded row to be split * @param PackedTableTools $table_tools which has the format used to * encode the entry * @return array [$key, $values (as a string)] */ function entryToKeyValues($entry, $table_tools) { $key_len = $table_tools->key_len; $key = substr($entry, 0, $key_len); $values = substr($entry, $key_len); return [$key, $values]; } /** * Auxiliary Class used to manage a single Tier from the Logarithmic Merge Tree * structure */ class Tier { /** * how many data files should be in a block folder before making a * new block folder * @var int */ public $block_factor; /** * For data that has not been flushed to disk, the first key in sorted * order * @var string */ public $first_active_key; /** * File path to where data in this tier is to be stored * @var string */ public $folder; /** * @var int */ public $iterator_folder_index; /** * @var array */ public $iterator_folders; /** * @var int */ public $iterator_file_index; /** * @var array */ public $iterator_files; /** * @var int */ public $iterator_entry_index; /** * @var array */ public $iterator_entries; /** * @var int */ public $max_file_size; /** * Access mode for data in this tier: r - read, w - write * @var string */ public $mode; /** * @var PackedTableTools */ public $table_tools; /** * @var string */ private $records; /** * @var string */ private $active_filename; /** * @var array */ private static $cache = []; /** * */ public function __construct($folder, $table_tools, $mode = "r", $max_file_size = LSMTree::MAX_FILE_SIZE, $block_factor = LSMTree::BLOCK_FACTOR) { $this->folder = $folder; $this->table_tools = $table_tools; $this->mode = $mode; $this->max_file_size = $max_file_size; $this->block_factor = $block_factor; $folder_exists = file_exists($folder); if (!$folder_exists) { if ($mode == "w") { makePath($folder); } else { return null; } } if ($mode == "w") { $this->records = ""; } $this->reset(); } /** * */ public function flush() { if ($this->mode != "w" || empty($this->first_active_key)) { return false; } if (empty($this->records)) { return true; } $old_cwd = getcwd(); chdir($this->folder); $tier_folders = glob(LSMTree::FOLDER_PREFIX . "*"); $tiers_changed = false; if (empty($tier_folders)) { $active_folder = LSMTree::FOLDER_PREFIX . rawurlencode($this->first_active_key); mkdir($active_folder); chmod($active_folder, 0777); $tier_folders = [$active_folder]; $tiers_changed = true; $data_files = []; } else { $last_folder = $tier_folders[count($tier_folders) - 1]; chdir($last_folder); $data_files = glob(LSMTree::DATA_FILE_PREFIX . "*"); if (empty($data_files)) { $data_files = []; } chdir($this->folder); $num_data_files = count($data_files); if ($num_data_files >= $this->block_factor) { $active_folder = LSMTree::FOLDER_PREFIX . rawurlencode($this->first_active_key); mkdir($active_folder); $data_files = []; $tier_folders[] = $active_folder; $tiers_changed = true; } else { $active_folder = $last_folder; } } chdir($old_cwd); $active_path = $this->folder . "/" . $active_folder; $data_file = LSMTree::DATA_FILE_PREFIX . rawurlencode( $this->first_active_key); $data_files[] = $data_file; file_put_contents("$active_path/$data_file", $this->records); $this->writeRecords("$active_path/" . LSMTree::INDEX_FILE, $data_files); if ($tiers_changed) { $this->writeRecords($this->folder . "/" . LSMTree::INDEX_FILE, $tier_folders); } $this->records = ""; return true; } /** * */ public function get($key) { $tier_folders = $this->readRecords( $this->folder . "/" . LSMTree::INDEX_FILE); $url_encoded_key = rawurlencode($key); $key_folder = $this->binarySearch(LSMTree::FOLDER_PREFIX . $url_encoded_key, $tier_folders); if (!$key_folder) { return false; } $key_path = $this->folder . "/$key_folder"; $data_files = $this->readRecords("$key_path/" . LSMTree::INDEX_FILE); $data_file = $this->binarySearch(LSMTree::DATA_FILE_PREFIX . $url_encoded_key, $data_files); if (!$data_file) { return false; } $table_tools = $this->table_tools; $key_len = $table_tools->key_len; $record_string = file_get_contents("$key_path/$data_file"); $start = strpos($record_string, "\xFF$key"); if ($start) { $start++; } else { if (strncmp($record_string, $key, $key_len) != 0) { return false; } $start = 0; } $end = strpos($record_string, "\xFF", $start); $end = ($end > 0) ? $end : strlen($record_string); $length = $end - $start; $record = decode255(substr($record_string, $start, $end)); $values = substr($record, $key_len); return $table_tools->unpack($values); } /** * */ public function put($row, $is_packed = true) { if($this->mode != "w") { return false; } $table_tools = $this->table_tools; if (!$is_packed) { $key = $row[$table_tools->key_field]; $packed_row = $table_tools->pack($row); $entry = $key . $packed_row; } else { $entry = $row; } $encoded_entry = encode255($entry); if (strlen($this->records) + strlen($encoded_entry) + 1 > $this->max_file_size) { $this->flush(); } if (empty($this->records)) { list($this->first_active_key,) = entryToKeyValues($entry, $table_tools); } $separator = (strlen($this->records) > 0) ? "\xFF" : ""; $this->records .= $separator . $encoded_entry; return true; } /** * */ public function binarySearch($needle, $haystack) { $low = 0; $high = count($haystack) - 1; if ($high < 0 || strcmp($needle, $haystack[$low]) < 0) { return false; } if (strcmp($needle, $haystack[$high]) >= 0) { return $haystack[$high]; } while ($high - $low > 1) { $mid = ($high + $low) >> 1; $cmp = strcmp($needle, $haystack[$mid]); if ($cmp == 0) { return $haystack[$mid]; } else if ($cmp < 0) { $high = $mid; } else { $low = $mid; } } return $haystack[$low]; } /** * Returns the first entry as a packed string in the LSMTree tier. Also * resets iterator of this object. * * @return string|bool first entry if exists, else false */ public function firstEntry() { $folder = $this->folder; $this->iterator_folders = $this->readRecords("$folder/" . LSMTree::INDEX_FILE); if (empty($this->iterator_folders) || !is_array($this->iterator_folders)) { return false; } $this->iterator_folder_index = 0; $iterator_folder = $this->iterator_folders[0]; $file_path = "$folder/$iterator_folder"; $this->iterator_files = $this->readRecords( "$file_path/" . LSMTree::INDEX_FILE); if (empty($this->iterator_files) || !is_array($this->iterator_files)) { return false; } $this->iterator_file_index = 0; $iterator_file = $this->iterator_files[0]; $this->iterator_entries = $this->readRecords( "$file_path/{$iterator_file}", "\xFF"); if (empty($this->iterator_entries) || !is_array($this->iterator_entries)) { return false; } $this->iterator_entry_index = 0; return decode255($this->iterator_entries[0]); } /** * Returns the next tier entry as a packed string iterated over by this * Tier object. * @return string|bool next entry if exists, else false */ public function next() { if (empty($this->iterator_folders)) { return $this->firstEntry(); } $this->iterator_entry_index++; $entry = $this->iterator_entries[$this->iterator_entry_index] ?? false; if ($entry) { return decode255($entry); } $folder = $this->folder; $this->iterator_entry_index = 0; $this->iterator_file_index++; $file = $this->iterator_files[$this->iterator_file_index] ?? false; if ($file) { $iterator_folder = $this->iterator_folders[ $this->iterator_folder_index]; $file_path = "$folder/$iterator_folder/$file"; $this->iterator_entries = $this->readRecords($file_path, "\xFF"); return decode255($this->iterator_entries[0]) ?? false; } $this->iterator_file_index = 0; $this->iterator_folder_index++; $iterator_folder = $this->iterator_folders[ $this->iterator_folder_index] ?? false; if (!$iterator_folder) { return false; } $folder_path = "$folder/$iterator_folder"; $this->iterator_files = $this->readRecords("$folder_path/" . LSMTree::INDEX_FILE); if (empty($this->iterator_files) || !is_array($this->iterator_files)) { return false; } $file = $this->iterator_files[0]; $file_path = "$folder_path/$file"; $this->iterator_entries = $this->readRecords($file_path, "\xFF"); return decode255($this->iterator_entries[0]) ?? false; } /** * Resets to the first entry of the tier, the iterator associated with * the current Tier object. */ public function reset() { $this->iterator_folder_index = 0; $this->iterator_folders = []; $this->iterator_file_index = 0; $this->iterator_files = []; $this->iterator_entry_index = 0; $this->iterator_entries = []; } /** * Write a sequence of string records $lines into the file $filename, * separating records with delimiter $delimiter. Deletes file from LRU * cache of read files * * @param string $filename name of file to write records to * @param array $lines records to write to $filename * @param string $delimiter string used to separate one records from the * next */ function writeRecords($filename, $lines, $delimiter = "\n") { file_put_contents($filename, implode($delimiter, $lines)); $name_hash = crawlHash($filename); unset(self::$cache[$name_hash]); } /** * Returns the contents of a file managed by this LSMTree * as a sequence of string records. Contents come from either * a cache or from the filesystem. Has logic for LRU cache * @param string $filename name of file to get records for * @param string $delimiter delimeter used to separate individual * records * @return array of string records */ function readRecords($filename, $delimiter = "\n") { $name_hash = crawlHash($filename); if (isset(self::$cache[$name_hash])) { $tmp = self::$cache[$name_hash]; unset(self::$cache[$name_hash]); self::$cache[$name_hash] = $tmp; //move to end of array return $tmp; } $record = explode($delimiter, file_get_contents($filename)); self::$cache[$name_hash] = $record; if (count(self::$cache) >= LSMTree::RECORD_CACHE_SIZE || memory_get_usage() > metricToInt(C\INDEX_FILE_MEMORY_LIMIT) * C\MEMORY_FILL_FACTOR) { array_shift(self::$cache); } return $record; } }