<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2022 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2022
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Loads crawlLog functions if needed
 */
require_once __DIR__ . "/Utility.php";
/**
 * A collection of methods to encode and decode records according to
 * a signature.
 *
 * @author Chris Pollett
 */
class PackedTableTools
{
    /**
     * If not specified when constructing the instance, then this
     * will be the seekquarry\yioop\library\compressor\Compressor
     * used to compress rows.
     */
    const DEFAULT_COMPRESSOR = C\NS_COMPRESSORS . "NonCompressor";
    /**
     * If not specified by the format then this will be the assumed fixed
     * length of the primary key
     */
    const DEFAULT_KEY_LEN = 16;
    /**
     * Array of synonyms for the different possible column name types.
     * Keys are the accepted (upper case) spellings, values are the canonical
     * type names stored in $format.
     */
    const TYPE_SYNONYMS = [
        "BOOLEAN" => "BOOL", "BOOL" => "BOOL", "CLOB" => "TEXT",
        "DOUBLE" => "DOUBLE", "FLOAT" => "REAL", "INT" => "INT",
        "INTEGER" => "INT", "REAL"=> "REAL", "TEXT" => "TEXT"
    ];
    /**
     * Constant used to indicate that add() should be adding row to an
     * in memory table
     */
    const ADD_MEM_TABLE = 0;
    /**
     * Constant used to indicate that add() should be adding row using
     * a file handle
     */
    const ADD_FILE_HANDLE = 1;
    /**
     * Constant used to indicate that add() should append the row to
     * a specified file (not having a file handle to it yet)
     */
    const ADD_FILE_PATH = 2;
    /**
     * Constant used to indicate that add() should be adding row to an
     * in memory table where the in memory table is kept as an encoded
     * string rather than as an array of key => data pairs.
     */
    const ADD_MEM_TABLE_STRING = 3;
    /**
     * Constant used to indicate that add() should replace the value if
     * there is already a row in the table with the same primary key
     */
    const REPLACE_MODE = 0;
    /**
     * Constant used to indicate that add() should cluster the value if
     * there is already a row in the table with the same primary key
     */
    const APPEND_MODE = 1;
    /**
     * Constant used to indicate that loading shouldn't try to decode the
     * serialized disk format
     */
    const AS_STRING_MODE = 2;
    /**
     * A string compression algorithm used to compress rows represented as
     * strings
     * @var seekquarry\yioop\library\compressor\Compressor
     */
    public $compressor;
    /**
     * This is the signature of the records this PackedTableTools will
     * manipulate. After construction it holds only the
     * column_name => canonical column_type pairs (the "PRIMARY KEY" entry
     * of the constructor argument is split off into $key_field and
     * $key_len). Canonical column types are BOOL, INT, TEXT, DOUBLE, REAL
     * as given by the values of PackedTableTools::TYPE_SYNONYMS.
     * @var array
     */
    public $format;
    /**
     * Name of the column used for the primary key.
     * @var string
     */
    public $key_field;
    /**
     * Fixed number of bytes used to store the primary key. A value <= 0
     * means keys are variable length and are stored prefixed by a one byte
     * length.
     * @var int
     */
    public $key_len;
    /**
     * Number of columns in a record that are of type BOOL
     * @var int
     */
    public $num_bool_columns;
    /**
     * Number of columns in a record that are of type INT
     * @var int
     */
    public $num_int_columns;
    /**
     * Number of columns in a record that are of type TEXT
     * @var int
     */
    public $num_text_columns;
    /**
     * Cache of still-encoded table strings read by load() when it is asked
     * to cache, indexed by crawlHash of the table path
     * @var array
     */
    public $table_cache;
    /**
     * Cache of the byte positions of the \xFF record separators found by
     * getEntryMarkers(), indexed by crawlHash of the table path
     * @var array
     */
    public $table_entry_markers;
    /**
     * Used to create an instance of a PackedTableTools according to the
     * $format for record columns and $compressor_type to be used for row
     * compression.
     *
     * If $format contains a column type that is not a key of
     * PackedTableTools::TYPE_SYNONYMS, processing of $format stops at that
     * column and the instance is left with only the columns seen so far.
     * If $format has no "PRIMARY KEY" entry, $key_field is left null.
     *
     * @param array $format associative array of items in one of the forms:
     *  "PRIMARY KEY" => column_name, "PRIMARY KEY" =>
     *  [column_name, length_of_primary_key], column_name => column_type
     *  pairs. column_type's are from among BOOL, INT, TEXT, DOUBLE, REAL or
     *  their synonyms as given in PackedTableTools::TYPE_SYNONYMS. There
     *  should be only one primary key pair and if its value is not an
     *  array the key length is assumed to be
     *  PackedTableTools::DEFAULT_KEY_LEN.
     * @param string $compressor_type fully qualified class name of the
     *  seekquarry\yioop\library\compressor\Compressor used to compress
     *  records
     */
    public function __construct($format,
        $compressor_type = self::DEFAULT_COMPRESSOR)
    {
        $this->compressor = new $compressor_type();
        $this->table_cache = [];
        $this->table_entry_markers = [];
        $this->format = [];
        $this->key_field = null;
        $this->key_len = self::DEFAULT_KEY_LEN;
        $this->num_bool_columns = 0;
        $this->num_int_columns = 0;
        $this->num_text_columns = 0;
        $type_synonyms = self::TYPE_SYNONYMS;
        foreach ($format as $field_name => $type) {
            if (strtoupper($field_name) == "PRIMARY KEY") {
                if (is_array($type)) {
                    $this->key_field = $type[0];
                    $this->key_len = $type[1];
                } else {
                    $this->key_field = $type;
                }
                continue;
            }
            $type = strtoupper($type);
            if (empty($type_synonyms[$type])) {
                // unknown column type: give up on the rest of the signature
                return;
            }
            $canonical_type = $type_synonyms[$type];
            $this->format[$field_name] = $canonical_type;
            /* Count using the canonical type so that synonyms such as
               BOOLEAN, INTEGER and CLOB are counted too; pack() and
               unpack() switch on the canonical names and size the row
               header from these counts.
             */
            if ($canonical_type == "BOOL") {
                $this->num_bool_columns++;
            } else if ($canonical_type == "INT") {
                $this->num_int_columns++;
            } else if ($canonical_type == "TEXT") {
                $this->num_text_columns++;
            }
        }
    }
    /**
     * Adds ($key, $table_row) as an entry into $table
     * using the adding and replace methods specified
     *
     * @param mixed &$table either an associative array of key => value
     *  pairs where the keys are strings of length matching this
     *  PackedTableTools signature and the values have been packed according
     *  to this signature, or a string holding the encoded form of such a
     *  table, or the file name of a file containing a serialized
     *  version of such a table, or a file handle positioned at the end of
     *  such a file
     * @param string $key a key string of length given by the signature of
     *  this PackedTableTools
     * @param string $table_row a record packed according to the signature
     *  of this PackedTableTools
     * @param int $add_method one of self::ADD_MEM_TABLE,
     *  self::ADD_FILE_HANDLE, self::ADD_FILE_PATH,
     *  self::ADD_MEM_TABLE_STRING indicating which of the possibilities
     *  for $table we have
     * @param int $mode either self::APPEND_MODE or self::REPLACE_MODE
     *  If the former the table_row data will be appended to any data
     *  currently associated with the key, if the latter it will replace
     *  such data. Only consulted for self::ADD_MEM_TABLE; the other add
     *  methods always append a new encoded record.
     * @return bool whether or not adding was successful
     */
    public function add(&$table, $key, $table_row,
        $add_method = self::ADD_MEM_TABLE, $mode = self::REPLACE_MODE)
    {
        // variable length keys are stored prefixed by a one byte length
        $encode_key = ($this->key_len > 0) ? $key : chr(strlen($key)) . $key;
        switch ($add_method) {
            case self::ADD_FILE_PATH:
                /* filesize() results are cached per path, so clear the
                   cache to see the size after any earlier append.
                 */
                clearstatcache(true, $table);
                $separator = (is_file($table) && filesize($table) > 0) ?
                    "\xFF" : "";
                $out = $separator . encode255($encode_key . $table_row);
                return (file_put_contents($table, $out, FILE_APPEND) > 0);
            case self::ADD_FILE_HANDLE:
                /* NOTE(review): the PHP manual says ftell() is undefined
                   for append-only ("a") streams -- confirm how callers
                   open the handle.
                 */
                $separator = (ftell($table) > 0) ? "\xFF" : "";
                $out = $separator . encode255($encode_key . $table_row);
                return (fwrite($table, $out) > 0);
            case self::ADD_MEM_TABLE_STRING:
                $separator = (strlen($table) > 0) ? "\xFF" : "";
                $out = $separator . encode255($encode_key . $table_row);
                $table .= $out;
                return (strlen($out) > 0);
            case self::ADD_MEM_TABLE:
            default:
                if ($mode == self::REPLACE_MODE || empty($table[$key])) {
                    $table[$key] = $table_row;
                } else if ($mode == self::APPEND_MODE) {
                    $table[$key] = $this->mergeRowValues($table[$key],
                        $table_row);
                }
        }
        return true;
    }
    /**
     * Merges two rows of items packed according to this packed table tools
     * into a single row of items.
     *
     * @param string $row_values1 a row (less key) packed according
     *  to this packed table tools
     * @param string $row_values2 a second row (less key) packed
     *  according to this packed table tools
     * @return string a merged row consisting of the items in the first row
     *  followed by those in the second.
     */
    public function mergeRowValues($row_values1, $row_values2)
    {
        // each row starts with a vByte item count; strip and re-sum them
        $row_values1_pos = 0;
        $row_values1_count = vByteDecode($row_values1, $row_values1_pos);
        $row_values2_pos = 0;
        $row_values2_count = vByteDecode($row_values2, $row_values2_pos);
        return vByteEncode($row_values1_count + $row_values2_count) .
            substr($row_values1, $row_values1_pos) .
            substr($row_values2, $row_values2_pos);
    }
    /**
     * Removes $key and any records associated with it from $table
     *
     * @param array &$table array of key => records pairs where the
     *  key is a string of length given by this PackedTableTool's signature
     *  and the records are packed according this PackedTableTool's
     *  signature
     * @param string $key to remove records for
     * @return bool true if a non-empty entry for $key was present and has
     *  been removed, false otherwise
     */
    public function delete(&$table, $key)
    {
        if (!empty($table[$key])) {
            unset($table[$key]);
            return true;
        }
        return false;
    }
    /**
     * Return any records in $table associated with $key
     *
     * @param array $table array of key => records pairs where the
     *  key is a string of length given by this PackedTableTool's signature
     *  and the records are packed according this PackedTableTool's
     *  signature
     * @param string $key to return records for
     * @return ?string records that have been packed according to this
     *  PackedTableTool's signature, or null if $key is not in $table
     */
    public function find($table, $key)
    {
        return $table[$key] ?? null;
    }
    /**
     * Looks up the data associated with a key in a string formatted as
     * series of rows of the given PackedTableTool type.
     *
     * The key is converted to its stored form (one byte length prefix for
     * variable length keys, then encode255) before it is compared against
     * the encoded table, matching how add() and save() write records.
     *
     * @param string $table_string table as a string of this
     *  PackedTableTool type
     * @param string $key key field of this PackedTableTool type
     * @param int $offset byte offset to start search from. This parameter
     *  is currently not used by the search; it is kept so existing callers
     *  continue to work.
     * @return ?string just the value portion of the key value entry
     *  associated with the given key if it exists null otherwise
     */
    public function findRowFromKeyTableString($table_string, $key,
        $offset = 0)
    {
        $fixed_length_key = ($this->key_len > 0);
        $encode_key = ($fixed_length_key) ? $key : chr(strlen($key)) . $key;
        $encoded_key_prefix = encode255($encode_key);
        if (strncmp($table_string, $encoded_key_prefix,
            strlen($encoded_key_prefix)) == 0) {
            // key belongs to the first record, which has no separator
            $start_row = 0;
        } else {
            $start_row = strpos($table_string, "\xFF" . $encoded_key_prefix);
            if ($start_row === false) {
                return null;
            }
            $start_row++; // skip over the separator itself
        }
        $next_ff_pos = strpos($table_string, "\xFF", $start_row);
        if ($next_ff_pos === false) {
            $next_ff_pos = strlen($table_string);
        }
        $entry = decode255(substr($table_string, $start_row,
            $next_ff_pos - $start_row));
        $values_start = ($fixed_length_key) ? $this->key_len :
            1 + ord($entry[0]);
        return substr($entry, $values_start);
    }
    /**
     * Determines the number of items stored in the packed table stored as a
     * file. The count is one more than the number of \xFF separators, so a
     * missing or empty table is reported as having one entry.
     *
     * @param string $table_name name of file with packed data
     * @return int number of entries in table
     */
    public function countTableEntries($table_name)
    {
        $markers = $this->getEntryMarkers($table_name);
        return count($markers) + 1;
    }
    /**
     * Determines the end of record marker positions for a packed table
     * stored as a file. Non-empty results are remembered in
     * $table_entry_markers so later calls for the same file name do not
     * rescan the table.
     *
     * @param string $table_name name of file with packed data
     * @return array of integer record marker locations
     */
    public function getEntryMarkers($table_name)
    {
        $hash_name = crawlHash($table_name);
        if (!empty($this->table_entry_markers[$hash_name])) {
            return $this->table_entry_markers[$hash_name];
        }
        $table_string = (string)$this->load($table_name,
            self::AS_STRING_MODE, true);
        $markers = [];
        $marker_pos = strpos($table_string, "\xFF");
        while ($marker_pos !== false) {
            $markers[] = $marker_pos;
            $marker_pos = strpos($table_string, "\xFF", $marker_pos + 1);
        }
        $this->table_entry_markers[$hash_name] = $markers;
        return $markers;
    }
    /**
     * Returns the $index'th entry out of a file packed according to the
     * current PackedTableTool.
     *
     * @param string $table_name name of file with packed data
     * @param int $index the (0-based) entry to find
     * @return ?string the $index'th key value entry (decoded, with key
     *  still attached) if it exists null otherwise
     */
    public function findEntryAtIndexTableName($table_name, $index)
    {
        $table_string = $this->load($table_name, self::AS_STRING_MODE, true);
        $entry_markers = $this->getEntryMarkers($table_name);
        // n separators delimit n + 1 entries, numbered 0 .. n
        if ($index < 0 || $index > count($entry_markers)) {
            return null;
        }
        $start_pos = ($index == 0) ? 0 : $entry_markers[$index - 1] + 1;
        $end_pos = $entry_markers[$index] ?? strlen($table_string);
        return decode255(substr($table_string, $start_pos,
            $end_pos - $start_pos));
    }
    /**
     * Reads in the file $table_path and unpacks it according to this
     * PackedTableTool's signature. Repeated values for a key are handled
     * according to $mode.
     *
     * @param string $table_path path to file containing PackedTableTool's
     *  records
     * @param int $mode one of self::REPLACE_MODE (replace previous value),
     *  self::APPEND_MODE (merge all values with same key into one row),
     *  self::AS_STRING_MODE read in string of file but don't decode
     * @param bool $cache_table whether to cache the file string of the
     *  loaded table
     * @return mixed in self::AS_STRING_MODE the still-encoded table string
     *  ("" if the file does not exist); otherwise an array of
     *  key => packed_records pairs, or null if the file does not exist
     */
    public function load($table_path, $mode = self::REPLACE_MODE,
        $cache_table = false)
    {
        $hash_name = ($cache_table) ? crawlHash($table_path) : null;
        if ($cache_table && !empty($this->table_cache[$hash_name])) {
            $table = $this->table_cache[$hash_name];
        } else {
            if (!file_exists($table_path)) {
                return ($mode == self::AS_STRING_MODE) ? "" : null;
            }
            $table = $this->compressor->uncompressGetFile($table_path);
            if ($cache_table) {
                $this->table_cache[$hash_name] = $table;
            }
        }
        if ($mode == self::AS_STRING_MODE) {
            return $table;
        }
        $rows = [];
        if (!is_string($table) || $table === "") {
            // nothing stored, so there are no rows to split out
            return $rows;
        }
        $fixed_length_key = ($this->key_len > 0);
        $replace_mode = ($mode == self::REPLACE_MODE);
        $key_offset = ($fixed_length_key) ? 0 : 1;
        $table_len = strlen($table);
        $cur_start_row = 0;
        $more_rows = true;
        while ($more_rows) {
            $next_ff_pos = strpos($table, "\xFF", $cur_start_row);
            if ($next_ff_pos === false) {
                $next_ff_pos = $table_len;
                $more_rows = false;
            }
            $entry = decode255(substr($table, $cur_start_row,
                $next_ff_pos - $cur_start_row));
            $cur_start_row = $next_ff_pos + 1;
            if (!is_string($entry) || $entry === "") {
                // empty record (e.g., doubled separator) carries no key
                continue;
            }
            $key_len = ($fixed_length_key) ? $this->key_len :
                ord($entry[0]);
            $key = substr($entry, $key_offset, $key_len);
            $new_values = substr($entry, $key_len + $key_offset);
            $rows[$key] = ($replace_mode || empty($rows[$key])) ?
                $new_values :
                $this->mergeRowValues($rows[$key], $new_values);
        }
        return $rows;
    }
    /**
     * Packs as a string an array of records. Here each record should be an
     * associative array of field items, with field names and types
     * according to this packed table tools signature. The packed string
     * begins with a vByte encoded count of the records. Each record is
     * then stored as: output of bool columns as bit string in
     * order of columns as appear in signature, bit data on sizes to use
     * for each int column (for each column two bit code 00 - 1byte,
     * 01 - 2byte, 10 - 4byte, 11 - 8byte), text column lengths
     * (1byte/column saying how long the data stored in that column is).
     * This is followed by the actual column data (except bool columns) in
     * the order it is listed in the signature. Int's use their high order
     * bit as a sign bit and are stored big-endian using the number of bytes
     * given by their code in the int column bit data. DOUBLE columns are
     * stored as 8 byte big-endian doubles, REAL columns as 4 byte
     * big-endian floats.
     *
     * @param array $items array of records to pack. A single record (an
     *  array with no index 0) is treated as a list containing that record.
     * @return ?string records packed into the string format used by
     *  PackedTableTools, or null if a record is missing (or has a null
     *  value for) a column of the signature or has a TEXT value longer
     *  than 255 bytes
     */
    public function pack($items)
    {
        $format = $this->format;
        if (!isset($items[0])) {
            $items = [$items];
        }
        $packed_items = vByteEncode(count($items));
        foreach ($items as $item) {
            $bool_column_data = "";
            $cur_bool_int = -1; // -1 means no partially filled bool byte
            $cur_bool_shift = 7;
            $int_column_types = "";
            $text_column_lengths = "";
            $shift = 6;
            $cur_int_char = 0; // -1 means no partially filled size byte
            $int_occurred = false;
            $packed_data = "";
            foreach ($format as $field_name => $type) {
                if (!isset($item[$field_name])) {
                    return null;
                }
                switch ($type) {
                    case "BOOL":
                        $cur_bool_int = max($cur_bool_int, 0);
                        $bool_value = ($item[$field_name]) ? 1 : 0;
                        $cur_bool_int += ($bool_value << $cur_bool_shift);
                        $cur_bool_shift--;
                        if ($cur_bool_shift < 0) {
                            $bool_column_data .= chr($cur_bool_int);
                            $cur_bool_shift = 7;
                            $cur_bool_int = -1;
                        }
                        break;
                    case "DOUBLE":
                        $packed_data .= pack("E", $item[$field_name]);
                        break;
                    case "INT":
                        $int_occurred = true;
                        $magnitude = abs($item[$field_name]);
                        // 128 (the high bit of a byte) iff value negative
                        $sign_bit = ($item[$field_name] == $magnitude) ?
                            0 : 128;
                        if ($magnitude < 128) {
                            // size code 00: nothing to add to size byte
                            $packed_data .= chr($magnitude + $sign_bit);
                            $cur_int_char = max($cur_int_char, 0);
                        } else {
                            if ($magnitude < 32768) {
                                $packed_int = pack("n", $magnitude);
                                $cur_int_add = (1 << $shift);
                            } else if ($magnitude < 2147483647) {
                                $packed_int = pack("N", $magnitude);
                                $cur_int_add = (2 << $shift);
                            } else {
                                $packed_int = pack("J", $magnitude);
                                $cur_int_add = (3 << $shift);
                            }
                            if ($sign_bit) {
                                /* record the sign in the high bit of the
                                   most significant byte, which the size
                                   thresholds above keep clear
                                 */
                                $packed_int[0] = chr(ord($packed_int[0]) |
                                    $sign_bit);
                            }
                            $packed_data .= $packed_int;
                            $cur_int_char = max($cur_int_char, 0) +
                                $cur_int_add;
                        }
                        $shift -= 2;
                        if ($shift < 0) {
                            $int_column_types .= chr($cur_int_char);
                            $cur_int_char = -1;
                            $shift = 6;
                        }
                        break;
                    case "REAL":
                        $packed_data .= pack("G", $item[$field_name]);
                        break;
                    case "TEXT":
                        $len = strlen($item[$field_name]);
                        if ($len > 255) {
                            // length must fit in the one byte length slot
                            return null;
                        }
                        $packed_data .= $item[$field_name];
                        $text_column_lengths .= chr($len);
                        break;
                }
            }
            if ($cur_bool_int != -1) {
                $bool_column_data .= chr($cur_bool_int);
            }
            if ($cur_int_char != -1 && $int_occurred) {
                $int_column_types .= chr($cur_int_char);
            }
            $packed_items .= $bool_column_data . $int_column_types .
                $text_column_lengths . $packed_data;
        }
        return $packed_items;
    }
    /**
     * Saves an associative array of key => packed_records to a file
     * $table_path on disk. Here the packed_records should be packed
     * according to this PackedTableTools signature. Each key value pair is
     * concatenated; if variable length keys are used by this packed table
     * tool then the length of the key is stored as a char before the key.
     * Records are then encoded by replacing \xFE => \xFE\xFD and
     * \xFF => \xFE\xFE. The records are then concatenated separated by
     * \xFF, and the result compressed using the PackedTableTool's
     * compressor.
     *
     * @param string $table_path filename to save to
     * @param mixed $table either a string containing an already encoded
     *  table (written as is) or an array of key => packed_records
     * @return bool success or not
     */
    public function save($table_path, $table)
    {
        if (is_string($table)) {
            $out = $table;
        } else {
            $variable_length_key = ($this->key_len <= 0);
            $encoded_rows = [];
            foreach ($table as $key => $row) {
                $key_prefix = ($variable_length_key) ?
                    chr(strlen($key)) : "";
                $encoded_rows[] = encode255($key_prefix . $key . $row);
            }
            $out = implode("\xFF", $encoded_rows);
        }
        return ($this->compressor->compressPutFile($table_path, $out) > 0);
    }
    /**
     * Given a table_row, which might represent several items grouped
     * because they share a key, returns the total number of items stored
     * in the row
     *
     * @param string $table_row sequence of items packed according to this
     *  PackedTableTools signature
     * @return int number of items stored in the table row
     */
    public function count($table_row)
    {
        $current_pos = 0;
        return vByteDecode($table_row, $current_pos);
    }
    /**
     * Given a table_row, which might represent several items grouped
     * because they share a key, unpacks and returns the $offset through
     * $limit numbered items
     *
     * @param string $table_row sequence of items packed according to this
     *  PackedTableTools signature
     * @param int $offset index of item to begin with
     * @param int $limit maximum number of items to return starting at
     *  offset; a value <= 0 means return all remaining items
     * @return ?array unpacked items, each an associative array of
     *  column_name => value; [] if $table_row is empty or $offset is past
     *  the last item; null if the row is too short or otherwise does not
     *  match this PackedTableTools signature
     */
    public function unpack($table_row, $offset = 0, $limit = -1)
    {
        if (empty($table_row)) {
            return [];
        }
        $format = $this->format;
        $num_int_columns = $this->num_int_columns;
        $num_text_columns = $this->num_text_columns;
        $num_bool_columns = $this->num_bool_columns;
        $bool_info_len = intval(ceil($num_bool_columns / 8));
        $int_info_len = intval(ceil($num_int_columns / 4));
        // pack() format for int size codes 1, 2, 3 (code 0 is one byte)
        $unpack_code = [1 => "n", 2 => "N", 3 => "J"];
        $current_pos = 0;
        $num_items = vByteDecode($table_row, $current_pos);
        if ($offset >= $num_items) {
            return [];
        }
        $limit = ($limit <= 0) ? $num_items : $limit;
        $len_row = strlen($table_row);
        $num_items = min($limit + $offset, $num_items);
        $items = [];
        /* Items are variable length, so those before $offset still have to
           be decoded in order to find where later items start.
         */
        for ($i = 0; $i < $num_items; $i++) {
            $bool_info = substr($table_row, $current_pos, $bool_info_len);
            $current_pos += $bool_info_len;
            $int_info = substr($table_row, $current_pos, $int_info_len);
            $current_pos += $int_info_len;
            $text_info = substr($table_row, $current_pos, $num_text_columns);
            $current_pos += $num_text_columns;
            $item = [];
            $current_bool_pos = 0;
            $cur_bool_shift = 7;
            $current_int_pos = 0;
            $current_text_pos = 0;
            $shift = 6;
            $bools_used = 0;
            $ints_used = 0;
            foreach ($format as $field_name => $type) {
                switch ($type) {
                    case "BOOL":
                        if ($bools_used >= $num_bool_columns ||
                            !isset($bool_info[$current_bool_pos])) {
                            return null;
                        }
                        $bool_byte = ord($bool_info[$current_bool_pos]);
                        $item[$field_name] =
                            (($bool_byte >> $cur_bool_shift) & 1) > 0;
                        $bools_used++;
                        $cur_bool_shift--;
                        if ($cur_bool_shift < 0) {
                            $current_bool_pos++;
                            $cur_bool_shift = 7;
                        }
                        break;
                    case "DOUBLE":
                        $len = max(0, min(8, $len_row - $current_pos));
                        $item[$field_name] = ($len == 8) ?
                            unpack("E", $table_row, $current_pos)[1] : 0;
                        $current_pos += $len;
                        break;
                    case "INT":
                        /* Check the byte is present with isset rather than
                           empty(): the size-code byte 0x30 is the string
                           "0", which empty() treats as empty.
                         */
                        if ($ints_used >= $num_int_columns ||
                            !isset($int_info[$current_int_pos])) {
                            return null;
                        }
                        $int_code = (ord($int_info[$current_int_pos]) >>
                            $shift) & 3;
                        $len = 1 << $int_code;
                        if ($current_pos + $len > $len_row) {
                            return null;
                        }
                        $first_char = ord($table_row[$current_pos]);
                        $magnitude_first_char = $first_char & 127;
                        $sign = ($first_char > 127) ? -1 : 1;
                        if ($int_code == 0) {
                            $item[$field_name] =
                                $sign * $magnitude_first_char;
                        } else {
                            /* clear the sign bit in our copy of the row
                               before reading the big-endian magnitude
                             */
                            $table_row[$current_pos] =
                                chr($magnitude_first_char);
                            $item[$field_name] = $sign * (unpack(
                                $unpack_code[$int_code], $table_row,
                                $current_pos)[1]);
                        }
                        $current_pos += $len;
                        $ints_used++;
                        $shift -= 2;
                        if ($shift < 0) {
                            $current_int_pos++;
                            $shift = 6;
                        }
                        break;
                    case "REAL":
                        $len = max(0, min(4, $len_row - $current_pos));
                        $item[$field_name] = ($len == 4) ?
                            unpack("G", $table_row, $current_pos)[1] : 0;
                        $current_pos += $len;
                        break;
                    case "TEXT":
                        if ($current_text_pos >= $num_text_columns ||
                            !isset($text_info[$current_text_pos])) {
                            return null;
                        }
                        $text_len = ord($text_info[$current_text_pos]);
                        $item[$field_name] = substr($table_row,
                            $current_pos, $text_len);
                        $current_pos += $text_len;
                        $current_text_pos++;
                        break;
                }
            }
            if ($i >= $offset) {
                $items[] = $item;
            }
        }
        return $items;
    }
}