viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php /** * SeekQuarry/Yioop -- * Open Source Pure PHP Search Engine, Crawler, and Indexer * * Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org * * LICENSE: * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <https://www.gnu.org/licenses/>. * * END LICENSE * * A library of string, error reporting, log, hash, time, and conversion * functions * * @author Chris Pollett chris@pollett.org * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ * @copyright 2009 - 2024 * @filesource */ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; /** For Yioop global defines */ require_once __DIR__ . "/../configs/Config.php"; /** * Adds delimiters to a regex that may or may not have them * * @param string $expression a regex * @return string rgex with delimiters if not there */ function addRegexDelimiters($expression) { $first = $expression; $len = strlen($expression); $last = $expression[$len - 1]; if (($first != $last && $len > 1) || $len == 1) { $expression = ($first != '/' ) ? "/" . $expression . "/" : "@" . $expression . "@"; } return $expression; } /** * search for a pcre pattern in a subject from a given offset, * return position of first match if found -1 otherwise. * * @param string $pattern a Perl compatible regular expression * @param string $subject to search for pattern in * @param int $offset character offset into $subject to begin searching from * @param boolean $return_match whether to return as well what the * match was for the pattern * @return mixed if $return_match is false then the integer position of first * match, otherwise, it returns the ordered pair [$pos, $match]. */ function preg_search($pattern, $subject, $offset = 0, $return_match = false) { $pos = -1; if (preg_match($pattern, $subject ?? "", $matches, PREG_OFFSET_CAPTURE, $offset)) { $pos = $matches[0][1]; } if ($return_match) { $match = (empty($matches[0][0])) ? [] : $matches[0][0]; return [$pos, $match]; } return $pos; } /** * Replaces a pcre pattern with a replacement in $subject starting from * some offset. * * @param string $pattern a Perl compatible regular expression * @param string $replacement what to replace the pattern with * @param string $subject to search for pattern in * @param int $offset character offset into $subject to begin searching from * @return string result of the replacements */ function preg_offset_replace($pattern, $replacement, $subject, $offset = 0) { $subject ??= ""; $offset ??= 0; $start = substr($subject, 0, $offset); $end = substr($subject, $offset); return $start . preg_replace($pattern, $replacement, $end); } /** * Yioop replacement for parse_ini_file($name, true) in case * parse_ini_file is on the disable_functions list. Name has underscores * to match original function. This function * checks if parse_ini_file is disabled on not. If not, it just * calls parse_ini_file; otherwise, it simulates it enough so * that configure.ini files used for string translations can be read. * * @param string $file filename of ini data to parse into an array * @return array data parse from file */ function parse_ini_with_fallback($file) { static $disabled; if (!isset($disabled)) { $disabled_string = ini_get('disable_functions'); $disabled = strpos($disabled_string, "parse_ini_file") !== false; } if (!$disabled) { return parse_ini_file($file, true); } $lines = file($file); $name_space = null; $ini = []; $assigned = '"((?:[^"\\\]|\\\.)*)"|\w+|'."'". '((?:[^'."'".'\\\]|\\\.)*)'."'"; foreach ($lines as $line) { if (preg_match('/\[(\w+)\]/', $line, $matches)) { $name_space = $matches[1]; $ini[$name_space] = []; } else if (preg_match("/(\w+|(\w+)(\[\]))\s*\=\s*($assigned)/", $line, $matches)){ if ($name_space) { if ($matches[3] == '[]') { $ini[$name_space][$matches[2]][] = getIniAssignMatch($matches); } else { $ini[$name_space][$matches[1]] = getIniAssignMatch($matches); } } else { if ($matches[3] == '[]') { $ini[$name_space][$matches[2]][] = getIniAssignMatch($matches); } else { $ini[$matches[1]] = getIniAssignMatch($matches); } } } } return $ini; } /** * Auxiliary function called from parse_ini_with_fallback to extract from * the $matches array produced by the former function's preg_match * what kind of assignment occurred in the ini file being parsed. * * @param string $matches produced by a preg_match in * parse_ini_with_fallback * @return mixed value of ini file assignment */ function getIniAssignMatch($matches) { if (isset($matches[6])) { return $matches[6]; } else if (isset($matches[5])) { return $matches[5]; } else if (isset($matches[4])) { $tmp = $matches[4]; if ($tmp == "true") { return true; } else if ($tmp == "false") { return false; } return $tmp; } return false; } /** * Copies from $source string beginning at position $start, $length many * bytes to destination string * * @param string $source string to copy from * @param string &$destination string to copy to * @param int $start starting offset * @param int $length number of bytes to copy * @param string $timeout_msg message to print if taking more * than 30 seconds */ function charCopy($source, &$destination, $start, $length, $timeout_msg = "") { $endk = intval($length - 1); $end = intval($start + $endk); $start = intval($start); if (strlen($destination) <= $endk) { return; } if ($timeout_msg == "") { for ($j = $end, $k = $endk; $j >= $start; $j--, $k--) { $destination[$j] = $source[$k]; } } else { $time_out_check_frequency = 5000; for ($j = $end, $k = $endk, $t = 0; $j >= $start; $j--, $k--, $t++) { $destination[$j] = $source[$k]; if ($t > $time_out_check_frequency) { crawlTimeoutLog($timeout_msg); $t = 0; } } } } /** * Encodes an integer using variable byte coding. * * @param int $pos_int integer to encode * @return string a string of 1-5 chars depending on how bit $pos_int was */ function vByteEncode($pos_int) { $result = ($pos_int < 128) ? chr($pos_int & 127) : chr(128 | ($pos_int & 127)); $pos_int >>= 7; while($pos_int > 0) { $result .= ($pos_int < 128) ? chr($pos_int & 127) : chr(128 | ($pos_int & 127)); $pos_int >>= 7; } return $result; } /** * Decodes from a string using variable byte coding an integer. * * @param string $str string to use for decoding * @param int &$offset byte offset into string when var int stored * @return int the decoded integer */ function vByteDecode($str, &$offset) { $pos_int = ord($str[$offset]) & 127; $shift = 7; while ((ord($str[$offset++]) & 128) > 0) { $pos_int += ((ord($str[$offset]) & 127) << $shift); $shift += 7; } return $pos_int; } /** * Appends a number re-encoded in unary to the end of an input string starting * at a given bit offset into the string. Here n in unary has bit * representation n-1 0's followed by a 1. * * @param int $number number to append * @param string the string to append to * @param bool whether to return the resulting string or the number of bits * long the resulting string would be * @return mixed either the resulting string or its length */ function appendUnary($number, $input, &$start_bit_offset, $just_bit_offset = false) { $total_bits = $start_bit_offset + $number; if ($just_bit_offset) { // only compute new bit offset $start_bit_offset = $total_bits; return $input; } $bits_last_byte = $total_bits & 7; $total_chars = ($bits_last_byte == 0) ? ($total_bits >> 3) : (($total_bits >> 3) + 1); $bits_first_byte = $start_bit_offset & 7; $start_char = $start_bit_offset >> 3; $output = str_pad(substr($input, 0 , $start_char + 1), $total_chars, "\x00"); $start_char_ord = ord($output[$start_char] ?? 0); $start_char_ord &= ((1 << $bits_first_byte) - 1) << (8 - $bits_first_byte); $output[$start_char] = chr($start_char_ord); $last_ord = ord($output[$total_chars - 1]); $output[$total_chars - 1] = chr($last_ord + (($bits_last_byte == 0) ? 1 : (1 << (8 - $bits_last_byte)))); $start_bit_offset = $total_bits; return $output; } /** * Decodes a unary number from an input string at a given bit offset. Here n * in unary has bit representation n-1 0's followed by a 1. * * @param string $input the string that we want to decode a unary number from * @param int &$start_bit_offset the starting bit offset in $input to start * decoding from. After the call it will be the position after the decode * @return int the decoded unary number */ function decodeUnary($input, &$start_bit_offset) { $cur_char = $start_bit_offset >> 3; if (!isset($input[$cur_char])) { return false; } $bits_first_byte = $start_bit_offset & 7; $cur_ord = (ord($input[$cur_char]) << $bits_first_byte) & 255; if ($cur_ord > 0) { $decoded_number = 9 - ceil(log($cur_ord + 1, 2)); $start_bit_offset += $decoded_number; return $decoded_number; } $decoded_number = 8 - $bits_first_byte; do { $cur_char++; $cur_ord = ord($input[$cur_char] ?? "\0"); $decoded_number += ($cur_ord == 0) ? 8 : 9 - ceil(log($cur_ord + 1, 2)); } while ($cur_ord == 0 && isset($input[$cur_char])); $start_bit_offset += $decoded_number; return $decoded_number; } /** * Appends $num_bits bits from the start of the binary rep of $number * beginning at offset $start_bit_offset of $input string overwriting any bits * present. If $num_bits == -1, then appends all of $number. * * @param int $number to append * @param string $input the string to append to. * @param int &$start_bit_offset starting location to begin append from * @param $num_bits number of bits of $input to append. * @return string resulting string */ function appendBits($number, $input, &$start_bit_offset, $num_bits = -1) { $start_char = $start_bit_offset >> 3; $num_bits = ($num_bits == -1) ? ceil(log($number + 1, 2)) : $num_bits; $total_bits = $start_bit_offset + $num_bits; $bits_last_byte = $total_bits & 7; $total_chars = ($bits_last_byte == 0) ? ($total_bits >> 3) : (($total_bits >> 3) + 1); $number &= ((1 << $num_bits) - 1); // notice using low order here $output = str_pad(substr($input, 0 , $start_char + 1), $total_chars, "\x00"); $cur_char = $total_chars - 1; $cur_bits = $number & ((1 << $bits_last_byte) - 1); $number >>= $bits_last_byte; $start_remaining_bits = $num_bits - $bits_last_byte; $shift_last_byte = ($bits_last_byte == 0) ? 0 : 8 - $bits_last_byte; $output[$cur_char] = chr(ord($output[$cur_char]) + ($cur_bits << $shift_last_byte)); $cur_char--; for ($remaining_bits = $start_remaining_bits; $remaining_bits > 7 ; $remaining_bits -= 8, $cur_char--, $number >>= 8) { $output[$cur_char] = chr($number & 255); } if ($remaining_bits > 0) { $start_char_ord = ord($output[$start_char]); $start_char_ord &= (255 - (1 << ($remaining_bits - 1))); $output[$start_char] = chr($start_char_ord + $number); } $start_bit_offset = $total_bits; return $output; } /** * Decode $num_bits many bits from the $input string beginning at offset * $start_bit_offset. The result of this operation is up $start_bit_offset * by number of bits that were able to be decoded. * * @param string $input string to decode bits from * @param int &$start_bit_offset bit offset to start decoding from in $input * @param int $num_bits number of bits tot try to decode * @return int the number decoded */ function decodeBits($input, &$start_bit_offset, $num_bits) { $cur_char = $start_bit_offset >> 3; $total_bits = $start_bit_offset + $num_bits; $bits_first_byte = $start_bit_offset & 7; $input_char = $input[$cur_char] ?? ''; $cur_ord = ((ord($input_char) << $bits_first_byte) & 255) >> $bits_first_byte; $output = $cur_ord & ((1 << (8 - $bits_first_byte)) - 1); if ($num_bits <= 8 - $bits_first_byte) { $excess_bits = 8 - $bits_first_byte - $num_bits; $output >>= $excess_bits; $start_bit_offset = $total_bits; return $output; } $remaining_bits = $num_bits - (8 - $bits_first_byte); $cur_char++; while ($remaining_bits > 7 && isset($input[$cur_char])) { $output <<= 8; $output += ord($input[$cur_char]); $remaining_bits -= 8; $cur_char++; } if ($remaining_bits > 0 && isset($input[$cur_char])) { $last_char_ord = ord($input[$cur_char]); $output <<= $remaining_bits; $output += ($last_char_ord >> (8 - $remaining_bits)); } $start_bit_offset = $total_bits; return $output; } /** * Appends gamma code of $number beginning at offset $start_bit_offset of * $input string overwriting any bits present. $start_bit_offset is updated * to bit position after append. * * @param int $number to append * @param string $input the string to append to. * @param int &$start_bit_offset starting bit location to begin append from * @return string resulting string */ function appendGamma($number, $input, &$start_bit_offset) { $bit_len = ceil(log($number + 1, 2)); appendUnary($bit_len, $input, $start_bit_offset, true); $start_bit_offset--; /* $start_bit_offset will have advanced by appendUnary call */ return appendBits($number, $input, $start_bit_offset, $bit_len); } /** * Decodes up to $num_decode gamma encoded integers beginning at * $start_bit_offset. $start_bit_offset is updated to the bit position * after the decoded integers. * * @param string $input the string to decode from * @param int &$start_bit_offset starting bit location to decode from * @param int $num_decode number of int's to decode * @return array decoded int's */ function decodeGammaList($input, &$start_bit_offset, $num_decode) { $out_list = []; for ($i = 0; $i < $num_decode; $i++) { $num_bits = decodeUnary($input, $start_bit_offset); $start_bit_offset--; $out_list[] = decodeBits($input, $start_bit_offset, $num_bits); } return $out_list; } /** * Appends using a Rice coding a sequence of integers $int_sequence at * offset $start_bit_offset to the string $output, overwriting any bits present * at that location. $start_bit_offset is updated to bit position after append. * Encoding is done as a difference list. If $delta_start is set to * a value other than >= then the first gap is assumed to be from int * $delta_start * * @param array $int_sequence int's to append * @param int $modulus i in the 2^i modulus to use for Rice code * @param string $output the string to append to. * @param int &$start_bit_offset starting bit location to begin append from * @param int $delta_start if >= 0 previous int to use for difference list * otherwise the first integer is encoded as itself rather than a difference * @return string resulting string */ function appendRiceSequence($int_sequence, $modulus, $output, &$start_bit_offset, $delta_start = -1) { $last_encode = $delta_start; $output = appendUnary($modulus, $output, $start_bit_offset); $mask = (1 << $modulus) - 1; $i= 0; foreach ($int_sequence as $pre_to_encode) { $to_encode = ($delta_start < 0) ? $pre_to_encode : $pre_to_encode - $last_encode; $to_encode--; $last_encode = $pre_to_encode; $output = appendUnary(($to_encode >> $modulus) + 1, $output, $start_bit_offset); $output = appendBits($to_encode & $mask, $output, $start_bit_offset, $modulus); } return $output; } /** * Decodes up to $num_decode rice encoded difference list of integers beginning * at $start_bit_offset. $start_bit_offset is updated to the bit position * after the decoded integers. If $delta_start >= 0 then the first int is * assumed to be the difference from $delta_start; * * @param string $input the string to decode from * @param int &$start_bit_offset starting bit location to decode from * @param int $num_decode number of int's to decode * @param int $delta_start if >= 0 previous int to use for difference list * otherwise the first integer is decoded as itself rather than a difference * @return array decoded int's */ function decodeRiceSequence($input, &$start_bit_offset, $num_decode, $delta_start = -1) { $out_list = []; $last_decode = $delta_start; $modulus = decodeUnary($input, $start_bit_offset); for($i = 0; $i < $num_decode; $i++) { $quotient = decodeUnary($input, $start_bit_offset); $quotient--; $remainder = decodeBits($input, $start_bit_offset, $modulus); $decode = ($quotient << $modulus) + $remainder + 1; $last_decode = ($delta_start == -1) ? $decode : $last_decode + $decode; $out_list[] = $last_decode; } return $out_list; } /** * Encodes a list of integer positions of a term in a document. This * is done as a gamma code of the first integer followed by the Rice * coding of the remaining integers using a modulus based on the average * gap between integers. If the number of positions is 1 or 2 then a gamma * of each position only is used. * * @param array $positions integer term positions * @return string encoded position list */ function encodePositionList($positions) { $output = ""; $num_positions = count($positions); $start_bit_offset = 0; $start_position = $positions[0]; $last = 0; if ($num_positions < 3) { foreach ($positions as $position) { $output = appendGamma($position - $last, $output, $start_bit_offset); $last = $position; } return $output; } $average_gap = ($positions[$num_positions - 1] - $positions[0]) / $num_positions; $average_len = max(ceil(log($average_gap + 1, 2)), 2); $output = appendGamma($start_position, $output, $start_bit_offset); array_shift($positions); return appendRiceSequence($positions, $average_len, $output, $start_bit_offset, $start_position); } /** * Decodes up to $num_decode term in document position integers from * string $input under the assumption $input is encoded as per * @see encodePositionList . * * @param string $input string to decode from * @param int $num_decode number of integer to decode * @return array decoded positions */ function decodePositionList($input, $num_decode) { $start_bit_offset = 0; if ($num_decode < 3) { $out_list = decodeGammaList($input, $start_bit_offset, $num_decode); if (count($out_list) == 2) { $out_list[1] += $out_list[0]; } return $out_list; } $first = decodeGammaList($input, $start_bit_offset, 1)[0]; $out_list = decodeRiceSequence($input, $start_bit_offset, $num_decode - 1, $first); array_unshift($out_list, $first); return $out_list; } /** * Recodes a string in a 1-1 fashion to a string not involving * \xFF (255). I.e., it maps characters \xFE -> \xFE\FD and \xFF -> \xFE\FE * * @param string $str to be encoded * @return string encoded string without \xFF */ function encode255($str) { return strtr($str, ["\xFE" => "\xFE\xFD", "\xFF" => "\xFE\xFE"]); } /** * Decodes a string in a 1-1 fashion from a string not involving * \xFF (255). I.e., it maps characters \xFE\FE -> \xFF and \xFE\FD -> \xFF * * @param string $str to be frcoded * @return string decoded string */ function decode255($str) { return strtr($str, ["\xFE\xFE"=>"\xFF", "\xFE\xFD" => "\xFE"]); } /** * Recodes a string in a 1-1 fashion to a string not involving underscore * (_). I.e., it maps characters - -> -- and _ -> -= * * @param string $str to be encoded * @return string encoded string without _ */ function encodeUnderscore($str) { return strtr(($str ?? ""), ["-" => "--", "_" => "-="]); } /** * Decodes a string in a 1-1 fashion from a string not involving underscore * (_). I.e., it maps characters -= -> _ and -- -> - * * @param string $str to be frcoded * @return string decoded string */ function decodeUnderscore($str) { return strtr($str, ["-="=>"_", "--" => "-"]); } /** * Encodes a list of strings as their @see encode255 versions separated by * \xFF's * * @param array $strs strings to encode as a single string * @return string encoded list */ function packEncode255($strs) { return implode("\xFF", array_map($strs, encode255)); } /** * Decodes a list of strings from a string that encoded as their @see encode255 * of its elements separated by \xFF's * * @param string $encoded_strs string to decode into a list of strings * @return array decoded list */ function unpackDecode255($encoded_strs) { return array_map(explode("\xFF", $encoded_strs), decode255); } /** * Makes an packed integer string from a docindex and the number of * occurrences of a word in the document with that docindex. * * @param int $doc_index index (i.e., a count of which document it * is rather than a byte offset) of a document in the document string * @param array $position_list integer positions word occurred in that doc * @param bool $delta if true then stores the position_list as a sequence of * differences (a delta list) * @return string a modified9 (our compression scheme) packed * string containing this info. */ function packPosting($doc_index, $position_list, $delta = true) { if ($delta) { $delta_list = deltaList($position_list); } else { $delta_list = $position_list; } if ( $doc_index >= (2 << 14) && isset($delta_list[0]) && $delta_list[0] < (2 << 9) && $doc_index < (2 << 17)) { $delta_list[0] += (((2 << 17) + $doc_index) << 9); } else { // we add 1 to doc_index to make sure not 0 (modified9 needs > 0) array_unshift($delta_list, ($doc_index + 1)); } $encoded_list = encodeModified9($delta_list); return $encoded_list; } /** * Given a packed integer string, uses the top three bytes to calculate * a doc_index of a document in the shard, and uses the low order byte * to computer a number of occurrences of a word in that document. * * @param string $posting a string containing * a doc index position list pair coded encoded using modified9 * @param int &$offset a offset into the string where the modified9 posting * is encoded * @param bool $dedelta if true then assumes the list is a sequence of * differences (a delta list) and undoes the difference to get * the original sequence * @return array consisting of integer doc_index and a subarray consisting * of integer positions of word in doc. */ function unpackPosting($posting, &$offset, $dedelta = true) { $delta_list = (array) decodeModified9($posting, $offset); $doc_index = array_shift($delta_list); if (($doc_index & (2 << 26)) > 0) { $delta0 = $doc_index; $doc_index >>= 9; $doc_index -= (2 << 17); $delta0 -= (((2 << 17) + $doc_index) << 9); array_unshift($delta_list, $delta0); } else { $doc_index--; } if ($dedelta) { deDeltaList($delta_list); } return [$doc_index, $delta_list]; } /** * This method is used while appending one index shard to another. * Given a string of postings adds $add_offset add to each offset to the * document map in each posting. * * @param string &$postings a string of index shard postings * @param int $add_offset an fixed amount to add to each postings doc map offset * * @return string $new_postings where each doc offset has had $add_offset added * to it */ function addDocIndexPostings(&$postings, $add_offset) { $offset = 0; $new_postings = ""; $postings_len = strlen($postings); while($offset < $postings_len) { $post_string = nextPostString($postings, $offset); if ($post_string == "" || !($tmp = unpack("N*", $post_string))) { continue; } $posting_list = call_user_func_array("array_merge", array_map(C\NS_LIB . "unpackListModified9", $tmp)); if (!is_array($posting_list)) { continue; } $doc_index = array_shift($posting_list); if (($doc_index & (2 << 26)) > 0) { $post0 = ($doc_index & ((2 << 9) - 1)); array_unshift($posting_list, $post0); $doc_index -= $post0; $doc_index -= (2 << 26); $doc_index >>= 9; } else { $doc_index--; } $doc_index += $add_offset; if ($doc_index >= (2 << 14) && isset($posting_list[0]) && $posting_list[0] < (2 << 9) && $doc_index < (2 << 17)) { $posting_list[0] += (((2 << 17) + $doc_index) << 9); } else { // we add 1 to doc_index to make sure not 0 (modified9 needs > 0) array_unshift($posting_list, ($doc_index + 1)); } $new_postings .= encodeModified9($posting_list); } return $new_postings; } /** * Computes the difference of a list of integers. * i.e., (a1, a2, a3, a4) becomes (a1, a2-a1, a3-a2, a4-a3) * * @param array $list a nondecreasing list of integers * @return array the corresponding list of differences of adjacent * integers */ function deltaList($list) { $last = 0; $delta_list = []; foreach ($list as $elt) { $delta_list[] = $elt - $last; $last = $elt; } return $delta_list; } /** * Given an array of differences of integers reconstructs the * original list. This computes the inverse of the deltaList function * * @see deltaList * @param array $delta_list a list of nonegative integers * @return array a nondecreasing list of integers */ function deDeltaList(&$delta_list) { $last = 0; $num = count($delta_list); for ($i = 1; $i < $num; $i++) { $delta_list[$i] += $delta_list[$i - 1]; } } /** * Mini-class (so not own file) used to hold encode decode info related to * Mod9 encoding (as variant of Simplified-9 specify to Yioop). * Mod9 is used to incode a sequence of positive (greater than 0) integers * as a string. WARNING: do not expect is to work/decode correctly if * sequence has a 0 as the decoding process assumes 0 indicates end of sequence. * @see encodeModified9 for a complete description */ class Mod9Constants { /** * Used in Modified 9 encoding. The ith array entry represents the number of * i bit elements that can be stored in a word using modified 9 (0th index * location is a dummy value 0 as can't store 0 bit numbers) * @array */ public static $MOD9_PACK_POSSIBILITIES = [ 0, 24, 12, 7, 6, 5, 4, 3, 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]; /** * Used in Modified 9 encoding. Key values are the number of elements we * would like to store in the current word. Values are the bit prefix to use * on first byte of word. Notices bits 7 and 6 (128 and 64) are not parts of * prefixes as used for continuation bits. * @array */ public static $MOD9_NUM_ELTS_CODES = [ 24 => 63, 12 => 62, 7 => 60, 6 => 56, 5 => 52, 4 => 48, 3 => 32, 2 => 16, 1 => 0]; /** * Keys of this array are prefix codes from the high order byte of a word * encoded using Modified 9, values are the number of bits used to encode * an element if that prefix code was used. * @array */ public static $MOD9_NUM_BITS_CODES = [63 => 1, 62 => 2, 60 => 3, 56 => 4, 52 => 5, 48 => 6, 32 => 9, 16 => 14, 0 => 28]; /** * Keys of this array are prefix codes from the high order byte of a word * encoded using Modified 9, values are the number of elts stored in the * remaining bits of the word * @array */ public static $MOD9_NUM_ELTS_DECODES = [ 63 => 24, 62 => 12, 60=> 7, 56 => 6, 52 => 5, 48 => 4, 32 => 3, 16 => 2, 0 => 1]; } /** * Encodes a sequence of integers x, such that 1 <= x <= 2<<28-1 * as a string. NOTICE x>=1. * * The encoded string is a sequence of 4 byte words (packed int's). * The high order 2 bits of a given word indicate whether or not * to look at the next word. The codes are as follows: * 11 start of encoded string, 10 continue four more bytes, 01 end of * encoded, and 00 indicates whole sequence encoded in one word. * * After the high order 2 bits, the next most significant bits indicate * the format of the current word. There are nine possibilities: * 00 - 1 28 bit number, 01 - 2 14 bit numbers, 10 - 3 9 bit numbers, * 1100 - 4 6 bit numbers, 1101 - 5 5 bit numbers, 1110 6 4 bit numbers, * 11110 - 7 3 bit numbers, 111110 - 12 2 bit numbers, 111111 - 24 1 bit * numbers. * * @param array $list a list of positive integers satsfying above * @return string encoded string */ function encodeModified9($list) { $MOD9_PACK_POSSIBILITIES = Mod9Constants::$MOD9_PACK_POSSIBILITIES; $cnt = 0; $cur_size = 1; $cur_len = 1; $pack_list = []; $list_string = ""; $continue_bits = 3; foreach ($list as $elt) { $old_len = $cur_len; while($elt > $cur_size) { $cur_len++; $cur_size = (1 << $cur_len) - 1; } if ($cnt < $MOD9_PACK_POSSIBILITIES[$cur_len]) { $pack_list[] = $elt; $cnt++; } else { $list_string .= packListModified9($continue_bits, $MOD9_PACK_POSSIBILITIES[$old_len], $pack_list); $continue_bits = 2; $pack_list = [$elt]; $cur_size = 1; $cur_len = 1; $cnt = 1; while($elt > $cur_size) { $cur_size = (1 << $cur_len) - 1; $cur_len++; } } } $continue_bits = ($continue_bits == 3) ? 0 : 1; $list_string .= packListModified9($continue_bits, $MOD9_PACK_POSSIBILITIES[$cur_len], $pack_list); return $list_string; } /** * Packs the contents of a single word of a sequence being encoded * using Modified9. * * @param int $continue_bits the high order 2 bits of the word * @param int $cnt the number of element that will be packed in this word * @param array $pack_list a list of positive integers to pack into word * @return string encoded 4 byte string * @see encodeModified9 */ function packListModified9($continue_bits, $cnt, $pack_list) { $MOD9_NUM_ELTS_CODES = Mod9Constants::$MOD9_NUM_ELTS_CODES; $MOD9_NUM_BITS_CODES = Mod9Constants::$MOD9_NUM_BITS_CODES; $out_int = 0; $code = $MOD9_NUM_ELTS_CODES[$cnt]; $num_bits = $MOD9_NUM_BITS_CODES[$code]; foreach ($pack_list as $elt) { $out_int <<= $num_bits; $out_int += $elt; } $out_string = packInt($out_int); $out_string[0] = chr(($continue_bits << 6) + $code + ord($out_string[0])); return $out_string; } /** * Returns the next complete posting string from $input_string being at offset. * Does not do any decoding. * * @param string &$input_string a string of postings * @param int &$offset an offset to this string which will be updated after call * @return string undecoded posting */ function nextPostString(&$input_string, &$offset) { if (!isset($input_string[$offset + 3])) { $offset +=4; //make sure offset always increases to be safe return ""; } $flag_mask = 192; $continue_threshold = 128; $len = strlen($input_string); $end = $offset; $error = false; $flag_bits = (ord($input_string[$end]) & $flag_mask) ; if ($flag_bits && $flag_bits != $flag_mask) { crawlLog("!! Decode Error Flags: $flag_bits $flag_mask"); crawlLog("!! Dropping posting at $offset cycle to next posting."); crawlLog("!! Dropped posting length $len."); $offset += 4; return ""; } $end += 4; while($end < $len && $flag_bits >= $continue_threshold) { $flag_bits = (ord($input_string[$end]) & $flag_mask); $end += 4; } $post_string = substr($input_string, $offset, $end - $offset); $offset = $end; return $post_string; } /** * Decoded a sequence of positive integers from a string that has been * encoded using Modified 9 * * @param string $input_string string to decode from * @param int &$offset where to string in the string, after decode * points to where one was after decoding. * @return array sequence of positive integers that were decoded * @see encodeModified9 */ function decodeModified9($input_string, &$offset) { $post_string = nextPostString($input_string, $offset); return call_user_func_array( "array_merge", array_map(C\NS_LIB . "unpackListModified9", unpack("N*", $post_string))); } if (!extension_loaded("yioop") ) { /** * Decode a single word with high two bits off according to modified 9 * * @param string $encoded_list four byte string to decode * @return array sequence of integers that results from the decoding. */ function unpackListModified9($encoded_list) { switch ($encoded_list & 0x30000000) { case 0: return [$encoded_list & 0x0FFFFFFF]; //lop off high nibble break; case 0x10000000: $encoded_list &= 0xEFFFFFFF; $num_bits = 14; $num_elts = 2; $mask = 0x3FFF; $shift = 14; break; case 0x20000000: $encoded_list &= 0xDFFFFFFF; $num_bits = 9; $num_elts = 3; $mask = 0x1FF; $shift = 18; break; default: $MOD9_NUM_BITS_CODES = Mod9Constants::$MOD9_NUM_BITS_CODES; $MOD9_NUM_ELTS_DECODES = Mod9Constants::$MOD9_NUM_ELTS_DECODES; $int_string = packInt($encoded_list); $first_char = ord($int_string[0]); foreach ($MOD9_NUM_BITS_CODES as $code => $num_bits) { if (($first_char & $code) == $code) { break; } } $num_elts = $MOD9_NUM_ELTS_DECODES[$code]; $mask = (1 << $num_bits) - 1; $int_string[0] = chr($first_char - $code); $encoded_list = unpackInt($int_string); } $decoded_list = []; for ($i = 0; $i < $num_elts; $i++) { if (($pre_elt = $encoded_list & $mask) == 0) { break; } array_unshift($decoded_list, $pre_elt); $encoded_list >>= $num_bits; } return $decoded_list; } /** * Given an int encoding encoding a doc_index followed by a position * list using Modified 9, extracts just the doc_index. * * @param int $encoded_list in the just described format * @return int a doc index into an index shard document map. */ function docIndexModified9($encoded_list) { $t26 = 2 << 26; switch ($encoded_list & 0x30000000) { case 0: $encoded_list &= 0x0FFFFFFF; //lop off high nibble return (($encoded_list & $t26) > 0) ? ($encoded_list - $t26 + ($encoded_list & 0x1FF)) >> 9 : $encoded_list - 1; break; case 0x10000000: $encoded_list &= 0xEFFFFFFF; $num_bits = 14; $mask = 0x3FFF; $shift = 14; break; case 0x20000000: $encoded_list &= 0xDFFFFFFF; $num_bits = 9; $mask = 0x1FF; $shift = 18; break; default: $MOD9_NUM_BITS_CODES = Mod9Constants::$MOD9_NUM_BITS_CODES; $MOD9_NUM_ELTS_DECODES = Mod9Constants::$MOD9_NUM_ELTS_DECODES; $first_char = $encoded_list >> 24; foreach ($MOD9_NUM_BITS_CODES as $code => $num_bits) { if (($first_char & $code) == $code) break; } $num_elts = $MOD9_NUM_ELTS_DECODES[$code]; $mask = (1 << $num_bits) - 1; $shift = $num_bits * ($num_elts - 1); $int_string = packInt($encoded_list); $int_string[0] = chr($first_char - $code); $encoded_list = (int)hexdec(bin2hex($int_string)); } do { if ($doc_index = (($encoded_list >> $shift) & $mask)) { $doc_index -= (($doc_index & $t26) > 0) ? $t26 + ($doc_index & 0x1FF) : 1; return $doc_index; } $shift -= $num_bits; } while($shift >= 0); return $doc_index; //shouldn't get here } /** * Unpacks an int from a 4 char string * * @param string $str where to extract int from * @return int extracted integer */ function unpackInt($str) { if (is_string($str)) { return (int)hexdec(bin2hex($str)); } return false; } }// end extension_loaded check /** * Packs an int into a 4 char string * * @param int $my_int the integer to pack * @return string the packed string */ function packInt($my_int) { return pack("N", $my_int); } /** * Unpacks a float from a 4 char string * * @param string $str where to extract int from * @return float extracted float */ function unpackFloat($str) { if (!is_string($str)) return false; $tmp = unpack("G", $str); return $tmp[1]; } /** * Packs an float into a four char string * * @param float $my_float the float to pack * @return string the packed string */ function packFloat($my_float) { return pack("G", $my_float); } /** * Used to change the namespace of a serialized php object (assumes doesn't * have nested subobjects) * * @param string $class_name new fully qualified name with namespace * @param string $object_string serialized object * * @return string serialized object with new name */ function renameSerializedObject($class_name, $object_string) { /* number of digits in the length of name of the object needs to be less than 12 digits (probably more like 4) for this to work. */ $name_length = intval(substr($object_string, 2, 14)); $name_space_info_length = strlen("O:".$name_length.":") + $name_length + 2; // 2 for quotes; $object_string = 'O:' . strlen($class_name) . ':"'. $class_name.'"' . substr($object_string, $name_space_info_length); return $object_string; } /** * Parses a provided string to make a DOM object. First tries to parse * using XML and if this fails uses the more robust HTML Dom parser * and manipulates the resulting DOM tree to make correspond to original * tags for XML that isn't HTML * * @param string $to_parse the string to parse a DOMDocument from * @return DOMDocument computed based on the provided string */ function getDomFromString($to_parse) { set_error_handler(null); $dom = new \DOMDocument("1.0"); $dom->formatOutput = true; $dom->preserveWhiteSpace = false; if (stristr($to_parse, "<html") === false && !empty($to_parse)) { @$dom->loadXML($to_parse); } if ($dom->documentElement == null) { $dom = new \DOMDocument("1.0"); $dom->formatOutput = true; $dom->preserveWhiteSpace = false; //this hack modified from php.net @$dom->loadHTML('<?xml encoding="UTF-8">' . $to_parse); foreach ($dom->childNodes as $item) { if ($item->nodeType == XML_PI_NODE) { $dom->removeChild($item); // remove hack } } $dom->encoding = 'UTF-8'; if (!empty($dom->documentElement->firstChild) && !empty($dom->documentElement->firstChild->tagName) && $dom->documentElement->firstChild->tagName == 'body' && !empty($dom->documentElement->firstChild->firstChild)) { $node = $dom->documentElement->firstChild->firstChild; $to_parse = $dom->saveXML($node); if (!empty($to_parse)) { $dom = new \DOMDocument("1.0"); $dom->formatOutput = true; $dom->preserveWhiteSpace = false; @$dom->loadXML($to_parse); if ($dom->documentElement == null) { $dom = new \DOMDocument("1.0"); $dom->formatOutput = true; @$dom->loadHTML($to_parse); } } } } set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); return $dom; } /** * Returns an array of DOMDocuments for the nodes that match an xpath * query on $dom, a DOMDocument * * @param DOMDocument $dom document to run xpath query on * @param string $query xpath query to run * @return array of DOMDocuments one for each node matching the * xpath query in the original DOMDocument */ function getTags($dom, $query) { $nodes = []; if (is_a($dom, "DOMDocument")) { $dom_xpath = new \DOMXPath($dom); if (!$dom_xpath) { return []; } $tags = $dom_xpath->query($query); } if (empty($tags) || $tags->length == 0) { if(!empty($dom->textContent)) { $redom = new \DOMDocument; @$redom->loadHTML($dom->textContent); $dom_xpath = new \DOMXPath($redom); if (!$dom_xpath) { return []; } $tags = $dom_xpath->query($query); } if (empty($tags) || $tags->length == 0) { return []; } } $i = 0; while ($item = $tags->item($i)) { $tmp_dom = new \DOMDocument; $tmp_dom->formatOutput = true; $node = $tmp_dom->importNode($item, true); $tmp_dom->appendChild($node); $nodes[] = $tmp_dom; $i++; } return $nodes; } /** * Converts a string to string where each char has been replaced by its * hexadecimal equivalent * * @param string $str what we want rewritten in hex * @return string the hexified string */ function toHexString($str) { $str ??= ""; if (!is_string($str)) { trigger_error("toHexString called with non-string:" . serialize($str)); return ""; } $out = ""; for ($i = 0; $i < strlen($str); $i++) { $out .= sprintf("%02X",ord($str[$i]))." "; } return $out; } /** * Converts a string to string where each char has been replaced by a Integer * equivalent * * @param string $str what we want rewritten in hex * @return string the hexified string */ function toIntString($str) { if (!is_string($str)) { trigger_error("toIntString called with non-string:" . serialize($str)); return ""; } $out = ""; for ($i = 0; $i < strlen($str); $i++) { $out .= sprintf("%03u",ord($str[$i]))." "; } return $out; } /** * Converts a string to string where each char has been replaced by its * binary equivalent * * @param string $str what we want rewritten in hex * @return string the binary string */ function toBinString($str) { if (!is_string($str)) { trigger_error("toBinString called with non-string:" . serialize($str)); return ""; } $out = ""; for ($i = 0; $i < strlen($str); $i++) { $out .= substr(decbin(256+ord($str[$i])), 1)." "; } return $out; } /** * Converts a string of the form some int followed by K, M, or G. * into its integer equivalent. For example 4K would become 4000, * 16M would become 16000000, and 1G would become 1000000000 * Note not using base 2 for K, M, G * * @param string $metric_num metric number to convert * @return int number the metric string corresponded to */ function metricToInt($metric_num) { $num = intval($metric_num); if (is_string($metric_num)) { $last_char = $metric_num[strlen($metric_num) - 1]; switch (strtoupper($last_char)) { case "K": $num *= 1000; break; case "M": $num *= pow(1000, 2); break; case "G": $num *= pow(1000, 3); break; case "T": $num *= pow(1000, 4); break; } } return $num; } /** * Converts a number to a string followed by nothing, K, M, G, T * depending on whether number is < 1000, < 10^6, < 10^9, or < 10^(12) * * @param int $num number to convert * @return string number the metric string corresponded to */ function intToMetric($num) { if (is_int($num)) { $metric_letters = ["", "K", "M", "G", "T", "P", "E"]; $power = max(intval(log($num, 1000)), 0); $power = (isset($metric_letters[$power])) ? $power : 6; $num = round($num / pow(1000, $power)). $metric_letters[$power]; } return $num; } /** * Logs a message to a logfile or the screen. The super-global field * $_SERVER['LOG_TO_FILES'] determines if this will log to a file. If not, * then in cli mode, will log to stdout, otherwise it will use error_log. When * logging to file $_SERVER["NO_ROTATE_LOGS"] controls whether or not there * will be a log file rotation. The first call to this method is typically used * to set up a process to check for liveness. For example a call: * crawlLog("\n\nInitialize logger..", $this->process_name, true); * says $this->process_name should be checked for liveness as part of any * subsequent logging activity such as a call crawlLog("Another Message"); * (note subsequent call don't need to specify the process name). * * @param string $msg message to log. If empty then no message written * @param string $lname name of log file in the LOG_DIR directory, rotated logs * will also use this as their basename followed by a number followed by * gzipped (since they are gzipped (older versions of Yioop used bzip * Some distros don't have bzip but do have gzip. Also gzip was * being used elsewhere in Yioop, so to remove the dependency bzip was * replaced )). * @param bool $check_process_handler by default set to false. After the first * time set to true, as long as in subsequent calls set to false, * processHandler will be called to check how long the code has run * since the last time processHandler called. */ function crawlLog($msg, $lname = null, $check_process_handler = false) { static $logname; static $last_check_time = 0; static $check_handler = false; if (!empty($_SERVER["NO_LOGGING"])) { return; } if ($lname != null) { $logname = $lname; } else if (!isset($logname)) { $logname = "message"; } if ($check_process_handler) { $check_handler = $check_process_handler; } $out_msg = makeTimestamp() . " $msg"; if (!empty($_SERVER["LOG_TO_FILES"])) { $pre_logfile = C\LOG_DIR . "/$logname"; $logfile = "$pre_logfile.log"; clearstatcache(); //hopefully, this doesn't slow things too much if (empty($_SERVER["NO_ROTATE_LOGS"]) && file_exists($logfile) && filesize($logfile) > C\MAX_LOG_FILE_SIZE) { $last_logfile = "$pre_logfile." . C\NUMBER_OF_LOG_FILES . ".log.gz"; if (file_exists($last_logfile)) { unlink($last_logfile); } for ($i = C\NUMBER_OF_LOG_FILES; $i > 0; $i--) { $previous_logfile = "$pre_logfile.".($i-1).".log.gz"; if (file_exists($previous_logfile)) { rename($previous_logfile, "$pre_logfile.$i.log.gz"); } } file_put_contents("$pre_logfile.0.log.gz", gzencode(file_get_contents($logfile))); file_put_contents($logfile, ""); } //don't use error_log options in this case to happify hiphop4php file_put_contents($logfile, $out_msg . "\n", FILE_APPEND); if (changeInMicrotime($last_check_time) > 5 && $check_handler && !$check_process_handler) { $continue = ($logname == 'index') ? true : false; CrawlDaemon::processHandler($continue); $last_check_time = microtime(true); } } else if (php_sapi_name() != 'cli' && (!C\nsdefined("IS_OWN_WEB_SERVER") || (!empty($_SERVER['LOG_TO_FILES']) && $_SERVER['LOG_TO_FILES'] == "terminal"))) { error_log($out_msg."\n"); } else { file_put_contents("php://stdout", $out_msg . "\n", FILE_APPEND); } return; } /** * Used to make a log file entry time string of format: entry number, time in r * format. * @param int $time a unix timestamp * @return string [line_count_in_log r_formatted_date] */ function makeTimestamp($time = -1) { static $count = 0; $count = ($count + 1) % 1000000000; $time = ($time == -1) ? time() : $time; $time_string = date("r", $time); return "[$count $time_string]"; } /** * Writes a log message $msg if more than LOG_TIMEOUT time has passed since * the last time crawlTimeoutLog was called. Useful in loops to write a message * as progress is made through the loop (but not on every iteration, but * say every 30 seconds). * * @param mixed $msg usually a string with what to be printed out after the * timeout period. If $msg === true then clears the timeout cache * @return bool whether a log message was written */ function crawlTimeoutLog($msg) { static $cache_time = 0; if (changeInMicrotime($cache_time) < C\LOG_TIMEOUT) { return false; } if (func_num_args() > 1) { $out_msg = call_user_func_array('sprintf', func_get_args()); } else { $out_msg = & $msg; } crawlLog($out_msg . " Current memory usage:".memory_get_usage()); $cache_time = microtime(true); return true; } /** * Computes an 8 byte hash of a string for use in storing documents. * * An eight byte hash was chosen so that the odds of collision even for * a few billion documents via the birthday problem are still reasonable. * If the raw flag is set to false then an 11 byte base64 encoding of the * 8 byte hash is returned. The hash is calculated as the xor of the * two halves of the 16 byte md5 of the string. (8 bytes takes less storage * which is useful for keeping more doc info in memory) * * @param string $string the string to hash * @param bool $raw whether to leave raw or base 64 encode * @return string the hash of $string */ function crawlHash($string, $raw = false) { $pre_hash = md5($string, true); $left = substr($pre_hash, 0, 8); $right = substr($pre_hash, 8, 8); $combine = $right ^ $left; if (!$raw) { $hash = base64Hash($combine); // common variant of base64 safe for urls and paths } else { $hash = $combine; } return $hash; } /** * Used to create a 20 byte hash of a string (typically a word or phrase * with a wikipedia page). Format is 8 byte crawlHash of term (md5 of term * two halves XOR'd), followed by a \x00, followed by the first 11 characters * from the term. If there are not enough char's to make 20 bytes, then the * string is padded with \x00s to 20bytes. * * @param string $string word to hash * @param bool $raw whether to base64Hash the result * @return string first 8 bytes of md5 of $string concatenated with \x00 * to indicate the hash is of a word not a phrase concatenated with the * padded to 11 byte $meta_string. */ function crawlHashWord($string, $raw = false) { $pre_hash = substr(md5($string, true), 0, 8) . "\x00" . substr($string, 0, 11); $pre_hash = str_pad($pre_hash, 20, "\x00"); if (!$raw) { $hash = base64Hash($pre_hash); // common variant of base64 safe for urls and paths } else { $hash = $pre_hash; } return $hash; } /** * Take a $term that might have come from adocuments and converts it to * a string of 16 bytes which is either the original term padded by * underscores or the first seven chars of the term followed by an underscore * followed by the base64 encoding of the first 6 chars of its md5 hash. * Base64 used to make this all nice and printable. * @param string $term to made into a canonical form * @return string canonicalize by apbove version of term. */ function canonicalTerm($term) { $term = encodeUnderscore($term); $term = rawurlencode($term); $term = str_replace("%", "", $term); $hash = (strlen($term) < 16) ? str_pad($term, 16, "_") : substr($term, 0, 7) . "_" . base64Hash( substr(md5($term, true), 0, 6)); return $hash; } /** * Used to compare to ids for index dictionary lookup. ids * are a 8 byte crawlHash together with 12 byte non-hash suffix. * * @param string $id1 20 byte word id to compare * @param string $id2 20 byte word id to compare * @return int negative if $id1 smaller, positive if bigger, and 0 if * same */ function compareWordHashes($id1, $id2) { return strcmp($id1, $id2); } /** * Converts a crawl hash number to something closer to base64 coded but * so doesn't get confused in urls or DBs * * @param string $string a hash to base64 encode * @return string the encoded hash */ function base64Hash($string) { return strtr(rtrim(base64_encode($string), "="), ["/" => "_", "+" => "-"]); } /** * Decodes a crawl hash number from base64 to raw ASCII * * @param string $base64 a hash to decode * @return string the decoded hash */ function unbase64Hash($base64) { //get rid of out modified base64 encoding return base64_decode(strtr($base64, ["_" => "/", "-" => "+"]) . "="); } /** * Encodes a string in a format suitable for post data * (mainly, base64, but str_replace data that might mess up post in result) * * @param string $str string to encode * @return string encoded string */ function webencode($str) { return strtr(base64_encode($str), ["/" => "_", "=" => "~", "+" => "."]); } /** * Decodes a string encoded by webencode * * @param string $str string to encode * @return string encoded string */ function webdecode($str) { return base64_decode(strtr($str ?? "", ["." => "+", "~" => "=", "_" => "/"])); } /** * The crawlHash function is used to encrypt passwords stored in the database. * It tries to use the best version the Blowfish variant of php's crypt * function available on the current system. * * @param string $string the string to encrypt * @param int $salt salt value to be used (needed to verify if a password is * valid) * @return string the crypted string where crypting is done using crawlHash */ function crawlCrypt($string, $salt = null) { if ($salt !== null) { return crypt($string, $salt); } /* The length of the salt and its starting prefix say which hash public function crypt uses. Blowfish's begins with $2a$, $2x$, or $2y$ followed by the base 2 logarithm of the iteration count for blowfish. We us the more secure 2y. */ $salt = '$2y$12$'; if (function_exists('random_bytes')) { $salt .= strtr(base64_encode(random_bytes(16)), '+', '.'); } else if (function_exists('mcrypt_create_iv')) { $salt .= strtr(base64_encode(mcrypt_create_iv(16, MCRYPT_DEV_URANDOM)), '+', '.'); } else { $salt .= substr(str_replace('+', '.', base64_encode( pack('N4', mt_rand(), mt_rand(), mt_rand(), mt_rand()))), 0, 22); } return crypt($string, $salt); } /** * Used by a controller to take a table and return those rows in the * table that a given queue_server would be responsible for handling * * @param array $table an array of rows of associative arrays which * a queue_server might need to process * @param string $field column of $table whose values should be used * for partitioning * @param int $num_partition number of queue_servers to choose between * @param int $instance the id of the particular server we are interested * in * @param object $callback function or static method that might be * applied to input before deciding the responsible queue_server. * For example, if input was a url we might want to get the host * before deciding on the queue_server * @return array the reduced table that the $instance queue_server is * responsible for */ function partitionByHash($table, $field, $num_partition, $instance, $callback = null) { $out_table = []; if (is_array($table)) { foreach ($table as $row) { $cell = ($field === null) ? $row : $row[$field]; $hash_int = calculatePartition($cell, $num_partition, $callback); if ($hash_int == $instance) { $out_table[] = $row; } } } return $out_table; } /** * Used by a controller to say which queue_server should receive * a given input * @param string $input can view as a key that might be processes by a * queue_server. For example, in some cases input might be * a url and we want to determine which queue_server should be * responsible for queuing that url * @param int $num_partition number of queue_servers to choose between * @param object $callback function or static method that might be * applied to input before deciding the responsible queue_server. * For example, if the input was a url we might want to get the host * before deciding on the queue_server * @return int id of server responsible for input */ function calculatePartition($input, $num_partition, $callback = null) { if ($callback !== null) { $callback_parts = explode("::", $callback); if (count($callback_parts) == 1) { $input = $callback($input); } else { $class_name = $callback_parts[0]; $method_name = $callback_parts[1]; $tmp_class = new $class_name; $input = $tmp_class->$method_name($input); } } $hash_int = abs(unpackInt(substr(crawlHash($input, true), 0, 4))) % $num_partition; return $hash_int; } /** * Measures the change in time in seconds between two timestamps to microsecond * precision * * @param string $start starting time with microseconds * @param string $end ending time with microseconds, if null use current time * @return float time difference in seconds */ function changeInMicrotime($start, $end = null) { if ( !$end ) { $end = microtime(true); } return $end - $start; } /** * Timestamp of current epoch with microsecond precision useful for situations * where time() might cause too many collisions (account creation, etc) * @return string timestamp to microsecond of time in second since start of * current epoch */ function microTimestamp() { return vsprintf('%d.%06d', gettimeofday()); } /** * Checks that a timestamp is within the time interval given by a * start time (HH:mm) and a duration * * @param string $start_time string of the form (HH:mm) * @param string $duration string containing an int in seconds * @param int $time a Unix timestamp. * @return int -1 if the time of day of $time is not within the given interval. * Otherwise, the Unix timestamp at which the interval will be over for * the same day as $time. */ function checkTimeInterval($start_time, $duration, $time = -1) { $duration = intval($duration); if ($duration <= 0 ) { return -1; } if (intval($time) < 0 ) { $time = time(); } $today = date('Y-m-d', $time); $timezone_offset = date('P', $time); $start_time = trim($start_time); if (!preg_match("/\d\d\:\d\d/", $start_time)) { $start_time = "00:00"; } $start_timestamp = strtotime($today . "T" . $start_time . ":00". $timezone_offset); $end_timestamp = $start_timestamp + $duration; if ($time >= $start_timestamp && $time <= $end_timestamp) { return $end_timestamp; } return -1; } /** * Converts a CSS unit string into its equivalent in pixels. This is * used by @see SvgProcessor. * * @param string $value a number followed by a legal CSS unit * @return int a number in pixels */ function convertPixels($value) { $len = strlen($value); if (is_int($value) || $len < 2) { return intval($value); } if ($value[$len - 1] == "%") { $num = floatval(substr($value, 0, $len - 1)); return ($num > 0) ? floor(8 * min(100, $num)) : 0; } $num = floatval(substr($value, 0, $len - 2)); $unit = substr($value, $len - 2); switch ($unit) { case "cm": case "pt": return intval(28 * $num); break; case "em": case "pc": return intval(6 * $num); break; case "ex": return intval(12 * $num); break; case "in": //assume screen 72 dpi as on mac return intval(72 * $num); break; case "mm": return intval(2.8 * $num); break; case "px": return intval($num); break; default: $num = $value; } return intval($num); } /** * Returns the number of files in a folder * @param string $folder path to folder to count * @return int number of files */ function countFiles($folder) { $file_iterator = new \FilesystemIterator($folder); return iterator_count($file_iterator); } /** * Creates folders along a filesystem path if they don't exist * * @param string $path a file system path * @return bool success or failure */ function makePath($path) { $path_parts = explode("/", $path); $num_parts = count($path_parts); $path_so_far = ""; for ($i = 0; $i < $num_parts; $i++) { $path_so_far .= $path_parts[$i] . "/"; if (!is_dir($path_so_far)) { if (file_exists($path_so_far) || !mkdir($path_so_far) ) { return false; } } } return true; } /** * This is a callback function used in the process of recursively deleting a * directory * * @param string $file_or_dir the filename or directory name to be deleted * @see DatasourceManager::unlinkRecursive() */ function deleteFileOrDir($file_or_dir) { if (file_exists($file_or_dir)) { if (is_file($file_or_dir)) { unlink($file_or_dir); } else { rmdir($file_or_dir); } } } /** * This is a callback function used in the process of recursively chmoding to * 777 all files in a folder * * @param string $file the filename or directory name to be chmod * @see DatasourceManager::setWorldPermissionsRecursive() */ function setWorldPermissions($file) { if (php_sapi_name() == 'cli') { chmod($file, 0777); return; } set_error_handler(null); chmod($file, 0777); set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); } /** * This is a callback function used in the process of recursively calculating * an array of file modification times and files sizes for a directory * * @param string $file a name of a file in the file system * @return an array whose single element contain an associative array * with the size and modification time of the file */ function fileInfo($file) { $info["name"] = $file; $info["size"] = filesize($file); $info["is_dir"] = is_dir($file); $info["modified"] = filemtime($file); return [$info]; } //ordering functions used in sorting /** * Callback function used to sort documents by a field * * Should be initialized before using in usort with a call * like: orderCallback($tmp, $tmp, "field_want"); * * @param string $word_doc_a doc id of first document to compare * @param string $word_doc_b doc id of second document to compare * @param string $order_field which field of these associative arrays to sort by * @return int -1 if first doc bigger 1 otherwise */ function orderCallback($word_doc_a, $word_doc_b, $order_field = null) { static $field = "a"; if ($order_field !== null) { $field = $order_field; return -1; } return ((float)$word_doc_a[$field] > (float)$word_doc_b[$field]) ? -1 : 1; } /** * Callback function used to sort documents by a field where field is assume to * be a string * * Should be initialized before using in usort with a call * like: stringOrderCallback($tmp, $tmp, "field_want"); * * @param string $word_doc_a doc id of first document to compare * @param string $word_doc_b doc id of second document to compare * @param string $order_field which field of these associative arrays to sort by * @return int -1 if first doc smaller 1 otherwise */ function stringOrderCallback($word_doc_a, $word_doc_b, $order_field = null) { static $field = "a"; if ($order_field !== null) { $field = $order_field; return -1; } return ((string)$word_doc_a[$field] > (string)$word_doc_b[$field]) ? -1 : 1; } /** * Callback function used to sort documents by a field where field is assume to * be a string * * Should be initialized before using in usort with a call * like: stringROrderCallback($tmp, $tmp, "field_want"); * * @param string $word_doc_a doc id of first document to compare * @param string $word_doc_b doc id of second document to compare * @param string $order_field which field of these associative arrays to sort by * @return int -1 if first doc bigger 1 otherwise */ function stringROrderCallback($word_doc_a, $word_doc_b, $order_field = null) { static $field = "a"; if ($order_field !== null) { $field = $order_field; return -1; } return ((string)$word_doc_a[$field] < (string)$word_doc_b[$field]) ? -1 : 1; } /** * Callback function used to sort documents by a field in reverse order * * Should be initialized before using in usort with a call * like: rorderCallback($tmp, $tmp, "field_want"); * * @param string $word_doc_a doc id of first document to compare * @param string $word_doc_b doc id of second document to compare * @param string $order_field which field of these associative arrays to sort by * @return int 1 if first doc bigger -1 otherwise */ function rorderCallback($word_doc_a, $word_doc_b, $order_field = null) { static $field = "a"; if ($order_field !== null) { $field = $order_field; return -1; } return ((float)$word_doc_a[$field] > (float)$word_doc_b[$field]) ? 1 : -1; } /** * Callback to check if $a is less than $b * * Used to help sort document results returned in PhraseModel called * in IndexArchiveBundle * * @param float $a first value to compare * @param float $b second value to compare * @return int -1 if $a is less than $b; 1 otherwise * @see IndexArchiveBundle::getSelectiveWords() * @see PhraseModel::getPhrasePageResults() */ function lessThan($a, $b) { if ($a == $b) { return 0; } return ($a < $b) ? -1 : 1; } /** * Callback to check if $a is greater than $b * * Used to help sort document results returned in PhraseModel called in * IndexArchiveBundle * * @param float $a first value to compare * @param float $b second value to compare * @return int -1 if $a is greater than $b; 1 otherwise * @see IndexArchiveBundle::getSelectiveWords() * @see PhraseModel::getTopPhrases() */ function greaterThan($a, $b) { if ($a == $b) { return 0; } return ($a > $b) ? -1 : 1; } /** * shorthand for echo * * @param string $text string to send to the current output */ function e($text) { echo $text; } /** * Compute the real remote address of the incoming connection * including forwarding */ function remoteAddress() { return (!empty($_SERVER['HTTP_X_FORWARDED_FOR'])) ? $_SERVER['HTTP_X_FORWARDED_FOR'] : ( (!empty($_SERVER['REMOTE_ADDR'])) ? $_SERVER['REMOTE_ADDR'] : "127.0.0.1"); } /** * Used to read a line of input from the command-line * @return string from the command-line */ function readInput() { $stdin = fopen('php://stdin', 'r'); $line = fgets($stdin); $line = trim($line); fclose($stdin); return $line; } /** * Used to read a line of input from the command-line * (on unix machines without echoing it) * @return string from the command-line */ function readPassword() { system('stty -echo'); $line = readInput(); if (!strstr(PHP_OS, "WIN")) { e(str_repeat("*", strlen($line))."\n"); } system('stty echo'); return $line; } /** * Used to read a several lines from the terminal up until * a last line consisting of just a "." * @return string from the command-line */ function readMessage() { $message = ""; $line = ""; do { $message .= $line; $line = readInput()."\n"; } while(rtrim($line) != "."); return rtrim($message); } /** * Returns the mime type of the provided file name if it can be determined. * * @param string $file_name (name of file including path to figure out * mime type for) * @param bool $use_extension whether to just try to guess from the file * extension rather than looking at the file * @return string mime type or unknown if can't be determined */ function mimeType($file_name, $use_extension = false) { return WebSite::mimeType($file_name, $use_extension); } /** * Checks if class_1 is the same as class_2 or has class_2 as a parent * Behaves like 3 param version (last param true) of PHP is_a function * that came into being with Version 5.3.9. * * @param mixed $class_1 object or string class name to see if in class2 * @param mixed $class_2 object or string class name to see if contains class1 * @return bool equal or contains class */ function generalIsA($class_1, $class_2) { if ($class_1 == $class_2) { return true; } return (is_a($class_1, $class_2) || is_subclass_of($class_1, $class_2)); } /** * Given the contents of a start XML/HMTL tag strips out all the attributes * non listed in $safe_attribute_list * * @param string $start_tag_contents the contents of an HTML/XML tag. I.e., * if the tag was <tag stuff> then $start_tag_contents could be stuff * @param array $safe_attribute_list a list of attributes which should be kept * @return string containing only safe attributes and their values */ function stripAttributes($start_tag_contents, $safe_attribute_list = []) { $out = ""; if ($safe_attribute_list != []) { $safe_regex = '/(?:(?:\A|\s+)(?:'; $first = ""; foreach ($safe_attribute_list as $attribute) { $safe_regex .= $first . $attribute; $first = "|"; } $safe_regex .= ')\s*=\s*(?:"[^"]+"|'."'[^']+'))/"; preg_match_all($safe_regex, $start_tag_contents, $matches); if (isset($matches[0])) { foreach ($matches[0] as $attribute) { $out .= " ".$attribute; } if ($out) { $out = trim($out); } } } return $out; } /** * Used to parse into a two dimensional array a string that contains CSV data. * @param string $csv_string string with csv data * @return array two dimensional array of elements from csv */ function parseCsv($csv_string) { $lines = str_getcsv($csv_string, "\n"); $out_lines = []; foreach ($lines as $line) { $out_lines[] = str_getcsv($line); } return $out_lines; } /** * Converts an array of values to a comma separated value formatted string. * * @param array $arr values to convert * @return string CSV string after conversion */ function arraytoCsv($arr) { $fp = fopen('php://temp', 'r+'); fputcsv($fp, $arr); rewind($fp); $data = fgets($fp); fclose($fp); return rtrim($data, "\n"); } /** * Computes a Unix-style diff of two strings. That is it only * outputs lines which disagree between the two strings. It outputs +line * if a line occurs in the second but not first string and -line if a * line occurs in the first string but not the second. * * @param string $data1 first string to compare * @param string $data2 second string to compare * @param bool $html whether to output html highlighting * @return string representing info about where $data1 and $data2 don't match */ function diff($data1, $data2, $html = false) { if ($html) { $start = "<div>"; $start_same = "<div class='light-gray'>"; $start1 = "<div class='red'>"; $start2 = "<div class='green'>"; $end = "</div>"; } else { $start = ""; $start_same = ""; $start1 = ""; $start2 = ""; $end = ""; } $lines1 = explode("\n", $data1); $lines2 = explode("\n", $data2); $num_lines1 = count($lines1); $num_lines2 = count($lines2); $shorter_len = min($num_lines1, $num_lines2); $longer_len = max($num_lines1, $num_lines2); $first_diff = 0; // trim off the starts and end lines of the strings that are same $head_lcs = []; while($first_diff < $shorter_len && strcmp($lines1[$first_diff], $lines2[$first_diff]) == 0) { $head_lcs[] = [$first_diff, $first_diff, $lines1[$first_diff]]; $first_diff++; } if ($first_diff == $shorter_len) { if ($num_lines1 == $num_lines2) { return ""; } else { $tmp = $lines1; $prefix = "$start1-"; if ($num_lines1 == $shorter_len) { $tmp = $lines2; $prefix = "$start2+"; } $out = "$start@@ -$shorter_len,0 +$shorter_len,". "$longer_len @@$end\n"; for ($i = $shorter_len; $i < $longer_len; $i++) { $out .= $prefix . $tmp[$i]."$end\n"; } return $out; } } $last_diff = 0; $tail_lcs = []; $index1 = $num_lines1 - 1 - $last_diff; $index2 = $num_lines2 - 1 - $last_diff; while($shorter_len - $last_diff > $first_diff && $lines1[$index1] == $lines2[$index2]) { array_unshift($tail_lcs, [$index1, $index2, $lines1[$index1]]); $last_diff++; $index1--; $index2--; } $trim_lines1 = array_slice($lines1, $first_diff, -$last_diff); $trim_lines2 = array_slice($lines2, $first_diff, -$last_diff); /* To compute a diff, we first compute the LCS = Longest common subsequence of the two string. */ $lcs = computeLCS($trim_lines1, $trim_lines2, $first_diff); $lcs = array_merge($head_lcs, $lcs, $tail_lcs); $previous_first = -1; $previous_second = -1; $current_first = 0; $current_second = 0; $old_line = ""; $out_string = ""; if ($lcs == []) { $out_string .= "$start@@ -0,$num_lines1 ". " +0,$num_lines2 @@$end\n"; for ($i = 0; $i < $num_lines1; $i++) { $out_string .= "$start1-" . $lines1[$i] . "$end\n"; } for ($i = 0; $i < $num_lines2; $i++) { $out_string .= "$start2+" . $lines2[$i] . "$end\n"; } } else { foreach ($lcs as $lcs_item) { list($current_first, $current_second, $line) = $lcs_item; $gap1 = $current_first - $previous_first; $gap2 = $current_second - $previous_second; if ($gap1 > 1 || $gap2 > 1) { $gap1++; $gap2++; $out_string .= "$start@@ -$previous_first,$gap1 ". " +$previous_second,$gap2 @@$end\n"; $out_string .= "$start_same ".$old_line."$end\n"; for ($i = $previous_first + 1; $i < $current_first; $i++) { $out_string .= "$start1-" . $lines1[$i] . "$end\n"; } for ($i = $previous_second + 1; $i < $current_second; $i++) { $out_string .= "$start2+" . $lines2[$i] . "$end\n"; } $out_string .= "$start_same ".$line."$end\n"; } $previous_first = $current_first; $previous_second = $current_second; $old_line = $line; } if ($current_first < $num_lines1 - 1 || $current_second < $num_lines2 - 1) { $gap1 = $num_lines1 - $current_first; $gap2 = $num_lines2 - $current_second; $out_string .= "$start_same ".$line."$end\n"; for ($i = $current_first + 1; $i < $num_lines1; $i++) { $out_string .= "$start1-" . $lines1[$i] . "$end\n"; } for ($i = $current_second + 1; $i < $num_lines2; $i++) { $out_string .= "$start2+" . $lines2[$i] . "$end\n"; } } } return $out_string; } /** * Computes the longest common subsequence of two arrays * * @param array $lines1 an array of lines to compute LCS of * @param array $lines2 an array of lines to compute LCS of * @param int $offset an offset to shift over array addresses in output by */ function computeLCS($lines1, $lines2, $offset = 0) { /* add a dummy line so don't have to worry about shifting indices in CLRS pseudo-code implementation */ $num_lines1 = count($lines1); $num_lines2 = count($lines2); array_unshift($lines1, 0); array_unshift($lines2, 0); /* LCS = Longest common subsequence of the two string. The code below is based off the pseudo-code in CLRS */ $lcs_moves = []; $lcs_values = []; for ($i = 1; $i <= $num_lines1; $i++) { //initialize first column $lcs_values[$i][0] = 0; } for ($j = 0; $j <= $num_lines2; $j++) { //initialize first column $lcs_values[0][$j] = 0; } $lcs_moves = []; for ($i = 1; $i <= $num_lines1; $i++) { for ($j = 1; $j <= $num_lines2; $j++) { if ($lines1[$i] == $lines2[$j]) { $lcs_values[$i][$j] = $lcs_values[$i - 1][$j - 1] + 1; $lcs_moves[$i][$j] = "d"; //diagonal } elseif ($lcs_values[$i - 1][$j] >= $lcs_values[$i][$j - 1]) { $lcs_values[$i][$j] = $lcs_values[$i - 1][$j]; $lcs_moves[$i][$j] = "u"; // up } else { $lcs_values[$i][$j] = $lcs_values[$i][$j - 1]; $lcs_moves[$i][$j] = "l"; // left } } } $lcs = []; extractLCSFromTable($lcs_moves, $lines1, $num_lines1, $num_lines2, $offset, $lcs); return $lcs; } /** * Extracts from a table of longest common sequence moves (probably calculated * by @see computeLCS) and a starting coordinate $i, $j in that table, * a longest common subsequence * * @param array $lcs_moves a table of move computed by computeLCS * @param array $lines from first of the two arrays computing LCS of * @param int $i a line number in string 1 * @param int $j a line number in string 2 * @param int $offset a number to add to each line number output into $lcs. * This is useful if we have trimmed off the initially common lines from * our two strings we are trying to compute the LCS of * @param array &$lcs an array of triples * (index_string1, index_string2, line) * the indexes indicate the line number in each string, line is the line * in common the two strings */ function extractLCSFromTable($lcs_moves, $lines, $i, $j, $offset, &$lcs) { if ($i == 0 || $j == 0) { return []; } if ($lcs_moves[$i][$j] == "d") { //diagonal moves means common to both //sub-case first so forward order extractLCSFromTable($lcs_moves, $lines, $i - 1, $j - 1, $offset, $lcs); $lcs[] = [$i + $offset- 1, $j + $offset - 1, $lines[$i]]; } elseif ($lcs_moves[$i][$j] == "u") { // up move in matrix extractLCSFromTable($lcs_moves, $lines, $i - 1, $j, $offset, $lcs); } else { // left move in matrix extractLCSFromTable($lcs_moves, $lines, $i, $j - 1, $offset, $lcs); } } /** * Returns an array of the last $num_lines many lines our of a file * * @param string $file_name name of file to return lines from * @param string $num_lines number of lines to retrieve * @return array retrieved lines */ function tail($file_name, $num_lines) { $size = filesize($file_name); $max_line_len = 160; $offset = max(0, $size - $max_line_len * $num_lines); $file_string = file_get_contents($file_name, $offset); $lines = explode("\n", $file_string); $lines = array_slice($lines, -$num_lines); return $lines; } /** * Given an array of lines returns a subarray of those lines containing the * filter string or filter array * * @param string $lines to search * @param mixed $filters either string to filter lines with or an array of * strings (any of which can be present to pass the filter) * @param bool $case_insensitive whether search should be done case * insensitively or not. * @return array lines containing the string */ function lineFilter($lines, $filters, $case_insensitive = true) { $out_lines = []; $search_function = ($case_insensitive) ? "stripos" : "strpos"; if (is_string($filters)) { $filters = [$filters]; } foreach ($lines as $line) { foreach ($filters as $filter) { if ($search_function($line, $filter) !== false) { $out_lines[] = $line; break; } } } return $out_lines; } /** * Tries to extract a timestamp from a line which is presumed to come from * a Yioop log file * * @param string $line to search * @return int timestamp of that log entry */ function logLineTimestamp($line) { preg_match('/^\s*\[\s*\d+\s+(.*)\]/', $line, $matches); if (isset($matches[1])) { return @strtotime($matches[1]); } return 0; } /** * Returns whether an input can be parsed to a positive integer * * @param mixed $input * @return bool whether $input can be parsed to a positive integer. */ function isPositiveInteger($input) { return (is_int($input) && $input > 0) || (is_string($input) && preg_match("/^\d+$/", trim($input)) && intval($input) > 0); } /** * Used to measure the memory footprint in bytes and time spent calling a method * of an object. It also records number of time the method has been called. * Just calls the method without any recording or timing until * an initial call to the function measureCall(null, save_statistics_file) * where save_statistics_file is the name of the file you won't to store * statistics to. * * @param object $object name of object whose method we want to call and measure * @param string $method method we're calling * @param array $argument arguments to be sent to the method * @param string $call_name name to use when outputting stats for this call, * defaults to $method. * @return mixed whatever method would normally returned when called as above */ function measureCall($object, $method, $arguments = [], $call_name = "") { return measureObjectCall($object, $method, $arguments, $call_name); } /** * Used to measure the memory footprint of an object in Yioop and save * it to a statistics file * No recording is done until * an initial call to the function measureCall(null, save_statistics_file) * where save_statistics_file is the name of the file you won't to store * statistics to. * @param object $object name of object whose size we want to measure * @param string $save_file statistics file to write info to * @param string $call_name name of class to use for measuring. If empty * will call getClass to determine this * @param bool true if legitimate arguments supplied */ function measureObject($object, $save_file = "", $class_name = "") { return measureObjectCall($object, $save_file, [], $class_name); } /** * General method called by for @see measureCall and @see measureObject * Used to measure the memory footprint in bytes of an object or memory and * time spent calling a method of an object. It also records number of time * the method has been called. When used to call a method before initialization, * just calls the method without any recording or timing. To initialize, * an initial call to the function measureCall(null, save_statistics_file) * where save_statistics_file is the name of the file you won't to store * statistics to should be done. * * @param object $object name of object whose method we want to call and measure * @param string $method method we're calling * @param array $argument arguments to be sent to the method * @param string $call_name name to use when outputting stats for this call, * defaults to $method. * @return mixed whatever method would normally returned when called as above */ function measureObjectCall($object, $method, $arguments = [], $call_name = "") { static $record_file = ""; static $statistics = ["LAST_UPDATE" => 0, "METHODS" => [], "OBJECTS" => []]; if ($object == null && !empty($method)) { $record_file = $method; return true; } if (empty($method)) { if (empty($record_file) || (!is_object($object) && !is_array($object) && empty($call_name))) { return false; } if (!empty($call_name)) { $object_name = $call_name; } else { $object_name = get_class($object); } if (!is_object($object) && !is_array($object)) { $object_stats = strlen(serialize($object)); } else { $object_stats = []; foreach ($object as $elt => $value) { $value_memory = strlen(serialize($value)); $object_stats[$elt] = $value_memory; } ksort($object_stats); } $statistics["OBJECTS"][$object_name] = $object_stats; $result = true; } else { if (empty($record_file)) { return call_user_func_array([$object, $method], $arguments); } if (empty($call_name)) { $call_name = $method; } $method_stat = $statistics["METHODS"][$call_name] ?? ["CALL_COUNT" => 0, "TIME_SPENT" => 0, "MEMORY_NOT_RETURNED" => 0]; $before_memory = memory_get_usage(); $start_time = microtime(true); $result = call_user_func_array([$object, $method], $arguments); garbageCollect(); $after_memory = memory_get_usage(); $result_clone = variableClone($result); $delta_clone_memory = memory_get_usage() - $after_memory; unset($result_clone); garbageCollect(); $delta_memory = $after_memory - $delta_clone_memory - $before_memory; $method_stat["MEMORY_NOT_RETURNED"] += $delta_memory; $method_stat["TIME_SPENT"] = changeInMicrotime($start_time); $method_stat["CALL_COUNT"]++; $statistics["METHODS"][$call_name] = $method_stat; } if (time() - $statistics["LAST_UPDATE"] > C\ONE_MINUTE) { $statistics["LAST_UPDATE"] = time(); file_put_contents($record_file, print_r($statistics, true)); } return $result; } /** * Makes a deep copy of a variable regardless of its type * @param mixed $var variable to deep copy * @return mixed the deep copy */ function variableClone($var) { if (is_array($var)) { return array_map(C\NS_LIB . "variableClone", $var); } else if (is_object($var)) { return clone $var; } else { return $var; } } /** * Runs various system garbage collection functions and returns * number of bytes freed. * * @return int number of bytes freed */ function garbageCollect() { $bytes_collected = 0; if (function_exists("gc_mem_caches")) { $bytes_collected += gc_mem_caches(); } $bytes_collected += gc_collect_cycles(); return $bytes_collected; } /** * The dom method saveHTML has a tendency to replace UTF-8, non-ascii characters * with html entities. This is supposed to save avoiding the replacement. * What it does is to first save the dom, then it replaces htmlentities of the * form &single_char; or &#some_number; with the UTF-8 they correspond to. * It leaves all other entities as they are * * @param DOMDocument $dom * @return string output of saving html */ function utf8SafeSaveHtml(\DOMDocument $dom) { $out = $dom->saveHTML(); $out = preg_replace("/\&([a-zA-z][a-zA-z]+)\;/u", '-a-m-p-${1};', $out); $out = html_entity_decode($out, ENT_QUOTES | ENT_HTML5, 'UTF-8'); $out = preg_replace("/\-a\-m\-p\-([a-zA-z][a-zA-z]+)\;/u", '&${1};', $out); return $out; } /** * A UTF-8 safe version of PHP's wordwrap function that wraps a string to a * given number of characters * * @param string $string the input string * @param int $width the number of characters at which the string will be * wrapped * @param string $break string used to break a line into two * @param bool $cut whether to always force wrap at $width characters * even if word hasn't ended * @return string the given string wrapped at the specified length */ function utf8WordWrap($string, $width = 75, $break = "\n", $cut = false) { if($cut) { /* Match anything 1 to $width chars long followed by whitespace or EOS, otherwise match anything $width chars long */ $search = '/(.{1,'. $width .'})(?:\s|$)|(.{'.$width.'})/uS'; $replace = '$1$2' . $break; } else { /* Anchor the beginning of the pattern with a lookahead to avoid crazy backtracking when words are longer than $width */ $search = '/(?=\s)(.{1,' . $width . '})(?:\s|$)/uS'; $replace = '$1'. $break; } return preg_replace($search, $replace, $string); }