viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/configs/Config.php b/src/configs/Config.php index 30574ae45..ba199f6e6 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -437,6 +437,10 @@ nsconddefine('JOBS_DIR', SCHEDULES_DIR . "/jobs"); * Directory used by the web page clissfiers classes */ nsconddefine('CLASSIFIERS_DIR', WORK_DIRECTORY . "/classifiers"); +/** + * + */ +nsconddefine('OVERFLOW_THRESHOLD', -1); /** Captcha mode indicating to use a hash cash computation for a captcha*/ nsdefine('HASH_CAPTCHA', 2); diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php index b26d751dc..2912a317c 100644 --- a/src/library/IndexDictionary.php +++ b/src/library/IndexDictionary.php @@ -552,7 +552,7 @@ class IndexDictionary implements CrawlConstants * record should be negation of higher order bit of the given prefix * letter used by the tier file. * @return string a single record with merged strings making use of - * auxliary records as needed containing + * auxiliary records as needed containing * (generation, posting list offset, length) information. */ public function combineDictionaryRecord($record_a, $record_b, $prefix_bit) diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php index c8e2344ff..6098f55ce 100644 --- a/src/library/IndexDocumentBundle.php +++ b/src/library/IndexDocumentBundle.php @@ -66,7 +66,7 @@ class IndexDocumentBundle implements CrawlConstants * Subfolder of IndexDocumentBundle to store the btree with * term => posting list information (i.e., the inverted index) */ - const DICTIONARY_FOLDER = "dictionary"; + const DICTIONARY_FOLDER = "Dictionary"; /** * DocIds are made of three parts: hash of url, hash of document, hash * of url hostname. Each of these hashes is DOCID_PART_LEN long @@ -120,7 +120,7 @@ class IndexDocumentBundle implements CrawlConstants * .ix files which are used to store doc_id and the associated offsets to * their summary and actual document within the .txt.gz file */ - const DOCUMENTS_FOLDER = "documents"; + const DOCUMENTS_FOLDER = "Documents"; /** * Name of the last entries file used to help compute difference lists * for doc_map_index, and position list offsets used in postings for the @@ -153,25 +153,11 @@ class IndexDocumentBundle implements CrawlConstants * term. */ const POSTINGS_FILENAME = "postings"; - /** - * Temporary name for postings from a POSTINGS_FILENAME file while - * they are being compressed. - */ - const TEMP_POSTINGS_FILENAME = "temp_postings"; /** * How many bytes of posting to buffer before writing, when * addPartitionPostingsDictionary */ const POSTINGS_BUFFER_SIZE = 1000000; - /** - * Name of the folder used to hold position lists and document maps. Within - * this folder there is a subfolder for each partition which contains a - * doc_map file, postings file for the docs within the partition, - * position lists file for those postings, and a last_entries file - * used in the computation of difference list for doc_map_index and position - * list offsets, as well as number of occurrences of terms. - */ - const POSITIONS_DOC_MAP_FOLDER = "positions_doc_maps"; /** * Holds property value pairs concerning the configuration of the * current IndexDocumentBundle @@ -283,13 +269,13 @@ class IndexDocumentBundle implements CrawlConstants */ public function __construct($dir_name, $read_only_archive = true, $description = null, $num_docs_per_partition = - C\NUM_DOCS_PER_PARTITION, $max_keys = BPlusTree::MAX_KEYS) + C\NUM_DOCS_PER_PARTITION, $max_keys = BPlusTree::MAX_KEYS, + $overflow_threshold = C\OVERFLOW_THRESHOLD) { $this->dir_name = $dir_name; $is_dir = is_dir($this->dir_name); if (!$is_dir && !$read_only_archive) { mkdir($this->dir_name); - mkdir($this->dir_name . "/". self::POSITIONS_DOC_MAP_FOLDER); } else if (!$is_dir) { return false; } @@ -342,7 +328,8 @@ class IndexDocumentBundle implements CrawlConstants self::SUMMARY => "SERIAL", self::PAGE => "SERIAL"], $num_docs_per_partition, PartitionDocumentBundle::PARTITION_SIZE_THRESHOLD, - $record_compressor, $blob_compressor); + $record_compressor, $blob_compressor, + $overflow_threshold); if (!$read_only_archive) { $this->documents->index_cache_size = 1; } @@ -511,28 +498,23 @@ class IndexDocumentBundle implements CrawlConstants $start_time = microtime(true); $postings_string = $postings_tools->load($postings_filename, PackedTableTools::AS_STRING_MODE); - $temp_postings_filename = $base_folder . "/" . - self::TEMP_POSTINGS_FILENAME; - rename($postings_filename, $temp_postings_filename); $posting_files_len = strlen($postings_string); //add a marker for the end of the file as a string $key_len = $this->postings_tools->key_len; $this->last_entries = $last_entries_tools->load($last_entries_filename); $num_postings = substr_count($postings_string, "\xFF") + 1; $last_marker = 0; - $out_postings = ""; - $postings_offset = 0; - $fh = fopen($postings_filename, "w"); for ($i = 0; $i < $num_postings; $i++) { $cur_marker = strpos($postings_string, "\xFF", $last_marker); $diff = ($cur_marker === false) ? null : $cur_marker - $last_marker; $pre_row = substr($postings_string, $last_marker, $diff); + $postings_offset = $last_marker + $key_len; $last_marker = $cur_marker + 1; $term = substr($pre_row, 0, $key_len); - $row = decode255(substr($pre_row, $key_len)); - $postings_len = strlen($row); - $out_postings .= $row; + $encode_row = substr($pre_row, $key_len); + $postings_len = strlen($encode_row); + $row = decode255($encode_row); if (crawlTimeoutLog("..Indexer Still processing partition ". "$partition. Have completed $i postings of $num_postings.") && $taking_too_long_touch) { @@ -554,25 +536,10 @@ class IndexDocumentBundle implements CrawlConstants "NUM_OCCURRENCES" => $num_occurrences_term, "POSTINGS_OFFSET" => $postings_offset, "POSTINGS_LEN" => $postings_len]); - $postings_offset += $postings_len; - if (strlen($out_postings) > self::POSTINGS_BUFFER_SIZE) { - fwrite($fh, $out_postings); - $out_postings = ""; - } } $dictionary->flushLastPutNode(); - fwrite($fh, $out_postings); - fclose($fh); - unlink($temp_postings_filename); crawlLog("...Finished Adding Partition Posting Info to " . "Dictionary: " . changeInMicrotime($start_time)); - if (!C\nsdefined("KEEP_PARTITION_CALCULATIONS") || - !C\KEEP_PARTITION_CALCULATIONS) { - if (file_exists($last_entries_filename)) { - unlink($last_entries_filename); - } - crawlLog("..Done deleting partition posting calculations."); - } } /** * Gets the file path corresponding to the partition with index $partition @@ -584,9 +551,7 @@ class IndexDocumentBundle implements CrawlConstants */ public function getPartitionBaseFolder($partition) { - $base_folder = $this->dir_name . "/" . self::POSITIONS_DOC_MAP_FOLDER - . "/$partition"; - return $base_folder; + return $this->documents->getPartitionFolder($partition); } /** * Given the $doc_id of a document and a $partition to look for it in @@ -639,13 +604,6 @@ class IndexDocumentBundle implements CrawlConstants crawlLog( "Indexer Building index inverted index for partition $partition"); $base_folder = $this->getPartitionBaseFolder($partition); - if (!file_exists($base_folder)) { - if (!file_exists($this->dir_name . "/". - self::POSITIONS_DOC_MAP_FOLDER)) { - mkdir($this->dir_name . "/". self::POSITIONS_DOC_MAP_FOLDER); - } - mkdir($base_folder); - } /* set up $doc_map_filename, $postings_filename, $postings_filename, $positions_filename, etc */ @@ -1457,7 +1415,7 @@ class IndexDocumentBundle implements CrawlConstants * @param int $threshold after the number of results exceeds this amount * stop looking for more dictionary entries. * @param int $offset - * @param int $num_partitions + * @param int $num_partitions * @param bool $with_remaining_total whether to total number of * postings found as well or not * @return array either [total, sequence of four tuples] diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php index 084ffb70a..27cbd0a95 100644 --- a/src/library/PartitionDocumentBundle.php +++ b/src/library/PartitionDocumentBundle.php @@ -64,18 +64,13 @@ class PartitionDocumentBundle * Default parameters to use when constructing a PartitionDocumentBundle */ const DEFAULT_PARAMETERS = ["RECORD_COMPRESSOR" => self::DEFAULT_COMPRESSOR, - "BLOB_COMPRESSOR" => self::DEFAULT_COMPRESSOR, - "COUNT" => 0, "PARTITION_SIZE_THRESHOLD" => - self::PARTITION_SIZE_THRESHOLD, + "BLOB_COMPRESSOR" => self::DEFAULT_COMPRESSOR, "COUNT" => 0, + "OVERFLOW_THRESHOLD" => C\OVERFLOW_THRESHOLD, + "PARTITION_SIZE_THRESHOLD" => self::PARTITION_SIZE_THRESHOLD, "FORMAT" => ["PRIMARY KEY" => "KEY", "VALUE" => "BLOB"], "MAX_ITEMS_PER_FILE" => self::MAX_ITEMS_PER_FILE, "SAVE_PARTITION" => 0, "ACTIVE_COUNT" => 0 ]; - /** - * Extension for PartitionDocumentBundle partition files used to contain - * records - */ - const INDEX_EXTENSION = ".ix"; /** * Default maximum number of records to store in a partition */ @@ -85,16 +80,37 @@ class PartitionDocumentBundle * PartitionDocumentBundle */ const PARAMETERS_FILE = "pdb_parameters.txt"; + /** + * Prefix to block folders of PartitionDocumentBundle partition files + */ + const BLOCK_PREFIX = "B"; + /** + * Number of partition files to store in a block folder before making + * another one + */ + const BLOCK_FACTOR = 10000; + /** + * + */ + const ARCHIVE_FILENAME = "archive"; + /** + * + */ + const INDEX_FILENAME = "index"; /** * Prefix to file names of PartitionDocumentBundle partition files */ - const PARTITION_PREFIX = "partition_"; + const PARTITION_PREFIX = "P"; /** * Maximum number of bytes a partition can have before the next partition * is started. Notice this implies a maximum file size to store * in BLOB columns */ const PARTITION_SIZE_THRESHOLD = 2147483648; + /** + * + */ + const OVERFLOW_DIR_FOLDER = "Overflow"; /** * Used to store the file handle to, the partition number, and last add time * for the last time an item's blob/serial columns were added to for @@ -200,7 +216,8 @@ class PartitionDocumentBundle $max_items_per_file = self::MAX_ITEMS_PER_FILE, $partition_size_threshold = self::PARTITION_SIZE_THRESHOLD, $record_compressor_type = self::DEFAULT_COMPRESSOR, - $blob_compressor_type = self::DEFAULT_COMPRESSOR) + $blob_compressor_type = self::DEFAULT_COMPRESSOR, + $overflow_threshold = C\OVERFLOW_DIR_THRESHOLD) { $initial_parameters = self::DEFAULT_PARAMETERS; $initial_parameters["PARTITION_SIZE_THRESHOLD"] = @@ -208,6 +225,7 @@ class PartitionDocumentBundle $initial_parameters["MAX_ITEMS_PER_FILE"] = $max_items_per_file; $initial_parameters["RECORD_COMPRESSOR"] = $record_compressor_type; $initial_parameters["BLOB_COMPRESSOR"] = $blob_compressor_type; + $initial_parameters["OVERFLOW_THRESHOLD"] = $overflow_threshold; $this->record_compressor = new $record_compressor_type(); $this->blob_compressor = new $blob_compressor_type(); $initial_parameters["FORMAT"] = $format; @@ -216,11 +234,15 @@ class PartitionDocumentBundle ini_get('memory_limit'))/128000000)); $this->folder = $folder; $folder_paths = [$folder]; + if ($overflow_threshold > 0) { + $folder_paths[] = $folder . "/" . self::OVERFLOW_DIR_FOLDER; + } $changed_parameters = false; foreach ($folder_paths as $folder_path) { if (!file_exists($folder_path)) { $changed_parameters = true; if (!mkdir($folder_path)) { + chmod($folder_path, 0777); return null; } } @@ -383,6 +405,25 @@ class PartitionDocumentBundle $previous_instance_time]; return $value; } + /** + * + */ + public function getPartitionBlock($i) + { + $threshold = $this->parameters["OVERFLOW_THRESHOLD"]; + $folder = ($threshold > 0 && $i > $threshold) ? + $this->folder . "/" . self::OVERFLOW_DIR_FOLDER : $this->folder; + $block = sprintf("%'.05d", floor($i / self::BLOCK_FACTOR)); + return $folder . "/". self::BLOCK_PREFIX . "$block"; + } + /** + * + */ + public function getPartitionFolder($i) + { + return $this->getPartitionBlock($i) . "/" . self::PARTITION_PREFIX . + sprintf("%'.010d", $i); + } /** * Returns the path to the archive file (used to store BLOB and SERIAL * columns) for the $i partition in this PartitionDocumentBundle @@ -392,8 +433,8 @@ class PartitionDocumentBundle */ public function getPartition($i) { - return $this->folder . "/" . self::PARTITION_PREFIX . - $i . $this->blob_compressor->fileExtension(); + return $this->getPartitionFolder($i) . "/" . self::ARCHIVE_FILENAME . + $this->blob_compressor->fileExtension(); } /** * Returns the path to the index file (used to store all columns @@ -405,8 +446,7 @@ class PartitionDocumentBundle */ public function getPartitionIndex($i) { - return $this->folder . "/" . self::PARTITION_PREFIX . - $i . self::INDEX_EXTENSION; + return $this->getPartitionFolder($i) . "/" . self::INDEX_FILENAME; } /** * Returns the unserialized index file for the $partition partition of @@ -488,6 +528,18 @@ class PartitionDocumentBundle // remove $save_partition from read cache unset($this->index_cache[$save_partition]); $save_partition_name = $this->getPartition($save_partition); + if (!file_exists($save_partition_name)) { + $save_block_name = $this->getPartitionBlock($save_partition); + if (!file_exists($save_block_name)) { + mkdir($save_block_name); + chmod($save_block_name, 0777); + } + $save_folder_name = $this->getPartitionFolder($save_partition); + if (!file_exists($save_folder_name)) { + mkdir($save_folder_name); + chmod($save_folder_name, 0777); + } + } clearstatcache(); $save_partition_len = file_exists($save_partition_name) ? filesize($save_partition_name) : 0; @@ -598,6 +650,20 @@ class PartitionDocumentBundle if (file_exists($new_save_index_name)) { unlink($new_save_index_name); } + $new_save_file_name = $this->getPartition($new_save_partition); + if (file_exists($new_save_file_name)) { + unlink($new_save_file_name); + } + $new_save_block_name = $this->getPartitionBlock($new_save_partition); + if (!file_exists($new_save_block_name)) { + mkdir($new_save_block_name); + chmod($new_save_block_name, 0777); + } + $new_save_folder_name = $this->getPartitionFolder($new_save_partition); + if (!file_exists($new_save_folder_name)) { + mkdir($new_save_folder_name); + chmod($new_save_folder_name, 0777); + } $this->parameters["SAVE_PARTITION"] = $new_save_partition; $this->parameters['COUNT'] += $this->parameters['ACTIVE_COUNT']; $this->parameters['ACTIVE_COUNT'] = 0; @@ -657,8 +723,26 @@ class PartitionDocumentBundle } if (empty($parameters['SAVE_PARTITION']) || $parameters['SAVE_PARTITION'] == 0) { - $parameters['SAVE_PARTITION'] = - max(count(glob("$folder/*" . self::INDEX_EXTENSION))-1, 0); + $block_folders = glob("$folder/" . self::BLOCK_PREFIX . "*"); + $parameters['SAVE_PARTITION'] = 0; + if (!empty($block_folders)) { + $overflow_block_folders = glob("$folder/" . + self::OVERFLOW_DIR_FOLDER . "/" . + self::BLOCK_PREFIX . "*"); + if (!empty($overflow_block_folders) && + count($overflow_block_folders) > 0) { + $block_folders = $overflow_block_folders; + } + sort($block_folders); + $last_folder = $block_folders[count($block_folders) - 1]; + $partition_path = "$last_folder/" . self::PARTITION_PREFIX; + $partition_folders = glob("$partition_path*"); + sort($partition_folders); + $last_index_file = $partition_folders[ + count($partition_folders) - 1]; + $parameters['SAVE_PARTITION'] = intval( + substr($last_index_file, strlen($partition_path))); + } } return $parameters; } else { diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php index 73137e923..364858fe9 100644 --- a/src/library/StochasticTermSegmenter.php +++ b/src/library/StochasticTermSegmenter.php @@ -545,4 +545,4 @@ class StochasticTermSegmenter // Set end of term marker $sub_trie['$'] = $frequency; } -} \ No newline at end of file +} diff --git a/src/models/GroupModel.php b/src/models/GroupModel.php index 5b3762f61..54f2460c0 100644 --- a/src/models/GroupModel.php +++ b/src/models/GroupModel.php @@ -4537,9 +4537,10 @@ EOD; if (is_writable($pre_resource)) { $resource['is_writable'] = true; } - if (in_array($name . ".jpg", $thumbs)) { + if (in_array($name . ".jpg", $thumbs) || + in_array($name . ".webp", $thumbs)) { $resource['has_thumb'] = true; - if (in_array($name.".gif", $thumbs)) { + if (in_array($name . ".gif", $thumbs)) { $resource['has_animated_thumb'] = true; } } else if ($thumb_folder && !$resource['is_dir'] && diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php index 5716120d4..c1eedca6b 100644 --- a/tests/IndexDocumentBundleTest.php +++ b/tests/IndexDocumentBundleTest.php @@ -320,6 +320,7 @@ use seekquarry\yioop\library\UnitTest; } $this->assertEqual($sum + count($active_postings), $num_docs, "Term 'be' occurs in correct number of documents"); + $key_len = $posting_tools->key_len; for ($i = 0; $i < 2; $i++) { $partition = $term_row[$i]['PARTITION']; $partition_folder = $this->index_archive->getPartitionBaseFolder(