viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/library/FeedArchiveBundle.php |
diff --git a/src/library/FeedArchiveBundle.php b/src/library/FeedArchiveBundle.php new file mode 100644 index 000000000..93ae1f5b1 --- /dev/null +++ b/src/library/FeedArchiveBundle.php @@ -0,0 +1,158 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @license https://www.gnu.org/licenses/ GPL3 + * @link https://www.seekquarry.com/ + * @copyright 2009 - 2019 + * @filesource + */ +namespace seekquarry\yioop\library; + +use seekquarry\yioop\configs as C; + +/** + * Used for crawlLog, crawlHash, and garbageCollect + */ +require_once __DIR__ . 
'/Utility.php';
/**
 * An IndexArchiveBundle that additionally keeps Bloom filters of the keys
 * of the items added through addPagesAndSeenKeys(), so that contains() can
 * be used to check whether a key has already been added.
 *
 * Two filters are rotated: filter_a is the one queried; once its count
 * exceeds half of C\URL_FILTER_SIZE a second filter, filter_b, starts
 * receiving the same keys, and once filter_a's count exceeds
 * C\URL_FILTER_SIZE filter_b takes filter_a's place.
 *
 * @author Chris Pollett
 */
class FeedArchiveBundle extends IndexArchiveBundle
{
    /**
     * Filter that every key is added to and that contains() queries.
     * Stored in filter_a.ftr in the bundle folder.
     *
     * @var BloomFilterFile
     */
    public $filter_a;
    /**
     * Filter that receives keys alongside filter_a once filter_a is more
     * than half of C\URL_FILTER_SIZE; null when not in use. Stored in
     * filter_b.ftr in the bundle folder.
     *
     * @var BloomFilterFile
     */
    public $filter_b;

    /**
     * Makes or initializes an FeedArchiveBundle with the provided parameters
     *
     * @param string $dir_name folder name to store this bundle
     * @param bool $read_only_archive whether to open archive only for reading
     *  or reading and writing
     * @param string $description a text name/serialized info about this
     *  IndexArchiveBundle
     * @param int $num_docs_per_generation the number of pages to be stored
     *  in a single shard
     */
    public function __construct($dir_name, $read_only_archive = true,
        $description = null, $num_docs_per_generation =
        C\NUM_DOCS_PER_GENERATION)
    {
        parent::__construct($dir_name, $read_only_archive, $description,
            $num_docs_per_generation);
        $path_a = $dir_name . "/filter_a.ftr";
        $path_b = $dir_name . "/filter_b.ftr";
        if (file_exists($path_a)) {
            $this->filter_a = BloomFilterFile::load($path_a);
        } else {
            $this->filter_a = new BloomFilterFile($path_a,
                C\URL_FILTER_SIZE);
            chmod($path_a, 0755);
        }
        // filter_b is only restored if a previous run persisted it; it
        // must go into filter_b, not overwrite filter_a
        if (file_exists($path_b)) {
            $this->filter_b = BloomFilterFile::load($path_b);
        } else {
            $this->filter_b = null;
        }
    }
    /**
     * Add the array of $pages to the summaries WebArchiveBundle pages being
     * stored in the partition $generation and the field used
     * to store the resulting offsets given by $offset_field. The value of
     * each page's $key_field is also recorded in the bundle's filters.
     *
     * @param int $generation field used to select partition
     * @param string $offset_field field used to record offsets after storing
     * @param string $key_field field used to store a unique identifier for
     *  each page item.
     * @param array& $pages data to store
     * @param int $visited_urls_count number to add to the count of visited urls
     *  (visited urls is a smaller number than the total count of objects
     *  stored in the index).
     */
    public function addPagesAndSeenKeys($generation, $offset_field, $key_field,
        &$pages, $visited_urls_count)
    {
        foreach ($pages as $page) {
            $key = $page[$key_field];
            $this->addFilters($key);
        }
        parent::addPages($generation, $offset_field, $pages,
            $visited_urls_count);
    }
    /**
     * Records $key in filter_a and, when it is active, in filter_b.
     *
     * Before adding, filter_b is loaded or created once filter_a's count
     * exceeds half of C\URL_FILTER_SIZE, and filter_b is promoted to be
     * the new filter_a (on disk and in memory) once filter_a's count
     * exceeds C\URL_FILTER_SIZE.
     *
     * @param string $key identifier to record as seen
     */
    public function addFilters($key)
    {
        $path_a = $this->dir_name . "/filter_a.ftr";
        $path_b = $this->dir_name . "/filter_b.ftr";
        if ($this->filter_a->count > C\URL_FILTER_SIZE / 2 &&
            !$this->filter_b) {
            if (file_exists($path_b)) {
                $this->filter_b = BloomFilterFile::load($path_b);
            } else {
                $this->filter_b = new BloomFilterFile($path_b,
                    C\URL_FILTER_SIZE);
                // the new filter may not have been written to disk yet
                if (file_exists($path_b)) {
                    chmod($path_b, 0755);
                }
            }
        }
        if ($this->filter_a->count > C\URL_FILTER_SIZE && $this->filter_b) {
            // only drop the old file when there is a file to replace it
            if (file_exists($path_b)) {
                if (file_exists($path_a)) {
                    unlink($path_a);
                }
                rename($path_b, $path_a);
            }
            // keep the in-memory objects in step with the files; otherwise
            // filter_a stays over the limit and rotation repeats every call
            $this->filter_a = $this->filter_b;
            // NOTE(review): assumes BloomFilterFile keeps the path that
            // save() writes to in a public `filename` property -- confirm
            $this->filter_a->filename = $path_a;
            $this->filter_b = null;
        }
        $this->filter_a->add($key);
        if ($this->filter_b) {
            $this->filter_b->add($key);
        }
    }
    /**
     * Checks whether $key is reported as present by filter_a
     *
     * @param string $key identifier to look up
     * @return mixed whatever filter_a's contains() returns for $key
     */
    public function contains($key)
    {
        return $this->filter_a->contains($key);
    }
    /**
     * Forces the current shard to be saved, then saves filter_a and, if it
     * is in use, filter_b, setting each filter file's mode to 0755
     */
    public function forceSave()
    {
        $this->getActiveShard()->save(false, true);
        $this->filter_a->save();
        chmod($this->dir_name . "/filter_a.ftr", 0755);
        if ($this->filter_b) {
            $this->filter_b->save();
            chmod($this->dir_name . "/filter_b.ftr", 0755);
        }
    }
}