viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Add FeedArchiveBundle, a=chris

Chris Pollett [2019-12-18 01:Dec:th]
Add FeedArchiveBundle, a=chris
Filename
src/library/FeedArchiveBundle.php
diff --git a/src/library/FeedArchiveBundle.php b/src/library/FeedArchiveBundle.php
new file mode 100644
index 000000000..93ae1f5b1
--- /dev/null
+++ b/src/library/FeedArchiveBundle.php
@@ -0,0 +1,158 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2019
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * Used for crawlLog, crawlHash, and garbageCollect
+ */
+require_once __DIR__ . '/Utility.php';
+/**
+ * An IndexArchiveBundle that additionally records the key of every item
+ * added through addPagesAndSeenKeys() in Bloom filters stored alongside the
+ * bundle, so that contains() can report keys that were added before.
+ *
+ * Two filters are rotated: filter_a answers lookups; once it holds more
+ * than C\URL_FILTER_SIZE/2 items filter_b is started and receives the same
+ * keys; once filter_a holds more than C\URL_FILTER_SIZE items filter_b
+ * takes its place, which drops the keys that only the old filter_a knew.
+ *
+ * @author Chris Pollett
+ */
+class FeedArchiveBundle extends IndexArchiveBundle
+{
+    /**
+     * Filter consulted by contains(); every added key is recorded in it
+     * @var BloomFilterFile
+     */
+    public $filter_a;
+    /**
+     * Younger filter that eventually replaces filter_a; null while
+     * filter_a is at most half full
+     * @var BloomFilterFile
+     */
+    public $filter_b;
+
+    /**
+     * Makes or initializes a FeedArchiveBundle with the provided parameters.
+     * filter_a is loaded from filter_a.ftr in $dir_name if that file exists
+     * and created otherwise; filter_b is loaded from filter_b.ftr only if
+     * that file exists and is null otherwise.
+     *
+     * @param string $dir_name folder name to store this bundle
+     * @param bool $read_only_archive whether to open archive only for reading
+     *  or reading and writing
+     * @param string $description a text name/serialized info about this
+     *      IndexArchiveBundle
+     * @param int $num_docs_per_generation the number of pages to be stored
+     *      in a single shard
+     */
+    public function __construct($dir_name, $read_only_archive = true,
+        $description = null, $num_docs_per_generation =
+        C\NUM_DOCS_PER_GENERATION)
+    {
+        parent::__construct($dir_name, $read_only_archive, $description,
+            $num_docs_per_generation);
+        $path_a = $dir_name . "/filter_a.ftr";
+        $path_b = $dir_name . "/filter_b.ftr";
+        if (file_exists($path_a)) {
+            $this->filter_a = BloomFilterFile::load($path_a);
+        } else {
+            $this->filter_a = new BloomFilterFile($path_a, C\URL_FILTER_SIZE);
+            chmod($path_a, 0755);
+        }
+        // A filter_b file is only present if a rotation was already under
+        // way when the bundle was last saved; it must not replace filter_a
+        if (file_exists($path_b)) {
+            $this->filter_b = BloomFilterFile::load($path_b);
+        } else {
+            $this->filter_b = null;
+        }
+    }
+    /**
+     * Records the $key_field value of each of $pages in the Bloom filters
+     * and then adds $pages to the bundle using the parent's addPages(), with
+     * the partition given by $generation and the resulting offsets stored in
+     * the field $offset_field.
+     *
+     * @param int $generation field used to select partition
+     * @param string $offset_field field used to record offsets after storing
+     * @param string $key_field field used to store unique identifier for
+     *      each page item.
+     * @param array& $pages data to store
+     * @param int $visited_urls_count number to add to the count of visited urls
+     *     (visited urls is a smaller number than the total count of objects
+     *     stored in the index).
+     */
+    public function addPagesAndSeenKeys($generation, $offset_field, $key_field,
+        &$pages, $visited_urls_count)
+    {
+        foreach ($pages as $page) {
+            // a page without the key field has nothing to remember as seen
+            if (isset($page[$key_field])) {
+                $this->addFilters($page[$key_field]);
+            }
+        }
+        parent::addPages($generation, $offset_field, $pages,
+            $visited_urls_count);
+    }
+    /**
+     * Records $key in filter_a and, when it exists, in filter_b. Before
+     * doing so it starts filter_b if filter_a is more than half full, and
+     * promotes filter_b to be the new filter_a if filter_a has more than
+     * C\URL_FILTER_SIZE items.
+     *
+     * @param string $key identifier of an item to remember as seen
+     */
+    public function addFilters($key)
+    {
+        $path_a = $this->dir_name . "/filter_a.ftr";
+        $path_b = $this->dir_name . "/filter_b.ftr";
+        if (!$this->filter_b &&
+            $this->filter_a->count > C\URL_FILTER_SIZE/2) {
+            if (file_exists($path_b)) {
+                $this->filter_b = BloomFilterFile::load($path_b);
+            } else {
+                $this->filter_b = new BloomFilterFile($path_b,
+                    C\URL_FILTER_SIZE);
+                chmod($path_b, 0755);
+            }
+        }
+        if ($this->filter_b && $this->filter_a->count > C\URL_FILTER_SIZE) {
+            // Rotate on disk ...
+            if (file_exists($path_a)) {
+                unlink($path_a);
+            }
+            if (file_exists($path_b)) {
+                rename($path_b, $path_a);
+            }
+            // ... and in memory too: otherwise the full filter_a would stay
+            // in use and this branch would rerun on every subsequent add
+            $this->filter_a = $this->filter_b;
+            $this->filter_b = null;
+            // NOTE(review): assumes BloomFilterFile keeps the path it saves
+            // to in a public $filename property -- confirm against that
+            // class so later save() calls write to filter_a.ftr
+            $this->filter_a->filename = $path_a;
+        }
+        $this->filter_a->add($key);
+        if ($this->filter_b) {
+            $this->filter_b->add($key);
+        }
+    }
+    /**
+     * Checks whether $key has been recorded in filter_a
+     *
+     * @param string $key identifier to look up
+     * @return bool whether filter_a reports $key as present
+     */
+    public function contains($key)
+    {
+        return $this->filter_a->contains($key);
+    }
+    /**
+     * Forces the current shard to be saved, together with filter_a and,
+     * if it exists, filter_b; the filter files are then made mode 0755
+     */
+    public function forceSave()
+    {
+        $this->getActiveShard()->save(false, true);
+        $this->filter_a->save();
+        chmod($this->dir_name . "/filter_a.ftr", 0755);
+        if ($this->filter_b) {
+            $this->filter_b->save();
+            chmod($this->dir_name . "/filter_b.ftr", 0755);
+        }
+    }
+}
ViewGit