viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Another pass at next gen feeds, a=chris

Chris Pollett [2019-12-17 22:Dec:th]
Another pass at next gen feeds, a=chris
Filename
src/controllers/components/CrawlComponent.php
src/library/BloomFilterFile.php
src/library/IndexArchiveBundle.php
src/library/IndexManager.php
src/library/IndexShard.php
src/library/PersistentStructure.php
src/library/index_bundle_iterators/DocIterator.php
src/library/index_bundle_iterators/NegationIterator.php
src/library/media_jobs/FeedsUpdateJob.php
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 0ecc9e333..9e9ef8b80 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -2765,7 +2765,7 @@ class CrawlComponent extends Component implements CrawlConstants
                         $feeds_update_job = new M\FeedsUpdateJob();
                         $feeds_update_job->parseFeedAuxInfo($source);
                         $data['FEED_TEST_RESULTS'] =
-                            $feeds_update_job->updateFeedItemsOneGo([$source],
+                            $feeds_update_job->updateFoundItemsOneGo([$source],
                             C\ONE_WEEK, true);
                     } else if (in_array($source['TYPE'], ['feed_podcast',
                         'scrape_podcast'])) {
diff --git a/src/library/BloomFilterFile.php b/src/library/BloomFilterFile.php
index 50843d65d..71d91df37 100755
--- a/src/library/BloomFilterFile.php
+++ b/src/library/BloomFilterFile.php
@@ -51,6 +51,11 @@ class BloomFilterFile extends PersistentStructure
      * @var int
      */
     public $num_keys;
+    /**
+     * Number of items currently stored in this filter
+     * @var int
+     */
+    public $count;
     /**
      * Size in bits of the packed string array used to store the filter's
      * contents
@@ -82,6 +87,7 @@ class BloomFilterFile extends PersistentStructure
          */
         $this->num_keys = ceil(log($num_values)/$log2);
         $this->filter_size = ceil( ($this->num_keys) * $num_values/$log2_sq);
+        $this->count = 0;
         $mem_before =  memory_get_usage(true);
         $this->filter = pack("x". ceil(0.125 * $this->filter_size));
             // 1/8 =.125 = num bits/bytes, want to make things floats
@@ -100,6 +106,7 @@ class BloomFilterFile extends PersistentStructure
         for ($i = 0;  $i < $num_keys; $i++) {
             $this->setBit($pos_array[$i]);
         }
+        $this->count++;
         $this->checkSave();
     }
     /**
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index 7465f6d88..e5afb8a8b 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -148,6 +148,7 @@ class IndexArchiveBundle implements CrawlConstants
                 file_get_contents($this->dir_name . "/generation.txt"));
         } else if (!$read_only_archive) {
             $this->generation_info['ACTIVE'] = 0;
+            $this->generation_info['LAST_DICTIONARY_SHARD'] = -1;
             file_put_contents($this->dir_name . "/generation.txt",
                 serialize($this->generation_info));
         }
@@ -287,6 +288,8 @@ class IndexArchiveBundle implements CrawlConstants
             $current_index_shard_file, $this->generation_info['ACTIVE'],
                 $this->num_docs_per_generation, true);
         $this->dictionary->addShardDictionary($this->current_shard, $callback);
+        $this->generation_info['LAST_DICTIONARY_SHARD'] =
+            $this->generation_info['ACTIVE'];
     }
     /**
      * Sets the current shard to be the active shard (the active shard is
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index 15a1dd96e..a559a538c 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -196,13 +196,42 @@ class IndexManager implements CrawlConstants
         $with_remaining_total = false)
     {
         $index = self::getIndex($index_name);
-        $tmp = [];
+        $added_active = false;
+        $pre_info = [];
         if (!empty($index->dictionary)) {
             $pre_info =
                 $index->dictionary->getWordInfo($hash, true, $shift,
                 $threshold, $start_generation,
                 $num_distinct_generations, true);
         }
+        $last_desired_generation = $start_generation +
+            $num_distinct_generations;
+        if (!empty($index->generation_info)) {
+            $active_generation = $index->generation_info['ACTIVE'];
+            if ((empty($index->generation_info['LAST_DICTIONARY_SHARD']) ||
+                $index->generation_info['LAST_DICTIONARY_SHARD'] <
+                $active_generation) && $active_generation <
+                $last_desired_generation) {
+                $active_shard_file = $index->dir_name .
+                    "/posting_doc_shards/index" . $active_generation;
+                if (file_exists($active_shard_file)) {
+                    $active_shard = new IndexShard($active_shard_file, 0,
+                        C\NUM_DOCS_PER_GENERATION, true);
+                    $active_info = $active_shard->getWordInfo($hash, true,
+                        $shift);
+                    if (is_array($active_info)) {
+                        if (empty($pre_info)) {
+                            $pre_info[0] = 0;
+                            $pre_info[1] = [];
+                        }
+                        $pre_info[1][] = [$active_generation,
+                            $active_info[0], $active_info[1], $active_info[2],
+                            $active_info[3]];
+                        $pre_info[0] += $active_info[2];
+                    }
+                }
+            }
+        }
         if (!empty($pre_info[1])) {
             list($total, $info) = $pre_info;
         } else {
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index 70ed6302b..65cba7c81 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -596,7 +596,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $old_check_loc = $check_loc;
             $word_string = $this->getWordString($is_disk, $start, $check_loc,
                 $word_item_len);
-            if ($word_string == false) {return false;}
+            if ($word_string == false) {
+                return false;
+            }
             $id = substr($word_string, 0, $word_key_len);
             $cmp = compareWordHashes($word_id, $id, $shift);
             if ($cmp === 0) {
diff --git a/src/library/PersistentStructure.php b/src/library/PersistentStructure.php
index 41c8e6df2..b9960b3c2 100755
--- a/src/library/PersistentStructure.php
+++ b/src/library/PersistentStructure.php
@@ -85,7 +85,7 @@ class PersistentStructure
     public static function load($fname)
     {
         /* code to handle the fact that name space of object may not be the
-            modern nameepace
+            modern namespace
          */
         $obj_string = file_get_contents($fname);
         $name_length = intval(substr($obj_string, 2, 14));
diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php
index 321f313b8..052e33dfb 100755
--- a/src/library/index_bundle_iterators/DocIterator.php
+++ b/src/library/index_bundle_iterators/DocIterator.php
@@ -59,7 +59,7 @@ class DocIterator extends IndexBundleIterator
      */
     public $next_offset;
     /**
-     * Last Offset of a doc occurence in the IndexShard
+     * Last offset of a doc occurrence in the IndexShard
      * @var int
      */
     public $last_offset;
@@ -90,7 +90,7 @@ class DocIterator extends IndexBundleIterator
     public $filter;
     /** Host Key position + 1 (first char says doc, inlink or eternal link)*/
     const HOST_KEY_POS = 17;
-    /** Length of a doc key*/
+    /** Length of a doc key */
     const KEY_LEN = 8;
     /**
      * Creates a word iterator with the given parameters.
@@ -122,12 +122,15 @@ class DocIterator extends IndexBundleIterator
      */
     public function reset()
     {
-        $this->current_generation = 0;
+        $is_ascending = ($this->direction == self::ASCENDING);
+        $this->current_generation = ($is_ascending) ? 0 :
+            $this->num_generations - 1;
+        $this->getShardInfo($this->current_generation);
         $this->count_block = 0;
         $this->seen_docs = 0;
-        $this->current_offset = 0;
-        $this->next_offset = 0;
-        $this->getShardInfo($this->current_generation);
+        $this->current_offset = ($is_ascending) ? 0 :
+            $this->getPreviousDocOffset($this->last_offset);
+        $this->next_offset = $this->current_offset;
     }
     /**
      * Mainly used to get the last_offset in shard $generation of the
@@ -143,8 +146,7 @@ class DocIterator extends IndexBundleIterator
         if (isset($this->shard_lens[$generation])) {
             $this->last_offset = $this->shard_lens[$generation];
         } else {
-            $index = IndexManager::getIndex($this->index_name,
-                $this->direction);
+            $index = IndexManager::getIndex($this->index_name);
             $index->setCurrentShard($generation, true);
             $shard = $index->getCurrentShard();
             $this->last_offset = $shard->docids_len;
@@ -160,14 +162,18 @@ class DocIterator extends IndexBundleIterator
      */
     public function findDocsWithWord()
     {
-        if (($this->current_generation >= $this->num_generations)
+        $is_ascending = ($this->direction == self::ASCENDING);
+        if (($is_ascending &&
+            ($this->current_generation >= $this->num_generations)
             || ($this->current_generation == $this->num_generations - 1 &&
-            $this->current_offset > $this->last_offset)) {
+            $this->current_offset > $this->last_offset)) ||
+            !$is_ascending &&  ($this->current_generation < 0) ||
+            ($this->current_generation == 0 && $this->current_offset < 0)) {
             return -1;
         }
         $pre_results = [];
         $this->next_offset = $this->current_offset;
-        $index = IndexManager::getIndex($this->index_name, $this->direction);
+        $index = IndexManager::getIndex($this->index_name);
         $index->setCurrentShard($this->current_generation, true);
         //the next call also updates next offset
         $shard = $index->getCurrentShard();
@@ -177,16 +183,22 @@ class DocIterator extends IndexBundleIterator
         $pre_results = [];
         $num_docs_so_far = 0;
         do {
-            if ($this->next_offset >= $this->last_offset) {
+            if (($is_ascending && $this->next_offset >= $this->last_offset)
+                || (!$is_ascending && $this->next_offset < 0)) {
                 break;
             }
             $posting = L\packPosting($this->next_offset >> 4, [1]);
             list($doc_id, $num_keys, $item) =
-                $shard->makeItem($posting, $num_docs_or_links);
-            if ($num_keys % 2 == 0) {
-                $num_keys++;
+                $shard->makeItem($posting, $num_docs_or_links,
+                    $this->direction);
+            if ($is_ascending) {
+                if ($num_keys % 2 == 0) {
+                    $num_keys++;
+                }
+                $this->next_offset += ($num_keys + 1) * $doc_key_len;
+            } else {
+                $this->next_offset = $this->getPreviousDocOffset($next_offset);
             }
-            $this->next_offset += ($num_keys + 1) * $doc_key_len;
             $pre_results[$doc_id] = $item;
             $num_docs_so_far++;
         } while ($num_docs_so_far <  $this->results_per_block);
@@ -212,17 +224,34 @@ class DocIterator extends IndexBundleIterator
         $this->pages = $results;
         return $results;
     }
+    /**
+     * Offset of the doc item just before $doc_offset (fixed-size items assumed)
+     */
+    public function getPreviousDocOffset($doc_offset)
+    {
+        $doc_item_len = 4 * IndexShard::DOC_KEY_LEN;
+        // NOTE: not correct in general; only works if no additional doc keys
+        return $doc_offset - $doc_item_len;
+    }
     /**
      * Updates the seen_docs count during an advance() call
      */
     public function advanceSeenDocs()
     {
         if ($this->current_block_fresh != true) {
+            $is_ascending = ($this->direction == self::ASCENDING);
             $doc_item_len = 4 * IndexShard::DOC_KEY_LEN;
+            $pre_num_docs = ($is_ascending) ?
+                ($this->last_offset - $this->next_offset) / $doc_item_len :
+                $this->next_offset/$doc_item_len;
             $num_docs = min($this->results_per_block,
-                ($this->last_offset - $this->next_offset) / $doc_item_len);
+                );
             $this->next_offset = $this->current_offset;
-            $this->next_offset += $doc_item_len * $num_docs;
+            if ($is_ascending) {
+                $this->next_offset += $doc_item_len * $num_docs;
+            } else {
+                $this->next_offset -= $doc_item_len * $num_docs;
+            }
             if ($num_docs < 0) {
                 return;
             }
@@ -242,32 +271,40 @@ class DocIterator extends IndexBundleIterator
     public function advance($gen_doc_offset = null)
     {
         $this->advanceSeenDocs();
-        if ($this->current_offset < $this->next_offset) {
+        if (($is_ascending && $this->current_offset < $this->next_offset) ||
+            (!$is_ascending && $this->current_offset > $this->next_offset)) {
             $this->current_offset = $this->next_offset;
         } else {
             $this->advanceGeneration();
             $this->next_offset = $this->current_offset;
         }
-        if ($this->current_offset > $this->last_offset) {
+        if (($is_ascending && $this->current_offset > $this->last_offset) ||
+            (!$is_ascending && $this->current_offset < 0)) {
             $this->advanceGeneration();
             $this->next_offset = $this->current_offset;
         }
         if ($gen_doc_offset !== null) {
-            if ($this->current_generation < $gen_doc_offset[0]) {
+            if (($is_ascending &&
+                $this->current_generation < $gen_doc_offset[0]) ||
+                (!$is_ascending &&
+                    $this->current_generation > $gen_doc_offset[0])) {
                 $this->advanceGeneration($gen_doc_offset[0]);
                 $this->next_offset = $this->current_offset;
             }
             if ($this->current_generation == $gen_doc_offset[0]) {
-                $this->current_offset = max($this->current_offset,
-                    $gen_doc_offset[1]);
-                if ($this->current_offset > $this->last_offset) {
+                $this->current_offset = ($is_ascending) ?
+                    max($this->current_offset, $gen_doc_offset[1]) :
+                    min($this->current_offset, $gen_doc_offset[1]);
+                if (($is_ascending &&
+                    $this->current_offset > $this->last_offset) ||
+                    (!$is_ascending &&
+                        $this->current_offset < $this->last_offset)) {
                     $this->advanceGeneration();
                     $this->next_offset = $this->current_offset;
                 }
             }
-            $this->seen_docs =
-                $this->current_offset /
-                    4 * IndexShard::DOC_KEY_LEN;
+            $this->seen_docs = $this->current_offset /
+                4 * IndexShard::DOC_KEY_LEN;
         }
     }
     /**
@@ -278,12 +315,16 @@ class DocIterator extends IndexBundleIterator
      */
     public function advanceGeneration($generation = null)
     {
+        $is_ascending = ($this->direction == self::ASCENDING);
         if ($generation === null) {
-            $generation = $this->current_generation + 1;
+            $generation = ($is_ascending) ? $this->current_generation + 1 :
+                $this->current_generation - 1;
         }
         $this->current_generation = $generation;
-        $this->current_offset = 0;
-        if ($generation < $this->num_generations) {
+        $this->current_offset = ($is_ascending) ? 0 :
+            $this->last_offset;
+        if (($is_ascending && $generation < $this->num_generations) ||
+            (!$is_ascending && $generation >= 0) ) {
             $this->getShardInfo($generation);
         }
     }
@@ -295,8 +336,11 @@ class DocIterator extends IndexBundleIterator
      * and generation; -1 on fail
      */
     public function currentGenDocOffsetWithWord() {
-        if (($this->current_offset > $this->last_offset ||
-            $this->current_generation >= $this->num_generations)) {
+        $is_ascending = ($this->direction == self::ASCENDING);
+        if (($is_ascending && ($this->current_offset > $this->last_offset ||
+            $this->current_generation >= $this->num_generations)) ||
+            (!$is_ascending && ($this->current_offset < 0 ||
+                $this->current_generation < 0))) {
             return -1;
         }
         return [$this->current_generation, $this->current_offset];
diff --git a/src/library/index_bundle_iterators/NegationIterator.php b/src/library/index_bundle_iterators/NegationIterator.php
index a3ad9c82b..f94d0ad8f 100644
--- a/src/library/index_bundle_iterators/NegationIterator.php
+++ b/src/library/index_bundle_iterators/NegationIterator.php
@@ -142,16 +142,15 @@ class NegationIterator extends IndexBundleIterator
                 $old_gen_offset_all) == 0)) {
                 return -1;
             }
-            $gen_offset_term =
-                $this->index_bundle_iterators[
-                    1]->currentGenDocOffsetWithWord();
+            $gen_offset_term = $this->index_bundle_iterators[
+                1]->currentGenDocOffsetWithWord();
             if ($gen_offset_term == -1 || ($changed_term &&
                 $this->genDocOffsetCmp($gen_offset_term,
                 $old_gen_offset_term) == 0)) {
                 return -1;
             }
             $gen_doc_cmp = $this->genDocOffsetCmp($gen_offset_all,
-                $gen_offset_term);
+                $gen_offset_term, $this->getDirection());
             if ($gen_doc_cmp > 0) {
                 $this->index_bundle_iterators[1]->advance($gen_offset_all);
                 $old_gen_offset_term = $gen_offset_term;
diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php
index b08b57346..791592618 100644
--- a/src/library/media_jobs/FeedsUpdateJob.php
+++ b/src/library/media_jobs/FeedsUpdateJob.php
@@ -36,7 +36,7 @@ use seekquarry\yioop\library as L;
 use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\FetchUrl;
 use seekquarry\yioop\library\IndexShard;
-use seekquarry\yioop\library\IndexArchiveBundle;
+use seekquarry\yioop\library\FeedArchiveBundle;
 use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\UrlParser;

@@ -47,14 +47,7 @@ use seekquarry\yioop\library\UrlParser;
  */
 class FeedsUpdateJob extends MediaJob
 {
-    /**
-     * how long in seconds before a feed item expires
-     */
-    const ITEM_EXPIRES_TIME = 4 * C\ONE_WEEK;
-    /**
-     * Mamimum number of feeds to download in one try
-     */
-    const MAX_FEEDS_ONE_GO = 100;
+
     /**
      * Time in current epoch when feeds last updated
      * @var int
@@ -66,6 +59,22 @@ class FeedsUpdateJob extends MediaJob
      * @var object
      */
     public $db;
+    /**
+     * @var IndexArchiveBundle
+     */
+    public $index_archive;
+    /**
+     * @var array
+     */
+    public $found_items;
+    /**
+     * Maximum number of feeds to download in one try
+     */
+    const MAX_FEEDS_ONE_GO = 100;
+    /**
+     * how long in seconds before a feed item expires
+     */
+    const OLD_ITEM_TIME = 4 * C\ONE_WEEK;
     /**
      * Initializes the last update time to far in the past so, feeds will get
      * immediately updated. Sets up connect to DB to store feeds items, and
@@ -77,9 +86,14 @@ class FeedsUpdateJob extends MediaJob
         $this->update_time = 0;
         $this->name_server_does_client_tasks = true;
         $this->name_server_does_client_tasks_only = true;
+        $dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name;
+        $info['DESCRIPTION'] = "feed";
+        $this->index_archive = new FeedArchiveBundle($dir, false,
+            serialize($info), C\NUM_DOCS_PER_GENERATION);
         $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS). "Manager";
         $this->db = new $db_class();
         $this->db->connect();
+        $this->db->setWorldPermissionsRecursive($dir);
         C\nsconddefine("FEEDS_UPDATE_INTERVAL", C\ONE_HOUR);
     }
     /**
@@ -121,8 +135,8 @@ class FeedsUpdateJob extends MediaJob
     }
     /**
      * For each feed source downloads the feeds, checks which items are
-     * not in the database, adds them. Then calls the method to rebuild the
-     * inverted index shard for feeds
+     * new, and makes an array of them. Then calls the method to add these
+     * items to the IndexArchiveBundle for feeds
      *
      * @param array $tasks array of feed info (url to download, paths to
      *  extract etc)
@@ -138,18 +152,19 @@ class FeedsUpdateJob extends MediaJob
         L\crawlLog("----This media updater is responsible for the feeds:");
         $i = 1;
         foreach ($feeds as $feed) {
-            L\crawlLog("----  $i. ".$feed["NAME"]);
+            L\crawlLog("----  $i. " . $feed["NAME"]);
             $i++;
         }
         $num_feeds = count($feeds);
         $feeds_one_go = self::MAX_FEEDS_ONE_GO;
         $limit = 0;
+        $this->found_items = [];
         while ($limit < $num_feeds) {
             $feeds_batch = array_slice($feeds, $limit, $feeds_one_go);
-            $this->updateFeedItemsOneGo($feeds_batch, self::ITEM_EXPIRES_TIME);
+            $this->updateFoundItemsOneGo($feeds_batch, self::OLD_ITEM_TIME);
             $limit += $feeds_one_go;
         }
-        $this->rebuildFeedShard(self::ITEM_EXPIRES_TIME);
+        $this->addFoundItemsInvertedIndex(self::OLD_ITEM_TIME);
     }
     /**
      * Handles the request to get the  array of feed sources which hash to
@@ -216,7 +231,7 @@ class FeedsUpdateJob extends MediaJob
      * @return mixed either true, or if $test_mode is true then the results
      *      as a string of downloading the feeds and extracting the feed items
      */
-    public function updateFeedItemsOneGo($feeds, $age = C\ONE_WEEK,
+    public function updateFoundItemsOneGo($feeds, $age = C\ONE_WEEK,
         $test_mode = false)
     {
         $test_results = "";
@@ -596,25 +611,13 @@ class FeedsUpdateJob extends MediaJob
      * shard to be active. If this method is going to take max_execution_time/2
      * it returns false, so an additional job can be schedules; otherwise
      * it returns true
-     *
      * @param int $age how many seconds old records should be deleted
+     *
      * @return bool whether job executed to complete
      */
-    public function rebuildFeedShard($age)
+    public function addFoundItemsInvertedIndex($age)
     {
-        $time = time();
-        $prune_shard_name = C\WORK_DIRECTORY . "/feeds/prune_index";
-        $dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name;
-        $info['DESCRIPTION'] = "feed";
-        $index_archive = new IndexArchiveBundle($dir, false,
-            serialize($info), C\NUM_DOCS_PER_GENERATION, self::DESCENDING);
-        $this->db->setWorldPermissionsRecursive($dir);
-        $prune_shard = new IndexShard($prune_shard_name);
-        $too_old = $time - $age;
-        $num_sites = 0;
-        if (!$prune_shard) {
-            return false;
-        }
+        $items = $this->found_items;
         $pre_feeds = $this->tasks;
         if (!$pre_feeds) {
             return false;
@@ -626,90 +629,83 @@ class FeedsUpdateJob extends MediaJob
             }
             $feeds[$pre_feed['NAME']] = $pre_feed;
         }
+        $time = time();
+        $tmp_shard_name = C\WORK_DIRECTORY . "/data/tmp_index";
+        $tmp_shard = new IndexShard($tmp_shard_name);
+        $num_sites = 0;
+        if (!$tmp_shard) {
+            return false;
+        }
         $db = $this->db;
-        // we now rebuild the inverted index with the remaining items
-        $sql = "SELECT * FROM FEED_ITEM WHERE PUBDATE >= ? " .
-            "ORDER BY PUBDATE ASC";
+        $completed = true;
+        L\crawlLog("----.. Creating inverted index of new items.");
+        $i = 0;
+        $term_counts = [];
         $seen_url_count = 0;
-        $seen_sites = [];
-        $result = $db->execute($sql, [$too_old]);
-        if ($result) {
-            $completed = true;
-            L\crawlLog("----..Making new index" .
-                " of non-pruned items.");
-            $i = 0;
-            $term_counts = [];
-            while ($item = $db->fetchArray($result)) {
-                L\crawlTimeoutLog(
-                    "----..have added %s non-pruned items to index.", $i);
-                $i++;
-                if (!isset($item['SOURCE_NAME'])) {
-                    continue;
-                }
-                $source_name = $item['SOURCE_NAME'];
-                if (isset($feeds[$source_name])) {
-                    $lang = $feeds[$source_name]['LANGUAGE'];
-                    $media_category = $feeds[$source_name]['CATEGORY'];
-                } else {
-                    $lang = "";
-                    $media_category = "news";
-                }
-                $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
-                $word_and_qa_lists = PhraseParser::extractPhrasesInLists(
-                    $phrase_string, $lang);
-                $raw_guid = L\unbase64Hash($item["GUID"]);
-                $doc_keys = L\crawlHash($item["LINK"], true) .
-                    $raw_guid . "d". substr(L\crawlHash(
-                    UrlParser::getHost($item["LINK"]) . "/", true), 1);
-                $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
-                    $source_name, $item["GUID"], $media_category);
-                $len = strlen($phrase_string);
-                $word_list = $word_and_qa_lists["WORD_LIST"];
-                if (PhraseParser::computeSafeSearchScore($word_list, $len,
-                    $item["LINK"]) < 0.012) {
-                    $meta_ids[] = "safe:true";
-                    $meta_ids[] = "safe:all";
-                } else {
-                    $meta_ids[] = "safe:false";
-                    $meta_ids[] = "safe:all";
-                }
-                $prune_shard->addDocumentWords($doc_keys,
-                    self::NEEDS_OFFSET_FLAG, $word_and_qa_lists["WORD_LIST"],
-                    $meta_ids, true, false);
-                $this->updateTrendingTermCounts($term_counts, $phrase_string,
-                    $word_and_qa_lists["WORD_LIST"], $media_category,
-                    $source_name, $lang,
-                    $item['PUBDATE']);
-                $seen_url_count += 1;
-                $page = [];
-                $page[self::TITLE] = $item['TITLE'];
-                $page[self::DESCRIPTION] = $item['DESCRIPTION'];
-                $page[self::URL] = $item['LINK'];
-                $page[self::HASH] = $item['GUID'];
-                $page[self::SOURCE_NAME] = $item['SOURCE_NAME'];
-                $page[self::IMAGE_LINK] = $item['IMAGE_LINK'];
-                $page[self::PUBDATE] = $item['PUBDATE'];
-                $seen_sites[] = $page;
+        foreach ($items as $item) {
+            L\crawlTimeoutLog(
+                "----..have added %s items to new item index.", $i);
+            $i++;
+            if (!isset($item['SOURCE_NAME'])) {
+                continue;
+            }
+            $source_name = $item['SOURCE_NAME'];
+            if (isset($feeds[$source_name])) {
+                $lang = $feeds[$source_name]['LANGUAGE'];
+                $media_category = $feeds[$source_name]['CATEGORY'];
+            } else {
+                $lang = "";
+                $media_category = "news";
+            }
+            $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
+            $word_and_qa_lists = PhraseParser::extractPhrasesInLists(
+                $phrase_string, $lang);
+            $raw_guid = L\unbase64Hash($item["GUID"]);
+            $doc_keys = L\crawlHash($item["LINK"], true) .
+                $raw_guid . "d". substr(L\crawlHash(
+                UrlParser::getHost($item["LINK"]) . "/", true), 1);
+            $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
+                $source_name, $item["GUID"], $media_category);
+            $len = strlen($phrase_string);
+            $word_list = $word_and_qa_lists["WORD_LIST"];
+            if (PhraseParser::computeSafeSearchScore($word_list, $len,
+                $item["LINK"]) < 0.012) {
+                $meta_ids[] = "safe:true";
+                $meta_ids[] = "safe:all";
+            } else {
+                $meta_ids[] = "safe:false";
+                $meta_ids[] = "safe:all";
             }
-            unset($term_counts['seen']);
-            $this->addTermCountsTrendingTable($db, $term_counts);
+            $tmp_shard->addDocumentWords($doc_keys,
+                self::NEEDS_OFFSET_FLAG, $word_and_qa_lists["WORD_LIST"],
+                $meta_ids, true, false);
+            $this->updateTrendingTermCounts($term_counts, $phrase_string,
+                $word_and_qa_lists["WORD_LIST"], $media_category,
+                $source_name, $lang,
+                $item['PUBDATE']);
+            $seen_url_count += 1;
+            $page = [];
+            $page[self::TITLE] = $item['TITLE'];
+            $page[self::DESCRIPTION] = $item['DESCRIPTION'];
+            $page[self::URL] = $item['LINK'];
+            $page[self::HASH] = $item['GUID'];
+            $page[self::SOURCE_NAME] = $item['SOURCE_NAME'];
+            $page[self::IMAGE_LINK] = $item['IMAGE_LINK'];
+            $page[self::PUBDATE] = $item['PUBDATE'];
+            $seen_sites[] = $page;
         }
-        L\crawlLog("----..deleting old feed items");
-        $sql = " DELETE FROM FEED_ITEM ";
-        $db->execute($sql);
-        L\crawlLog("----..done deleting old items");
+        unset($term_counts['seen']);
+        $this->addTermCountsTrendingTable($db, $term_counts);
+        L\crawlLog("----..adding items to IndexArchiveBundle");
         // 1. check if index shard is full or not. if it is, new gen
-        $generation = $index_archive->initGenerationToAdd(
-            $prune_shard->num_docs);
+        $generation = $this->index_archive->initGenerationToAdd(
+            $tmp_shard->num_docs);
         $summary_offsets = [];
         if (!empty($seen_sites)) {
             // 2. add pages, get summary_offset
-            $index_archive->addPages($generation,
-                self::SUMMARY_OFFSET, $seen_sites, $seen_url_count);
-            // keeping track of duplicates
-            $sql = " INSERT INTO FEED_ITEM (GUID) VALUES (?)";
+            $this->index_archive->addPagesAndSeenKeys($generation,
+                self::SUMMARY_OFFSET, self::HASH, $seen_sites, $seen_url_count);
             foreach ($seen_sites as $site) {
-                $result = $db->execute($sql, [$site[self::HASH]]);
                 $site_url = str_replace('|', "%7C", $site[self::URL]);
                 $host = UrlParser::getHost($site_url);
                 $raw_guid = L\unbase64Hash($site[self::HASH]);
@@ -720,17 +716,17 @@ class FeedsUpdateJob extends MediaJob
             }
             unset($seen_sites);
         }
-        $prune_string = $prune_shard->save(true, true);
-        $tmp_shard = IndexShard::load("news" , $prune_string);
+        $tmp_string = $tmp_shard->save(true, true);
+        $tmp_shard = IndexShard::load("feed_data", $tmp_string);
         if (!empty($summary_offsets)) {
             $tmp_shard->changeDocumentOffsets($summary_offsets);
-            $index_archive->addIndexData($tmp_shard);
+            $this->index_archive->addIndexData($tmp_shard);
         }
-        $index_archive->stopIndexingBundle();
-        if (file_exists($prune_shard_name)) {
-            unlink($prune_shard_name);
+        $this->index_archive->forceSave();
+        if (file_exists($tmp_shard_name)) {
+            unlink($tmp_shard_name);
         }
-        unset($prune_shard);
+        unset($tmp_shard);
         set_error_handler(null);
         set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
     }
@@ -873,7 +869,7 @@ class FeedsUpdateJob extends MediaJob
             "AND CATEGORY = ?";
         $interval_sql = "SELECT TERM, SUM(OCCURRENCES) AS OCCURRENCES ".
             "FROM TRENDING_TERM WHERE UPDATE_PERIOD = ? AND " .
-            "TIMESTAMP > ? AND LANGUAGE = ? GROUP BY TERM ".
+            "TIMESTAMP > ? AND LANGUAGE = ? AND CATEGORY = ? GROUP BY TERM ".
             "ORDER BY OCCURRENCES DESC ".
             $db->limitOffset(C\NUM_TRENDING);
         $insert_sql = "INSERT INTO TRENDING_TERM (TERM, OCCURRENCES, " .
@@ -904,7 +900,7 @@ class FeedsUpdateJob extends MediaJob
                         }
                     }
                     $result = $db->execute($interval_sql,
-                        [$sub_interval, $interval_start, $lang]);
+                        [$sub_interval, $interval_start, $lang, $category]);
                     while ($interval_info = $db->fetchArray($result)) {
                         $db->execute($insert_sql, [$interval_info['TERM'],
                         $interval_info['OCCURRENCES'], $interval, $time,
@@ -952,53 +948,42 @@ class FeedsUpdateJob extends MediaJob
             strlen($item["link"]) > C\MAX_URL_LEN) {
             return false;
         }
-        $item["title"] = substr($item["title"], 0, C\TITLE_LEN);
-        $item["description"] = substr($item["description"], 0,
+        $out_item = [];
+        $out_item["TITLE"] = substr($item["title"], 0, C\TITLE_LEN);
+        $out_item["DESCRIPTION"] = substr($item["description"], 0,
             C\MAX_GROUP_POST_LEN);
+        $out_item["LINK"] = $item["link"];
         if (empty($item["guid"])) {
             $hash_string = "";
             foreach ($unique_fields as $field) {
                 $hash_string .= $item[$field];
             }
-            $item["guid"] = L\crawlHash($hash_string);
+            $out_item["GUID"] = L\crawlHash($hash_string);
         } else {
-            $item["guid"] = L\crawlHash($item["guid"]);
+            $out_item["GUID"] = L\crawlHash($item["guid"]);
         }
         if (!isset($item["image_link"]) ||
             strlen($item["image_link"]) > C\MAX_URL_LEN) {
-            $item["image_link"] = "";
+            $out_item["IMAGE_LINK"] = "";
+        } else {
+            $out_item["IMAGE_LINK"] = $item["image_link"];
         }
-        $raw_guid = L\unbase64Hash($item["guid"]);
         if (!isset($item["pubdate"]) || $item["pubdate"] == "") {
-            $item["pubdate"] = time();
+            $out_item["PUBDATE"] = time();
         } else {
-            $item["pubdate"] = strtotime($item["pubdate"]);
-            if ($item["pubdate"] < 0) {
-                $item["pubdate"] = time();
+            $out_item["PUBDATE"] = strtotime($item["pubdate"]);
+            if ($out_item["PUBDATE"] < 0) {
+                $out_item["PUBDATE"] = time();
             }
         }
-        if (time() - $item["pubdate"] > $age) {
+        if (time() - $out_item["PUBDATE"] > $age) {
             return false;
         }
-        $sql = "SELECT COUNT(*) AS NUMBER FROM FEED_ITEM WHERE GUID = ?";
-        $db = $this->db;
-        $result = $db->execute($sql, [$item["guid"]]);
-        if ($result) {
-            $row = $db->fetchArray($result);
-            if ($row["NUMBER"] > 0) {
-                return false;
-            }
-        } else {
-            return true;
-        }
-        $sql = "INSERT INTO FEED_ITEM (GUID, TITLE, LINK, IMAGE_LINK,".
-            "DESCRIPTION, PUBDATE, SOURCE_NAME) VALUES (?, ?, ?, ?, ?, ?, ?)";
-        $result = $db->execute($sql, [$item['guid'],
-            $item['title'], $item['link'], $item['image_link'],
-            $item['description'], $item['pubdate'], $source_name]);
-        if (!$result) {
+        if ($this->index_archive->contains($out_item["GUID"])) {
             return false;
         }
+        $out_item['SOURCE_NAME'] = $source_name;
+        $this->found_items[] = $out_item;
         return true;
     }
     /**
ViewGit