Adds ReverseIterator and modifies existing files to support iterating backwards. FeedUpdateJob creates a NewsFeed IndexArchiveBundle which can be used as the news index.

Tim Chow [2019-12-06]

Adds ReverseIterator and modifies existing files to support iterating backwards. FeedUpdateJob creates a NewsFeed IndexArchiveBundle which can be used as the news index.

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
src/configs/Config.php
src/executables/ArcTool.php
src/library/CrawlConstants.php
src/library/IndexArchiveBundle.php
src/library/IndexManager.php
src/library/IndexShard.php
src/library/VersionFunctions.php
src/library/index_bundle_iterators/DisjointIterator.php
src/library/index_bundle_iterators/IndexBundleIterator.php
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/ReverseIterator.php
src/library/index_bundle_iterators/WordIterator.php
src/library/media_jobs/FeedsUpdateJob.php
src/models/CrawlModel.php
src/models/PhraseModel.php
src/views/helpers/FeedsHelper.php
tests/IndexShardTest.php

diff --git a/src/configs/Config.php b/src/configs/Config.php
index 895273dc7..b22405e87 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -158,7 +158,7 @@ function nsconddefine($constant, $value)
  * Version number for upgrade database function
  * @var int
  */
-nsdefine('DATABASE_VERSION', 67);
+nsdefine('DATABASE_VERSION', 68);
 /**
  * Minimum Version fo Yioop for which keyword ad script
  * still works with this version
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index ee83b9250..8c5dcd5a0 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -282,6 +282,8 @@ class ArcTool implements CrawlConstants
         $index_timestamp = (isset($matches[0])) ? $matches[0] : 0;
         if ($bundle_num >= 0) {
             $index_timestamp .= "-$bundle_num";
+        } else if ($bundle_name == "IndexDataNewsFeed") {
+            $index_timestamp = "NewsFeed";
         }
         $hash_paths = L\allCrawlHashPaths($word, true);
         $found = false;
@@ -371,6 +373,8 @@ class ArcTool implements CrawlConstants
         $index_timestamp = (isset($matches[0])) ? $matches[0] : 0;
         if ($bundle_num >= 0) {
             $index_timestamp .= "-$bundle_num";
+        } else if ($bundle_name == "IndexDataNewsFeed") {
+            $index_timestamp = "NewsFeed";
         }
         $index = IndexManager::getIndex($index_timestamp);
         $index->setCurrentShard($generation);
@@ -489,6 +493,8 @@ class ArcTool implements CrawlConstants
         $index_timestamp = (isset($matches[0])) ? $matches[0] : 0;
         if ($bundle_num >= 0) {
             $index_timestamp .= "-$bundle_num";
+        } else if ($bundle_name == "IndexDataNewsFeed") {
+            $index_timestamp = "NewsFeed";
         }
         $index = IndexManager::getIndex($index_timestamp);
         $index->setCurrentShard($generation, true);
@@ -510,6 +516,7 @@ class ArcTool implements CrawlConstants
             if (!$tmp) {
                 break;
             }
+
             $documents = array_merge($documents, $shard->getPostingsSlice(
                 $old_offset, $old_start, $old_end, 1));
             $raw_postings[] = $tmp;
diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index 09ecca301..75d41fcbd 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -60,6 +60,7 @@ interface CrawlConstants
     const robot_data_base_name = "RobotData";
     const etag_expires_data_base_name = "EtagExpiresData";
     const index_data_base_name = "IndexData";
+    const feed_index_data_base_name = "IndexDataNewsFeed";
     const double_index_base_name = "DoubleIndexData";
     const network_base_name = "Network";
     const network_crawllist_base_name = "NetworkCrawlList";
@@ -236,4 +237,5 @@ interface CrawlConstants
     const THUMB_URL = 'ec';
     const IS_VR = 'ed';
     const DURATION = 'ee';
+    const PUBDATE = 'ef';
 }
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index a294522f4..167d98b6e 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -294,7 +294,7 @@ class IndexArchiveBundle implements CrawlConstants
      * returns a reference to this shard
      * @return object last shard in the bundle
      */
-     public function getActiveShard()
+     public function getActiveShard($forward = true)
      {
         if ($this->setCurrentShard($this->generation_info['ACTIVE'])) {
             return $this->getCurrentShard();
@@ -317,7 +317,7 @@ class IndexArchiveBundle implements CrawlConstants
      *      merge dictionary side effects
      * @return object the currently being index shard
      */
-     public function getCurrentShard($force_read = false)
+     public function getCurrentShard($force_read = false, $forward = true)
      {
         if (!isset($this->current_shard)) {
             if (!isset($this->generation_info['CURRENT'])) {
@@ -331,7 +331,7 @@ class IndexArchiveBundle implements CrawlConstants
                     $this->current_shard = new IndexShard(
                         $current_index_shard_file,
                         $this->generation_info['CURRENT'],
-                        $this->num_docs_per_generation, true);
+                        $this->num_docs_per_generation, true, $forward);
                     $this->current_shard->getShardHeader($force_read);
                     $this->current_shard->read_only_from_disk = true;
                 } else {
@@ -346,7 +346,7 @@ class IndexArchiveBundle implements CrawlConstants
             } else {
                 $this->current_shard = new IndexShard($current_index_shard_file,
                     $this->generation_info['CURRENT'],
-                    $this->num_docs_per_generation);
+                    $this->num_docs_per_generation, false, $forward);
             }
         }
         return $this->current_shard;
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index adbf0a17d..95df96176 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -68,7 +68,7 @@ class IndexManager implements CrawlConstants
      * @param string $index_name timestamp of desired IndexArchiveBundle
      * @return object the desired IndexArchiveBundle reference
      */
-    public static function getIndex($index_name)
+    public static function getIndex($index_name, $forward_direction = true)
     {
         $index_name = trim($index_name); //trim to fix postgres quirkiness
         if (empty(self::$indexes[$index_name]) ||
@@ -86,10 +86,16 @@ class IndexManager implements CrawlConstants
                     return false;
                 }
             } else {
-                $index_archive_name = self::index_data_base_name . $index_name;
+                if ($index_name == "NewsFeed") {
+                    $index_archive_name = self::feed_index_data_base_name;
+                    $index_name = 13;
+                } else {
+                    $index_archive_name = self::index_data_base_name . $index_name;
+                }
                 if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) {
                     $tmp = new IndexArchiveBundle(
-                        C\CRAWL_DIR.'/cache/' . $index_archive_name);
+                        C\CRAWL_DIR.'/cache/' . $index_archive_name, null,
+                        C\NUM_DOCS_PER_GENERATION, $forward_direction);
                     if (!$tmp) {
                         return false;
                     }
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index 0af758daf..49240695b 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -209,6 +209,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * @var string
      */
     public $word_postings;
+    /**
+     * Specifies which direction an IndexShard will be traversed through using
+     * WordIterator
+     * @var bool
+     */
+    public $forward_direction;
     /**
      * Fraction of NUM_DOCS_PER_GENERATION document inserts before data
      * from the words array is flattened to word_postings. (It will
@@ -287,7 +293,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      */
     public function __construct($fname, $generation = 0,
         $num_docs_per_generation = C\NUM_DOCS_PER_GENERATION,
-        $read_only_from_disk = false)
+        $read_only_from_disk = false, $forward_direction = true)
     {
         parent::__construct($fname, -1);
         $this->hash_name = crawlHash($fname);
@@ -310,6 +316,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         $this->read_only_from_disk = $read_only_from_disk;
         $this->word_docs_packed = false;
         $this->blocks_words= [];
+        $this->forward_direction = $forward_direction;
     }
     /**
      * Used to pack a list of description scores and user ranks as a
@@ -654,8 +661,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * @return array desired list of doc's and their info
      */
     public function getPostingsSlice($start_offset, &$next_offset, $last_offset,
-        $len)
+        $len, $forward = true)
     {
+        $forward_dir = ($this->forward_direction && $forward);
         if (!$this->read_only_from_disk && !$this->word_docs_packed) {
             $this->mergeWordPostingsToString();
             $this->packWords(null);
@@ -663,6 +671,20 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         } else if ($this->read_only_from_disk && empty($this->num_docs)) {
             $this->getShardHeader();
         }
+        // Normal forward iterator
+        if ($forward_dir) {
+            return $this->postingsSliceForward($start_offset, $next_offset, $last_offset,
+                    $len);
+        }
+        // Reverse direction iterator used for newsfeed
+        else {
+            return $this->postingsSliceBackward($start_offset, $next_offset, $last_offset,
+                    $len);
+        }
+    }
+    public function postingsSliceForward($start_offset, &$next_offset, $last_offset,
+        $len)
+    {
         $num_docs_so_far = 0;
         $results = [];
         /* wd_len is a kludgy fix because word_docs_len can get out of sync
@@ -697,6 +719,51 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         $next_offset = $next << 2;
         return $results;
     }
+    public function postingsSliceBackward($start_offset, &$next_offset, $last_offset,
+        $len)
+    {
+        $num_docs_so_far = 0;
+        $results = [];
+        /* wd_len is a kludgy fix because word_docs_len can get out of sync
+           when things are file-based and am still tracking down why
+        */
+        $wd_len = (isset($this->file_len)) ?
+        $this->file_len - $this->docids_len : $this->word_docs_len;
+        /*  For a reverse shard, the arguments for start offset and
+            last offset are the same. It actually gets reversed here,
+            where end:=start and last:=start.
+        */
+        $end = $start_offset >> 2;
+        $last = $start_offset >> 2;
+        $next = $next_offset >> 2;
+        $posting_end = $next;
+        $total_posting_len = 0;
+        $num_postings_so_far = 0;
+        $stop = 0;
+        do {
+            if ($next < $end) {
+                break;
+            }
+            $posting_start = $next;
+            // getPostingAtOffset will modify both start and end to the value of next
+            // using addresses
+            $posting = $this->getPostingAtOffset(
+                $next, $posting_start, $posting_end);
+            $total_posting_len += strlen($posting);
+            $num_postings_so_far++;
+            $next = $posting_start - 1;
+            // getting the number of docs is the same forwards or backwards
+            $num_docs_or_links =
+                self::numDocsOrLinks($start_offset, $last_offset,
+                    $total_posting_len / $num_postings_so_far);
+            list($doc_id, , $item) =
+                $this->makeItem($posting, $num_docs_or_links);
+            $results[$doc_id] = $item;
+            $num_docs_so_far += $posting_end - $next;
+        } while ($next >= $last && $num_docs_so_far < $len);
+        $next_offset = $next << 2;
+        return $results;
+    }
     /**
      * An upper bound on the number of docs or links represented by
      * the start and ending integer offsets into a posting list.
@@ -1035,9 +1102,10 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * @return array (int offset to next posting, doc_offset for this post)
      */
      public function nextPostingOffsetDocOffset($start_offset, $end_offset,
-        $doc_offset)
+        $doc_offset, $forward = true)
     {
         $doc_index = $doc_offset >> 4;
+        $start = $start_offset >> 2;
         $end = $end_offset >> 2;
         $post_doc_index = $this->getDocIndexOfPostingAtOffset($end);
         if ($doc_index > $post_doc_index) { //fail fast
@@ -1045,9 +1113,16 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         } else if ($doc_index == $post_doc_index) {
             return [$end << 2, $post_doc_index << 4];
         }
-        $current = $start_offset >> 2;
-        $post_doc_index = $this->gallopPostingOffsetDocOffset($current,
+        $current = 0;
+        if ($forward) {
+            $current = $start_offset >> 2;
+            $post_doc_index = $this->gallopPostingOffsetDocOffset($current,
             $doc_index, $end);
+        } else {
+            $current = $end_offset >> 2;
+            $post_doc_index = $this->gallopPostingOffsetDocOffset($current,
+            $doc_index, $start);
+        }
         if ($doc_index == $post_doc_index) {
             return [$current << 2, $post_doc_index << 4];
         }
@@ -1076,7 +1151,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 return [$current << 2, $post_doc_index << 4];
             }
         } while($current <= $end);
-        return false;
      }
     /**
      * Performs a galloping search (double forward jump distance each failure
@@ -1094,16 +1168,30 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     public function gallopPostingOffsetDocOffset(&$current, $doc_index, $end)
     {
         $stride = 32;
-        do {
-            $post_doc_index = $this->getDocIndexOfPostingAtOffset($current);
-            if ($doc_index <= $post_doc_index) {
-                return $post_doc_index;
-            }
-            $current += $stride;
-            $stride <<= 1;
-        } while($current <= $end);
-        $current = $end;
-        return $post_doc_index;
+        if ($this->forward_direction) {
+            do {
+                $post_doc_index = $this->getDocIndexOfPostingAtOffset($current);
+                if ($doc_index <= $post_doc_index) {
+                    return $post_doc_index;
+                }
+                $current += $stride;
+                $stride <<= 1;
+            } while($current <= $end);
+            $current = $end;
+            return $post_doc_index;
+        } else {
+            do {
+                $post_doc_index = $this->getDocIndexOfPostingAtOffset($current);
+                if ($doc_index >= $post_doc_index) {
+                    return $post_doc_index;
+                }
+                $current -= $stride;
+                $stride <<= 1;
+            } while($current >= $end);
+            $current = $end;
+            return $post_doc_index;
+        }
+
     }
     /**
      * Given an offset of a posting into the word_docs string, looks up
@@ -1112,7 +1200,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * @param int $offset byte/char offset into the word_docs string
      * @return int a document byte/char offset into the doc_infos string
      */
-    public function docOffsetFromPostingOffset($offset) {
+    public function docOffsetFromPostingOffset($offset, $forward=true) {
+        $this->forward_direction = $forward;
         $doc_index = $this->getDocIndexOfPostingAtOffset($offset >> 2);
         return ($doc_index << 4);
     }
@@ -1133,8 +1222,15 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         if ($info !== false) {
             list($first_offset, $last_offset,
                 $num_docs_or_links) = $info;
-            $results = $this->getPostingsSlice($first_offset,
-                $first_offset, $last_offset, $len);
+            if ($this->forward_direction) {
+                $results = $this->getPostingsSlice($first_offset,
+                    $first_offset, $last_offset, $len);
+            }
+            else {
+                $results = $this->getPostingsSlice($first_offset,
+                $last_offset, $last_offset, $len);
+            }
+
         }
         return $results;
     }
@@ -1397,13 +1493,15 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             } else if ($offset == self::NEEDS_OFFSET_FLAG &&
                 $missing_count < 100) {
                 crawlLog("Index Shard Document:" . toHexString($id) .
-                    " still needs offset");
+                   " still needs offset");
                 $missing_count++;
             } else if ($offset == self::NEEDS_OFFSET_FLAG &&
                 $missing_count == 100) {
                 crawlLog("Index Shard: too many docs still need offset, " .
                     "not logging rest");
                 $missing_count++;
+            } else {
+                crawlLog("Still wrong");
             }
         }
     }
@@ -1483,7 +1581,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     public function saveWithoutDictionary($with_logging = false)
     {
         $this->getShardHeader(true);
-        if($with_logging) {
+        if ($with_logging) {
             crawlLog("Opening without dictionary version of shard to write...");
         }
         $fh = fopen($this->filename . "-tmp", "wb");
@@ -1497,7 +1595,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $this->len_all_docs,
             $this->len_all_link_docs);
         fwrite($fh, $header);
-        if($with_logging) {
+        if ($with_logging) {
             crawlLog("..without dictionary version of shard header written");
         }
         if (!$this->read_only_from_disk) {
@@ -1513,7 +1611,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $offset += $len;
             $remaining -= $len;
         }
-        if($with_logging) {
+        if ($with_logging) {
             crawlLog("..without dictionary version of shard word docs written");
         }
         $remaining = $this->docids_len;
@@ -1525,7 +1623,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $offset += $len;
             $remaining -= $len;
         }
-        if($with_logging) {
+        if ($with_logging) {
             crawlLog("..without dictionary version of shard doc infos written");
         }
         fclose($fh);
@@ -1536,7 +1634,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             unlink($this->filename);
             rename($this->filename . "-tmp", $this->filename);
         }
-        if($with_logging) {
+        if ($with_logging) {
             crawlLog("done replacing version of shard.");
         }
     }
@@ -1798,7 +1896,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     }
     /**
      * Reads 32 bit word as an unsigned int from the offset given in the
-     * word_docs string in the sahrd
+     * word_docs string in the shard
      * @param int $offset a byte offset into the word_docs string
      */
     public function getWordDocsWord($offset)
@@ -2090,4 +2188,4 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             substr($value, self::WORD_KEY_LEN,
                 self::WORD_DATA_LEN);
     }
-}
+}
\ No newline at end of file
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index 09ab43367..078aad92b 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -1850,3 +1850,14 @@ function upgradeDatabaseVersion67(&$db)
     $db->execute("ALTER TABLE SUBSEARCH ADD COLUMN " .
         "DEFAULT_QUERY VARCHAR(" . C\TITLE_LEN . ") DEFAULT ''");
 }
+/**
+ * Upgrades a Version 67 version of the Yioop database to a Version 68 version
+ * @param object $db datasource to use to upgrade.
+ */
+function upgradeDatabaseVersion68(&$db)
+{
+    $db->execute("DELETE FROM MIX_COMPONENTS WHERE MIX_TIMESTAMP=4
+        AND GROUP_ID=0");
+    $db->execute("INSERT INTO MIX_COMPONENTS VALUES(
+        4, 0, 13, 1, 'media:news')");
+}
diff --git a/src/library/index_bundle_iterators/DisjointIterator.php b/src/library/index_bundle_iterators/DisjointIterator.php
index 3ffab0f76..abff878c4 100644
--- a/src/library/index_bundle_iterators/DisjointIterator.php
+++ b/src/library/index_bundle_iterators/DisjointIterator.php
@@ -162,8 +162,12 @@ class DisjointIterator extends IndexBundleIterator
             } else if ($cur_gen_doc_offset == -1) {
                 continue;
             }
+            $forward = true;
+            if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) {
+                $forward = false;
+            }
             $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset,
-                $least_gen_offset);
+                $least_gen_offset, $forward);
             if ($gen_doc_cmp < 0) {
                 $least_gen_offset = $cur_gen_doc_offset;
                 $this->least_offset_index = $i;
@@ -187,8 +191,12 @@ class DisjointIterator extends IndexBundleIterator
             for ($i = 0; $i < $this->num_iterators; $i++) {
                 $cur_gen_doc_offset = $this->index_bundle_iterators[
                     $i]->currentGenDocOffsetWithWord();
+                $forward = true;
+                if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) {
+                    $forward = false;
+                }
                 if ($this->genDocOffsetCmp($cur_gen_doc_offset,
-                    $gen_doc_offset) < 0) {
+                    $gen_doc_offset, $forward) < 0) {
                     if ($no_change) {
                         $this->current_block_fresh = false;
                         $this->seen_docs += 1;
diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php
index 185ad5ec4..74fb5ff34 100644
--- a/src/library/index_bundle_iterators/IndexBundleIterator.php
+++ b/src/library/index_bundle_iterators/IndexBundleIterator.php
@@ -149,19 +149,34 @@ abstract class IndexBundleIterator implements CrawlConstants
      * @param array $gen_doc2  second ordered pair
      * @return int -1,0,1 depending on which is bigger
      */
-     public function genDocOffsetCmp($gen_doc1, $gen_doc2)
+     public function genDocOffsetCmp($gen_doc1, $gen_doc2, $forward=true)
      {
-        //less generation or greater
-        if ($gen_doc1[0] < $gen_doc2[0]) {
-            return -1;
-        } else if ($gen_doc1[0] > $gen_doc2[0]) {
-            return 1;
-        }
-        //less offset or greater
-        if ($gen_doc1[1] < $gen_doc2[1]) {
-            return -1;
-        } else if ($gen_doc1[1] > $gen_doc2[1]) {
-            return 1;
+        if ($forward) {
+            //less generation or greater
+            if ($gen_doc1[0] < $gen_doc2[0]) {
+                return -1;
+            } else if ($gen_doc1[0] > $gen_doc2[0]) {
+                return 1;
+            }
+            //less offset or greater
+            if ($gen_doc1[1] < $gen_doc2[1]) {
+                return -1;
+            } else if ($gen_doc1[1] > $gen_doc2[1]) {
+                return 1;
+            }
+        } else if (!$forward) {
+            //less generation or greater for reverse
+            if ($gen_doc1[0] < $gen_doc2[0]) {
+                return 1;
+            } else if ($gen_doc1[0] > $gen_doc2[0]) {
+                return -1;
+            }
+            //less offset or greater for reverse
+            if ($gen_doc1[1] < $gen_doc2[1]) {
+                return 1;
+            } else if ($gen_doc1[1] > $gen_doc2[1]) {
+                return -1;
+            }
         }
         //equal
         return 0;
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index 7c67f17d1..29d7316fc 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -31,6 +31,7 @@
 namespace seekquarry\yioop\library\index_bundle_iterators;

 use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;

 /**
  * Used to iterate over the documents which occur in all of a set of
@@ -179,6 +180,7 @@ class IntersectIterator extends IndexBundleIterator
             return -1;
         }
         //next we finish computing BM25F
+        $retrieve_postings_time = microtime(true);
         $docs = $this->index_bundle_iterators[0]->currentDocsWithWord();
         $weight = $this->weight;
         if (is_array($docs) && count($docs) == 1) {
@@ -414,7 +416,12 @@ class IntersectIterator extends IndexBundleIterator
         }
         $gen_doc_offset[0] = $biggest_gen_offset;
         $all_same = true;
+        $forward = true;
         for ($i = 1; $i < $this->num_iterators; $i++) {
+            if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) {
+                $forward = false;
+            }
+            $retrieve_postings_time = microtime(true);
             if ((($cur_gen_doc_offset = $this->index_bundle_iterators[
                 $i]->currentGenDocOffsetWithWord()) == -1) ||
                 time() > $time_out) {
@@ -422,7 +429,7 @@ class IntersectIterator extends IndexBundleIterator
             }
             $gen_doc_offset[$i] = $cur_gen_doc_offset;
             $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset,
-                $biggest_gen_offset);
+                $biggest_gen_offset, $forward);
             if ($gen_doc_cmp > 0) {
                 $biggest_gen_offset = $cur_gen_doc_offset;
                 $all_same = false;
@@ -435,12 +442,13 @@ class IntersectIterator extends IndexBundleIterator
         }
         $last_changed = -1;
         $i = 0;
+        $j = 0;
         while($i != $last_changed) {
             if (time() > $time_out) {
                 return -1;
             }
             if ($this->genDocOffsetCmp($gen_doc_offset[$i],
-                $biggest_gen_offset) < 0) {
+                $biggest_gen_offset, $forward) < 0) {
                 $iterator = $this->index_bundle_iterators[$i];
                 $iterator->advance($biggest_gen_offset);
                 if( ($cur_gen_doc_offset =
@@ -449,7 +457,7 @@ class IntersectIterator extends IndexBundleIterator
                 }
                 $gen_doc_offset[$i] = $cur_gen_doc_offset;
                 if ($this->genDocOffsetCmp($cur_gen_doc_offset,
-                    $biggest_gen_offset) > 0) {
+                    $biggest_gen_offset, $forward) > 0) {
                     $last_changed = $i;
                     $biggest_gen_offset = $cur_gen_doc_offset;
                 }
@@ -459,6 +467,7 @@ class IntersectIterator extends IndexBundleIterator
                 $i = 0;
                 $last_changed = max($last_changed, 0);
             }
+            $j++;
         }
         return 1;
     }
diff --git a/src/library/index_bundle_iterators/ReverseIterator.php b/src/library/index_bundle_iterators/ReverseIterator.php
new file mode 100644
index 000000000..5436146da
--- /dev/null
+++ b/src/library/index_bundle_iterators/ReverseIterator.php
@@ -0,0 +1,543 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2019
+ * @filesource
+ */
+namespace seekquarry\yioop\library\index_bundle_iterators;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\IndexShard;
+use seekquarry\yioop\library\IndexManager;
+
+/**
+ * Used to iterate, in reverse order, through the documents associated with a
+ * word in an IndexArchiveBundle. It also makes it easy to get the summaries
+ * of these documents.
+ *
+ * A description of how words and the documents containing them are stored
+ * is given in the documentation of IndexArchiveBundle.
+ *
+ * @author Chris Pollett and Tim Chow
+ * @see IndexArchiveBundle
+ */
+class ReverseIterator extends IndexBundleIterator
+{
+    /**
+     * hash of word or phrase that the iterator iterates over
+     * @var string
+     */
+    public $word_key;
+    /**
+     * Position from end of key that doesn't have to be an exact match
+     * (for phrases as using suffix tree)
+     * @var int
+     */
+    public $shift;
+    /**
+     * The timestamp of the index that is associated with this iterator
+     * @var string
+     */
+    public $index_name;
+    /**
+     * First shard generation that word info was obtained for
+     * @var int
+     */
+    public $start_generation;
+    /**
+     * Used to keep track of whether getWordInfo might still get more
+     * data on the search terms as advance generations
+     * @var bool
+     */
+    public $no_more_generations;
+    /**
+     * The next byte offset in the IndexShard
+     * @var int
+     */
+    public $next_offset;
+    /**
+     * An array of shard generation and posting list offsets, lengths, and
+     * numbers of documents
+     * @var array
+     */
+    public $dictionary_info;
+    /**
+     * File name (including path) of the feed shard for news items
+     * @var string
+     */
+    public $feed_shard_name;
+    /**
+     * Structure used to hold posting list start and stops for the query
+     * in the feed shard
+     * @var array
+     */
+    public $feed_info;
+    /**
+     * The total number of shards that have data for this word
+     * @var int
+     */
+    public $num_generations;
+    /**
+     * Index into dictionary_info corresponding to the current shard
+     * @var int
+     */
+    public $generation_pointer;
+    /**
+     * Numeric number of current shard
+     * @var int
+     */
+    public $current_generation;
+    /**
+     * The current byte offset in the IndexShard
+     * @var int
+     */
+    public $current_offset;
+    /**
+     * Starting Offset of word occurrence in the IndexShard
+     * @var int
+     */
+    public $start_offset;
+    /**
+     * Last Offset of word occurrence in the IndexShard
+     * @var int
+     */
+    public $last_offset;
+    /**
+     * Keeps track of whether the word_iterator list is empty because the
+     * word does not appear in the index shard
+     * @var bool
+     */
+    public $empty;
+    /**
+     * Filter passed to the constructor; when non-empty its isFiltered()
+     * method is used to drop results by host key
+     * @var object
+     */
+    public $filter;
+    /**
+     * The current value of the doc_offset of current posting if known
+     * @var int
+     */
+    public $current_doc_offset;
+    /** Host Key position + 1 (first char says doc, inlink or external link)*/
+    const HOST_KEY_POS = 17;
+    /** Length of a doc key*/
+    const KEY_LEN = 8;
+    /** If the $limit_feeds constructor input is true then limit the number
+     * of items coming from the feed shard to this count.
+     */
+    const LIMIT_FEEDS_COUNT = 25;
+    /**
+     * Creates a reverse word iterator with the given parameters.
+     *
+     * Looks up the dictionary info for the word, separates out any feed
+     * shard entry (stored under key -1), records how many generations have
+     * data for the word, and finally calls reset() so that iteration starts
+     * from the last generation in dictionary_info.
+     *
+     * @param string $word_key hash of word or phrase to iterate docs of
+     * @param int $shift up to what point in key should be a match
+     *      when do dictionary look up (for phrases because using suffix tree)
+     * @param string $index_name time_stamp of the index to use. The value 13
+     *      is translated to the bundle name "NewsFeed"
+     * @param bool $raw whether $word_key is already raw; if false it is
+     *      assumed to be in our variant of base64 and is decoded first
+     * @param object $filter if non-empty, its isFiltered($host_key) method is
+     *      used by findDocsWithWord() to drop results
+     * @param int $results_per_block the maximum number of results that can
+     *      be returned by a findDocsWithWord call
+     * @param bool $limit_feeds feed results appear before all others when
+     *      gotten out of this iterator (may be reordered later). This flag
+     *      controls whether an upper bound of self::LIMIT_FEEDS_COUNT is
+     *      imposed on the number of feed results returned
+     */
+    public function __construct($word_key, $shift, $index_name, $raw = false,
+        $filter = null,
+        $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK,
+        $limit_feeds = false)
+    {
+        if ($raw == false) {
+            //get rid of our modified base64 encoding
+            $word_key = L\unbase64Hash($word_key);
+        }
+        $this->filter = $filter;
+        $this->word_key = $word_key;
+        $this->shift = $shift;
+        // 13 is a placeholder index name that stands for the news feed
+        // bundle; it is somewhat of a magic number right now
+        if($index_name == 13) {
+            $index_name = "NewsFeed";
+        }
+        $this->index_name = $index_name;
+        list($estimated_total, $this->dictionary_info) =
+            IndexManager::getWordInfo($index_name, $word_key, $shift,
+            -1, -1, C\NUM_DISTINCT_GENERATIONS, true);
+        $this->feed_shard_name = C\WORK_DIRECTORY . "/feeds/index";
+        // feeds are only used if not disabled by NO_FEEDS and the feed shard
+        // file exists on disk
+        if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS)
+            && file_exists($this->feed_shard_name)) {
+            $this->use_feeds = true;
+        } else {
+            $this->use_feeds = false;
+        }
+        if ($this->use_feeds) {
+            // key -1 of the dictionary info holds the feed shard entry; pull
+            // it out so dictionary_info only contains regular generations
+            if (!isset($this->dictionary_info[-1])) {
+                $this->feed_info = false;
+                $this->feed_empty = true;
+            } else {
+                $this->feed_info = $this->dictionary_info[-1];
+                unset($this->dictionary_info[-1]);
+                $this->feed_empty = false;
+            }
+        } else {
+            $this->feed_info = false;
+            $this->feed_empty = true;
+        }
+        if (is_array($this->feed_info)) {
+            // first element is skipped; the next three are kept as
+            // start, end, and count
+            list(,$this->feed_start, $this->feed_end, $this->feed_count,) =
+                $this->feed_info;
+            $this->feed_info = [$this->feed_start, $this->feed_end,
+                $this->feed_count];
+        } else {
+            $this->feed_start = 0;
+            $this->feed_end = 0;
+            $this->feed_count = 0;
+        }
+        if ($this->feed_count > 0) {
+            $this->using_feeds = true;
+        } else {
+            $this->using_feeds = false;
+        }
+        // optionally cap the feed postings at LIMIT_FEEDS_COUNT items
+        if ($limit_feeds && $this->feed_count > self::LIMIT_FEEDS_COUNT) {
+            $this->feed_count = self::LIMIT_FEEDS_COUNT;
+            $this->feed_end = $this->feed_start +
+                IndexShard::POSTING_LEN * (self::LIMIT_FEEDS_COUNT - 1);
+        }
+        $this->num_docs = $this->feed_count + $estimated_total;
+        if ($this->dictionary_info === false) {
+            // NOTE(review): num_generations is not assigned on this path, so
+            // the comparisons below see its default value - confirm intended
+            $this->empty = true;
+        } else {
+            // order entries by generation and reindex them from 0
+            ksort($this->dictionary_info);
+            $this->dictionary_info = array_values($this->dictionary_info);
+            $this->num_generations = count($this->dictionary_info);
+            if ($this->num_generations == 0) {
+                $this->empty = true;
+            } else {
+                $this->empty = false;
+            }
+        }
+        $this->no_more_generations =
+            ($this->num_generations < C\NUM_DISTINCT_GENERATIONS);
+        $this->current_doc_offset = null;
+        $this->results_per_block = $results_per_block;
+        $this->current_block_fresh = false;
+        // reverse iteration begins at the last entry of dictionary_info
+        $this->start_generation = $this->num_generations-1;
+        if ($this->dictionary_info !== false || $this->feed_info !== false) {
+            $this->reset();
+        }
+    }
+    /**
+     * Resets the iterator to the first document block that it could iterate
+     * over. Because iteration is reversed, this is the end of the posting
+     * list of the last generation stored in dictionary_info.
+     */
+    public function reset()
+    {
+        $this->using_feeds = ($this->feed_count > 0);
+        if (!$this->empty) {//we shouldn't be called when empty - but to be safe
+            if ($this->start_generation < $this->num_generations-1) {
+                // more generations were loaded since construction, so
+                // re-fetch the dictionary info from scratch
+                list($estimated_total, $this->dictionary_info) =
+                    IndexManager::getWordInfo($this->index_name,
+                    $this->word_key, 0, -1, 0, C\NUM_DISTINCT_GENERATIONS,
+                    true);
+                $this->num_docs = $this->feed_count + $estimated_total;
+                ksort($this->dictionary_info);
+                $this->dictionary_info = array_values($this->dictionary_info);
+                $this->num_generations = count($this->dictionary_info);
+                $this->no_more_generations =
+                    ($this->num_generations < C\NUM_DISTINCT_GENERATIONS);
+            }
+            list($this->current_generation, $this->start_offset,
+                $this->last_offset, )
+                = $this->dictionary_info[$this->num_generations-1];
+        } else {
+            // no generation has data for the word: use an empty offset range
+            $this->start_offset = 0;
+            $this->last_offset = -1;
+            $this->num_generations = -1;
+        }
+        // in reverse we start reading from the last posting offset
+        $this->current_offset = $this->last_offset;
+        // point at the last generation, which in reverse is the first one
+        // we want
+        $this->generation_pointer = $this->num_generations-1;
+        $this->count_block = 0;
+        $this->seen_docs = 0;
+        $this->current_doc_offset = null;
+    }
+    /**
+     * Hook function used by currentDocsWithWord to return the current block
+     * of docs if it is not cached
+     *
+     * @return mixed array of doc key => posting data for the current block;
+     *      -1 if the iterator is empty or has run out of postings; null if
+     *      the block is empty while still on the last entry of
+     *      dictionary_info
+     */
+    public function findDocsWithWord()
+    {
+        if ($this->empty) {
+            return -1;
+        }
+        if (($this->generation_pointer>=$this->num_generations)
+            || ($this->generation_pointer == 0 &&
+            $this->current_offset < $this->start_offset)) {
+            return -1;
+        }
+        $this->next_offset = $this->current_offset;
+        $index = IndexManager::getIndex($this->index_name, false);
+        $index->setCurrentShard($this->current_generation, true);
+        $shard = $index->getCurrentShard(false, false);
+        // NOTE(review): next_offset is presumably taken by reference and
+        // updated by getPostingsSlice - confirm against IndexShard
+        $pre_results = $shard->getPostingsSlice(
+            $this->start_offset,
+            $this->next_offset, $this->last_offset,
+            $this->results_per_block, false);
+        if($this->index_name == "NewsFeed") {
+            // for news items the rank is derived from the item's age, so
+            // more recently published items get a larger DOC_RANK
+            $time = time();
+            foreach ($pre_results as $keys => $pre_result) {
+                $page = $index->getPage($pre_result[self::SUMMARY_OFFSET],
+                    $this->current_generation);
+                $delta = $time - $page[self::PUBDATE];
+                $pre_results[$keys][self::DOC_RANK] = 720000 /
+                    max($delta, 1);
+            }
+        }
+        $results = [];
+        $doc_key_len = IndexShard::DOC_KEY_LEN;
+        foreach ($pre_results as $keys => $data) {
+            $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
+            if (!empty($this->filter) && $this->filter->isFiltered($host_key)) {
+                continue;
+            }
+            $data[self::KEY] = $keys;
+            // inlinks is the domain of the inlink
+            $key_parts = str_split($keys, $doc_key_len);
+            if (isset($key_parts[2])) {
+                list($hash_url, $data[self::HASH], $data[self::INLINKS]) =
+                    $key_parts;
+            } else {
+                // keys with fewer than three parts are skipped
+                continue;
+            }
+            if (!empty($data[self::IS_FEED])) {
+                $data[self::CRAWL_TIME] = "feed";
+            } else {
+                $data[self::CRAWL_TIME] = $this->index_name;
+            }
+            $results[$keys] = $data;
+        }
+        $this->count_block = count($results);
+        if ($this->generation_pointer == $this->num_generations - 1 &&
+            $results == []) {
+            $results = null;
+        }
+        $this->pages = $results;
+        return $results;
+    }
+    /**
+     * Updates the seen_docs count during an advance() call.
+     * For a reverse shard, instead of adding to the offset, we subtract a
+     * block from it: when the current block was not freshly read,
+     * next_offset is set to current_offset minus the size of the block that
+     * is being skipped.
+     */
+    public function advanceSeenDocs()
+    {
+        if ($this->current_block_fresh != true) {
+            // block was not just read, so its size has to be estimated
+            $total_guess = IndexShard::numDocsOrLinks($this->next_offset,
+                    $this->start_offset);
+            $num_docs = $total_guess % $this->results_per_block;
+            if ($num_docs == 0) {
+                // a whole number of blocks: skip one full block
+                $num_docs = $this->results_per_block;
+            } else {
+                // NOTE(review): the remainder is recomputed over the range
+                // start_offset..last_offset rather than reusing the value
+                // above - confirm this is the intended partial block size
+                $num_docs = IndexShard::numDocsOrLinks($this->start_offset,
+                    $this->last_offset)%$this->results_per_block;
+            }
+            // step backwards by the number of postings in the block
+            $this->next_offset = $this->current_offset;
+            $this->next_offset -= IndexShard::POSTING_LEN * $num_docs;
+            if ($num_docs <= 0) {
+                // nothing to count; seen_docs and current_block_fresh are
+                // left untouched
+                return;
+            }
+        } else {
+            // block was just read by findDocsWithWord(), use its real size
+            $num_docs = $this->count_block;
+        }
+        $this->current_block_fresh = false;
+        $this->seen_docs += $num_docs;
+    }
+    /**
+     * Advances the iterator one group of docs, or, if a generation,
+     * doc_offset pair is supplied, towards that position.
+     *
+     * @param array $gen_doc_offset a generation, doc_offset pair. If null,
+     *     the iterator just moves by one block (see plainAdvance()). If set
+     *     and the iterator is exhausted or its current position already
+     *     compares less than the pair, nothing is done. Otherwise the
+     *     iterator moves on to a generation that is not greater than the
+     *     given one, and within that generation to the posting returned by
+     *     IndexShard::nextPostingOffsetDocOffset for the given doc_offset
+     */
+    public function advance($gen_doc_offset = null)
+    {
+        if ($gen_doc_offset == null) {
+            $this->plainAdvance();
+            return;
+        }
+        $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord();
+        if ($cur_gen_doc_offset == -1 ||
+            $this->genDocOffsetCmp($cur_gen_doc_offset,
+            $gen_doc_offset) < 0) {
+            return;
+        }
+        $this->plainAdvance();
+        if ($this->current_generation > $gen_doc_offset[0]) {
+            $this->advanceGeneration($gen_doc_offset[0]);
+            $this->next_offset = $this->current_offset;
+        }
+        $using_feeds = $this->using_feeds && $this->use_feeds;
+        if ($using_feeds) {
+            $shard = IndexManager::getIndex("feed");
+            // lower bound of the search when in the feed postings; without
+            // this $start would be undefined in the call below
+            $start = $this->feed_start;
+        } else {
+            $index = IndexManager::getIndex($this->index_name, false);
+            $index->setCurrentShard($this->current_generation, true);
+            $shard = $index->getCurrentShard(false, false);
+            $start = $this->start_offset;
+        }
+        if ($this->current_generation == $gen_doc_offset[0]) {
+            $offset_pair = $shard->nextPostingOffsetDocOffset(
+                 $start, $this->next_offset, $gen_doc_offset[1], false);
+            if ($offset_pair === false) {
+                // no suitable posting in this shard, move to next generation
+                $this->advanceGeneration();
+                $this->next_offset = $this->current_offset;
+            } else {
+                list($this->current_offset, $this->current_doc_offset) =
+                    $offset_pair;
+            }
+        }
+        // recompute seen_docs from the distance of the current offset to the
+        // start of the posting list
+        $this->seen_docs = ($this->current_offset - $this->start_offset) /
+            IndexShard::POSTING_LEN;
+    }
+    /**
+     * Moves the iterator by one group of docs without seeking to a
+     * particular position. This is what advance() calls to do the basic
+     * step. As iteration is reversed, offsets decrease and the generation is
+     * switched once the start of the current posting list is passed.
+     */
+    public function plainAdvance()
+    {
+        $this->advanceSeenDocs();
+        $this->current_doc_offset = null;
+        if ($this->current_offset <= $this->next_offset) {
+            // next_offset is not below the current offset, so there is
+            // nothing further back in this shard: switch generation
+            $this->advanceGeneration();
+            $this->next_offset = $this->current_offset;
+        } else {
+            $this->current_offset = $this->next_offset;
+        }
+        // stepping back may have gone past the first posting of the shard,
+        // in which case the next generation is needed
+        $ran_past_start = $this->current_offset < $this->start_offset;
+        if ($ran_past_start) {
+            $this->advanceGeneration();
+            $this->next_offset = $this->current_offset;
+        }
+    }
+    /**
+     * Switches which index shard is being used to return occurrences of
+     * the word to the next shard containing the word. As iteration is
+     * reversed, this walks generation_pointer downwards through
+     * dictionary_info.
+     *
+     * @param int $generation generation to move to or below; the loop stops
+     *      once current_generation is no longer greater than this value.
+     *      If null, the current generation is used
+     */
+    public function advanceGeneration($generation = null)
+    {
+        if ($this->using_feeds && $this->use_feeds) {
+            $this->using_feeds = false;
+            // NOTE(review): with the pointer at -1 neither of the first two
+            // branches in the loop below runs, so no shard info is loaded
+            // when leaving the feed postings - confirm this is intended
+            $this->generation_pointer = -1;
+        }
+        if ($generation === null) {
+            $generation = $this->current_generation;
+        }
+        do {
+            # step the pointer down one entry unless it is already below 0
+            if ($this->generation_pointer >= 0) {
+                $this->generation_pointer--;
+            }
+            # if the pointer still refers to an entry, load that shard's info
+            if ($this->generation_pointer >= 0) {
+                list($this->current_generation, $this->start_offset,
+                    $this->last_offset, )
+                    = $this->dictionary_info[$this->generation_pointer];
+                # reverse iteration starts from the last offset of the shard
+                $this->current_offset = $this->last_offset;
+            }
+            # if more generations might exist, the target has not been
+            # reached, and the loaded entries are used up, fetch more
+            if (!$this->no_more_generations &&
+                $this->current_generation > $generation &&
+                $this->generation_pointer <= 0) {
+                list($estimated_remaining_total, $info) =
+                    IndexManager::getWordInfo($this->index_name,
+                    $this->word_key, 0, -1, $this->num_generations,
+                    C\NUM_DISTINCT_GENERATIONS, true);
+                if (count($info) > 0) {
+                    $this->num_docs = $this->seen_docs +
+                        $estimated_remaining_total;
+                    ksort($info);
+                    // NOTE(review): new entries are appended after the
+                    // existing ones while the pointer walks downwards -
+                    // confirm the pointer ends up on the intended entry
+                    $this->dictionary_info = array_merge($this->dictionary_info,
+                        array_values($info));
+                    $this->num_generations = count($this->dictionary_info);
+                    $this->no_more_generations =
+                        count($info) < C\NUM_DISTINCT_GENERATIONS;
+                    // undoes the decrement the next loop pass will perform
+                    $this->generation_pointer++;
+                }
+            }
+        # loop while the current generation is greater than supplied argument
+        } while($this->current_generation > $generation &&
+        # and the pointer has not gone below the zeroth entry
+            $this->generation_pointer >= 0);
+    }
+    /**
+     * Gets the generation and doc_offset of the next document that
+     * would be returned by this iterator. The doc_offset is cached in
+     * current_doc_offset once it has been looked up.
+     *
+     * @return mixed an array [generation, doc_offset]; -1 if the iterator
+     *      is positioned before the start of its posting list or has no
+     *      generation left
+     */
+    public function currentGenDocOffsetWithWord()
+    {
+        if ($this->current_doc_offset === null) {
+            // before the first posting, or generation pointer below 0,
+            // means there is nothing left to return
+            $before_start = $this->current_offset < $this->start_offset;
+            $no_generation = $this->generation_pointer <= -1;
+            if ($before_start || $no_generation) {
+                return -1;
+            }
+            $bundle = IndexManager::getIndex($this->index_name);
+            $bundle->setCurrentShard($this->current_generation, true);
+            $shard = $bundle->getCurrentShard();
+            $this->current_doc_offset = $shard->docOffsetFromPostingOffset(
+                $this->current_offset, false);
+        }
+        return [$this->current_generation, $this->current_doc_offset];
+    }
+}
\ No newline at end of file
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index a571286da..333dfb6df 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -180,6 +180,9 @@ class WordIterator extends IndexBundleIterator
         $this->filter = $filter;
         $this->word_key = $word_key;
         $this->shift = $shift;
+        if($index_name == 13) {
+            $index_name = "NewsFeed";
+        }
         $this->index_name =  $index_name;
         list($estimated_total, $this->dictionary_info) =
             IndexManager::getWordInfo($index_name, $word_key, $shift,
@@ -544,6 +547,7 @@ class WordIterator extends IndexBundleIterator
         }
         $index = IndexManager::getIndex($this->index_name);
         $index->setCurrentShard($this->current_generation, true);
+        $index->setCurrentShard($this->current_generation, true);
         $this->current_doc_offset = $index->getCurrentShard(
             )->docOffsetFromPostingOffset($this->current_offset);
         return [$this->current_generation, $this->current_doc_offset];
diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php
index e4fb863fc..478f4cdc6 100644
--- a/src/library/media_jobs/FeedsUpdateJob.php
+++ b/src/library/media_jobs/FeedsUpdateJob.php
@@ -36,6 +36,7 @@ use seekquarry\yioop\library as L;
 use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\FetchUrl;
 use seekquarry\yioop\library\IndexShard;
+use seekquarry\yioop\library\IndexArchiveBundle;
 use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\UrlParser;

@@ -602,10 +603,16 @@ class FeedsUpdateJob extends MediaJob
     public function rebuildFeedShard($age)
     {
         $time = time();
-        $feed_shard_name = C\WORK_DIRECTORY . "/feeds/index";
         $prune_shard_name = C\WORK_DIRECTORY . "/feeds/prune_index";
+        $dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name;
+        $info['DESCRIPTION'] = "NewsFeed";
+        $info['FORWARD_DIRECTION'] = false;
+        $this->index_archive = new IndexArchiveBundle($dir, false,
+            serialize($info), C\NUM_DOCS_PER_GENERATION, false);
+        $this->db->setWorldPermissionsRecursive($dir);
         $prune_shard =  new IndexShard($prune_shard_name);
         $too_old = $time - $age;
+        $num_sites = 0;
         if (!$prune_shard) {
             return false;
         }
@@ -623,7 +630,9 @@ class FeedsUpdateJob extends MediaJob
         $db = $this->db;
         // we now rebuild the inverted index with the remaining items
         $sql = "SELECT * FROM FEED_ITEM WHERE PUBDATE >= ? " .
-            "ORDER BY PUBDATE DESC";
+            "ORDER BY PUBDATE ASC";
+        $seen_url_count = 0;
+        $seen_sites = [];
         $result = $db->execute($sql, [$too_old]);
         if ($result) {
             $completed = true;
@@ -665,36 +674,68 @@ class FeedsUpdateJob extends MediaJob
                     $meta_ids[] = "safe:false";
                     $meta_ids[] = "safe:all";
                 }
-                $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'],
+                $prune_shard->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG,
                     $word_and_qa_lists["WORD_LIST"], $meta_ids, true, false);
                 $this->updateTrendingTermCounts($term_counts, $phrase_string,
                     $word_and_qa_lists["WORD_LIST"], $media_category,
                     $source_name, $lang,
                     $item['PUBDATE']);
+                $seen_url_count += 1;
+                $page = [];
+                $page[self::TITLE] = $item['TITLE'];
+                $page[self::DESCRIPTION] = $item['DESCRIPTION'];
+                $page[self::URL] = $item['LINK'];
+                $page[self::HASH] = $item['GUID'];
+                $page[self::SOURCE_NAME] = $item['SOURCE_NAME'];
+                $page[self::IMAGE_LINK] = $item['IMAGE_LINK'];
+                $page[self::PUBDATE] = $item['PUBDATE'];
+                $seen_sites[] = $page;
             }
             unset($term_counts['seen']);
             $this->addTermCountsTrendingTable($db, $term_counts);
         }
-        $prune_shard->save();
+        L\crawlLog("----..deleting old feed items");
+        $sql = " DELETE FROM FEED_ITEM ";
+        $db->execute($sql);
+        L\crawlLog("----..done deleting old items");
+        // 1. check if indexshard is full or not. if it is, new gen
+        $generation = $this->index_archive->initGenerationToAdd(
+                $prune_shard->num_docs, null);
+        if ($generation != -1) {
+            $summary_offsets = [];
+            if (!empty($seen_sites)) {
+                // 2. add pages, get summary_offset
+                $this->index_archive->addPages($generation, self::SUMMARY_OFFSET,
+                    $seen_sites, $seen_url_count);
+                // keeping track of duplicates
+                $sql = " INSERT INTO FEED_ITEM (GUID) VALUES (?)";
+                foreach ($seen_sites as $site) {
+                    $result = $db->execute($sql, [$site[self::HASH]]);
+                    $site_url = str_replace('|', "%7C", $site[self::URL]);
+                    $host = UrlParser::getHost($site_url);
+                    $raw_guid = L\unbase64Hash($site[self::HASH]);
+                    $hash = L\crawlHash($site[self::URL], true) .
+                        $raw_guid . "d". substr(L\crawlHash(
+                        UrlParser::getHost($site[self::URL]) . "/", true), 1);
+                    $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
+                }
+                unset($seen_sites);
+            }
+            $prune_string = $prune_shard->save(true, true);
+            $tmp_shard = IndexShard::load("news" , $prune_string);
+            if (!empty($summary_offsets)) {
+                $tmp_shard->changeDocumentOffsets($summary_offsets);
+                $this->index_archive->addIndexData($tmp_shard);
+                $this->index_dirty = true;
+            }
+            $this->index_archive->stopIndexingBundle();
+        }
+        if (file_exists($prune_shard_name)) {
+            unlink($prune_shard_name);
+        }
+        unset($prune_shard);
         set_error_handler(null);
-        @chmod($prune_shard_name, 0777);
-        @chmod($feed_shard_name, 0777);
-        @rename($prune_shard_name, $feed_shard_name);
-        @chmod($feed_shard_name, 0777);
         set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
-        L\crawlLog("----..deleting old feed items");
-        $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
-        $db->execute($sql, [$too_old]);
-        $base_dir = C\APP_DIR . "/resources";
-        $subfolder = L\crawlHash(
-            'group' . C\PUBLIC_GROUP_ID . C\AUTH_KEY);
-        $prefix_folder = substr($subfolder, 0, 3);
-        $subfolder = "t" . $subfolder;
-        $date = date('Y-m-d', $too_old - C\ONE_DAY);
-        $old_folder = "$base_dir/$prefix_folder/$subfolder/$date";
-        L\crawlLog("----..deleting old feed image thumb folder");
-        $db->unlinkRecursive($old_folder);
-        L\crawlLog("----..done deleting old items");
     }
     /**
      * Updates trending term counts based on the string from the current
@@ -707,7 +748,7 @@ class FeedsUpdateJob extends MediaJob
      * @param array $word_or_phrase_list associate array of
      *      stemmed_word_or_phrase => positions in feed item of where occurs
      * @param string $media_category of feed source the item case from. We
-     *      tredning counts grouped by mmedia categort
+     *      trending counts grouped by media category
      * @param string $source_name of feed source the item case from. We exclude
      *      from counts the name of the feed source
      * @param string $lang locale_tag for this feed item
diff --git a/src/models/CrawlModel.php b/src/models/CrawlModel.php
index 01590feb3..cc6ce9ae5 100755
--- a/src/models/CrawlModel.php
+++ b/src/models/CrawlModel.php
@@ -1171,14 +1171,21 @@ EOT;
         $dirs = glob(C\CRAWL_DIR . '/cache/{' . self::index_data_base_name .
             ',' . self::double_index_base_name . '}*',
             GLOB_ONLYDIR | GLOB_BRACE);
+        $feed_dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name;
         foreach ($dirs as $dir) {
             $crawl = [];
-            preg_match('/(' .self::index_data_base_name .
-                '|'. self::double_index_base_name .')(\d+)$/', $dir, $matches);
-            $bundle_class_name = ($matches[1][0] == 'D') ?
-                C\NS_LIB . "DoubleIndexBundle" :
-                C\NS_LIB . "IndexArchiveBundle";
-            $crawl['CRAWL_TIME'] = $matches[2];
+            if ($dir != $feed_dir) {
+                preg_match('/(' .self::index_data_base_name .
+                    '|'. self::double_index_base_name .
+                    ')(\d+)$/', $dir, $matches);
+                $bundle_class_name = ($matches[1][0] == 'D') ?
+                    C\NS_LIB . "DoubleIndexBundle" :
+                    C\NS_LIB . "IndexArchiveBundle";
+                $crawl['CRAWL_TIME'] = $matches[2];
+            } else {
+                $bundle_class_name = C\NS_LIB . "IndexArchiveBundle";
+                $crawl['CRAWL_TIME'] = 13;
+            }
             $info = $bundle_class_name::getArchiveInfo($dir);
             if (isset($info['DESCRIPTION'])) {
                 set_error_handler(null);
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 8e83a83e6..d90732381 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -35,6 +35,7 @@ use seekquarry\yioop\library as L;
 use seekquarry\yioop\library\AnalyticsManager;
 use seekquarry\yioop\library\IndexManager;
 use seekquarry\yioop\library\PhraseParser;
+use seekquarry\yioop\library\IndexArchiveBundle;
 use seekquarry\yioop\library\index_bundle_iterators as I;

 /**
@@ -1693,10 +1694,35 @@ class PhraseModel extends ParallelModel
                                 $distinct_key[1] : 0;
                             $distinct_key_id = L\unbase64Hash(
                                 $distinct_key[0]);
-                            $tmp_word_iterators[$m] =
-                                new I\WordIterator($distinct_key_id, $shift,
-                                $index_name, true, $filter, $to_retrieve,
-                                $limit_feeds);
+                            // 13 is the CRAWL_TIME CrawlModel assigns to the
+                            // news feed bundle (still a magic number). That
+                            // bundle's folder is feed_index_data_base_name
+                            // itself, without an appended timestamp.
+                            $dir_name = C\CRAWL_DIR . "/cache/" .
+                                (($index_name == 13) ?
+                                self::feed_index_data_base_name :
+                                self::index_data_base_name . $index_name);
+                            $index = IndexManager::getIndex($index_name);
+                            $archive_info = $index->getArchiveInfo($dir_name);
+                            $description = unserialize($archive_info['DESCRIPTION']);
+                            if (isset($description['FORWARD_DIRECTION'])) {
+                                $forward_direction = $description['FORWARD_DIRECTION'];
+                            } else {
+                                $forward_direction = 1;
+                            }
+                            // will have to change index name for checking iterator
+                            if ($forward_direction) {
+                                $tmp_word_iterators[$m] =
+                                    new I\WordIterator($distinct_key_id, $shift,
+                                    $index_name, true, $filter, $to_retrieve,
+                                    $limit_feeds);
+                            }
+                            else {
+                                $tmp_word_iterators[$m] =
+                                    new I\ReverseIterator($distinct_key_id, $shift,
+                                    $index_name, true, $filter, $to_retrieve,
+                                    $limit_feeds);
+                            }
                             $sum += $tmp_word_iterators[$m]->num_docs;
                             if ($tmp_word_iterators[$m]->dictionary_info !=
                                 [] ||
diff --git a/src/views/helpers/FeedsHelper.php b/src/views/helpers/FeedsHelper.php
index ee12c750e..51c0cc104 100644
--- a/src/views/helpers/FeedsHelper.php
+++ b/src/views/helpers/FeedsHelper.php
@@ -78,7 +78,7 @@ class FeedsHelper extends Helper implements CrawlConstants
         $time = time();
         foreach ($feed_pages as $page) {
             if ($not_news) {
-                $pub_date = $page[self::SUMMARY_OFFSET][0][4];
+                $pub_date = $page[self::PUBDATE];
                 $encode_source = urlencode(
                     urlencode($page[self::SOURCE_NAME]));
                 $pub_date = $this->getPubdateString($time, $pub_date);
@@ -120,7 +120,7 @@ class FeedsHelper extends Helper implements CrawlConstants
         $query_array = (empty($csrf_token)) ? [] :
             [C\CSRF_TOKEN => $csrf_token];
         $delim = (C\REDIRECTS_ON) ? "?" : "&amp;";
-        $pub_date = $page[self::SUMMARY_OFFSET][0][4];
+        $pub_date = $page[self::PUBDATE];
         $encode_source = urlencode(
             urlencode($page[self::SOURCE_NAME]));
         $time = time();
diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php
index 8df9dd3c4..124b270da 100644
--- a/tests/IndexShardTest.php
+++ b/tests/IndexShardTest.php
@@ -36,6 +36,9 @@ use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\IndexShard;
 use seekquarry\yioop\library\LinearAlgebra as LA;
 use seekquarry\yioop\library\UnitTest;
+use seekquarry\yioop\library\index_bundle_iterators\WordIterator;
+use seekquarry\yioop\library\index_bundle_iterators\ReverseIterator;
+use seekquarry\yioop\library\IndexManager;

 /**
  * Used to test that the IndexShard class can properly add new documents
@@ -57,6 +60,8 @@ class IndexShardTest extends UnitTest
             "/shard2.txt", 0);
         $this->test_objects['shard3'] = new IndexShard(C\WORK_DIRECTORY.
             "/shard3.txt", 0);
+        $this->test_objects['shard4'] = new IndexShard(C\WORK_DIRECTORY.
+            "/shard4.txt", 0, C\NUM_DOCS_PER_GENERATION, false, false);
     }
     /**
      * Deletes any index shard files we may have created
@@ -67,6 +72,7 @@ class IndexShardTest extends UnitTest
         @unlink(C\WORK_DIRECTORY."/shard.txt");
         @unlink(C\WORK_DIRECTORY."/shard2.txt");
         @unlink(C\WORK_DIRECTORY."/shard3.txt");
+        @unlink(C\WORK_DIRECTORY."/shard4.txt");
         set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
     }
     /**
@@ -144,6 +150,232 @@ class IndexShardTest extends UnitTest
         $this->assertEqual(count($c_data), 2,
             "Doc lookup by meta word works has correct count");
     }
+    /**
+     * Check if can store documents into a reverse index shard and retrieve
+     * them. 'shard' is a regular forward IndexShard, while 'shard4' is built
+     * in setUp with the additional flag which makes everything go in reverse.
+     * The same three documents are added to both shards; the test then
+     * checks that a word lookup agrees between the two and that a slice of
+     * postings read from shard4, once flipped, equals the slice read from
+     * shard. If an IndexDataNewsFeed bundle exists under CRAWL_DIR/cache,
+     * the media:news postings of that bundle are additionally walked to
+     * exhaustion with a WordIterator and with a ReverseIterator.
+     */
+    public function addDocumentsGetPostingsSliceReverseTestCase()
+    {
+        // add a first document to both shards
+        $docid = "AAAAAAAA";
+        $doc_hash = "BBBBBBBB";
+        $doc_hosts_url = "CCCCCCCC";
+        $docid .= $doc_hash . $doc_hosts_url;
+        $offset = 5;
+        $word_counts = [
+            'BBBBBBBB' => [1, 3],
+            'CCCCCCCC' => [4, 9, 16],
+            'DDDDDDDD' => [5, 25, 125],
+        ];
+        $meta_ids = ["EEEEEEEE", "FFFFFFFF"];
+        $this->test_objects['shard']->addDocumentWords($docid,
+            $offset, $word_counts, $meta_ids, ["EEEEEEEE"], true);
+        $this->test_objects['shard4']->addDocumentWords($docid,
+            $offset, $word_counts, $meta_ids, ["EEEEEEEE"], true);
+        $this->assertEqual($this->test_objects['shard']->len_all_docs, 8,
+            "Len All Docs Correctly Counts Length of First Doc");
+        // add a second document and check
+        $docid = "HHHHHHHH";
+        $doc_hash = "IIIIIIII";
+        $doc_hosts_url = "JJJJJJJJ";
+        $docid .= $doc_hash. $doc_hosts_url;
+        $offset = 7;
+        $word_counts = [
+            'CCCCCCCC' => [1, 4, 9],
+            'GGGGGGGG' => [6],
+        ];
+        $meta_ids = ["YYYYYYYY"];
+        $this->test_objects['shard']->addDocumentWords($docid,
+            $offset, $word_counts, $meta_ids, ['FFFFFFFF'], true);
+        $this->test_objects['shard4']->addDocumentWords($docid,
+            $offset, $word_counts, $meta_ids, ['FFFFFFFF'], true);
+        // add a third document
+        $docid = "ABABABAB";
+        $doc_hash = "IJIJIJIJ";
+        $doc_hosts_url = "KLKLKLKL";
+        $docid .= $doc_hash. $doc_hosts_url;
+        $offset = 50;
+        $word_counts = [
+            'the' => [1,9,12,17,19,42,52,95,103],
+            'mineral' => [2],
+            'known' => [3],
+            'as' => [4,132],
+            'kryptonite' => [5,32,74,112,114,129],
+            'was' => [6,33,55,86,121,130],
+            'introduced' => [7,34,131],
+            'in' => [8,16,24,58,91,102],
+            'radio' => [10,53],
+            'serial' => [11,54],
+            'adventures' => [13],
+            'of' => [14],
+            'superman' => [15,69,107,137],
+            'story' => [18,29],
+            'meteor' => [20,104],
+            'from' => [21,105],
+            'krypton' => [22,106],
+            'broadcast' => [23],
+            'june' => [25],
+            '19433' => [26],
+            'an' => [27,59],
+            'apocryphal' => [28],
+            'claims' => [30],
+            'that' => [31,120],
+            'to' => [35,44,67,111,117,138],
+            'give' => [36],
+            'supermans' => [37],
+            'voice' => [38,78],
+            'actor' => [39,79],
+            'bud' => [40],
+            'collyer' => [41,62,116],
+            'possibility' => [43],
+            'take' => [45,118],
+            'a' => [46,49,76,122,133],
+            'vacation' => [47],
+            'at' => [48],
+            'time' => [50],
+            'when' => [51],
+            'performed' => [56],
+            'live' => [57],
+            'episode' => [60],
+            'where' => [61],
+            'would' => [63,70,80],
+            'not' => [64],
+            'be' => [65,71],
+            'present' => [66],
+            'perform' => [68],
+            'incapacitated' => [72],
+            'by' => [73,88],
+            'and' => [75],
+            'substitute' => [77],
+            'make' => [81],
+            'groaning' => [82],
+            'sounds' => [83],
+            'this' => [84,101],
+            'tale' => [85],
+            'recounted' => [87],
+            'julius' => [89],
+            'schwartz' => [90],
+            'his' => [92,140],
+            'memoir4' => [93],
+            'however' => [94],
+            'historian' => [96],
+            'michael' => [97],
+            'j' => [98],
+            'hayde' => [99],
+            'disputes' => [100],
+            'is' => [108],
+            'never' => [109],
+            'exposed' => [110],
+            'if' => [113],
+            'allowed' => [115],
+            'vacations' => [119],
+            'fringe' => [123],
+            'benefit' => [124],
+            'discovered' => [125],
+            'later' => [126],
+            'more' => [127],
+            'likely' => [128],
+            'plot' => [134],
+            'device' => [135],
+            'for' => [136],
+            'discover' => [139],
+            'origin' => [141],
+        ];
+        $meta_ids = ["ZZZZZZZZ"];
+        $this->test_objects['shard']->addDocumentWords($docid,
+            $offset, $word_counts, $meta_ids, ['GGGGGGGG'], true);
+        $this->test_objects['shard4']->addDocumentWords($docid,
+            $offset, $word_counts, $meta_ids, ['GGGGGGGG'], true);
+        // 'the' occurs only in the third document, so both shards should
+        // return that single document for it
+        $forward = $this->test_objects['shard']->getPostingsSliceById(
+            L\crawlHashWord('the', true), 5);
+        $this->assertTrue(isset($forward[$docid]),
+            "Doc lookup by word works for shard");
+        $backward = $this->test_objects['shard4']->getPostingsSliceById(
+            L\crawlHashWord('the', true), 5);
+        $this->assertTrue(isset($backward[$docid]),
+            "Doc lookup by word works for shard4");
+        $this->assertEqual($forward, $backward,
+            "Both only have one document with this word");
+        // CCCCCCCC occurs in the first two documents; look up where its
+        // postings start and end in the forward shard
+        $info = $this->test_objects['shard']->getWordInfo(
+            L\crawlHashWord('CCCCCCCC', true), true);
+        list($first_offset, $last_offset,
+                $num_docs_or_links) = $info;
+        $this->assertEqual($first_offset, 36,
+            "First offset set correctly");
+        $this->assertEqual($last_offset, 40,
+            "Second offset set correctly");
+        // Run the offset lookup on both shards; the return values are not
+        // checked by this test.
+        $this->test_objects['shard']->nextPostingOffsetDocOffset(
+            $first_offset, $last_offset, 5);
+        $this->test_objects['shard4']->nextPostingOffsetDocOffset(
+            $first_offset, $last_offset, 5);
+        $forward = $this->test_objects['shard']->getPostingsSlice($first_offset,
+                $first_offset, $last_offset, 5);
+        // have to reset offset values, since getPostingsSlice modifies by ref
+        $info = $this->test_objects['shard4']->getWordInfo(
+            L\crawlHashWord('CCCCCCCC', true), true);
+        list($first_offset, $last_offset,
+                $num_docs_or_links) = $info;
+        $backward = $this->test_objects['shard4']->getPostingsSlice($first_offset,
+                $last_offset, $last_offset, 5, false);
+        // The assertion message speaks of a flipped result, so the forward
+        // slice is compared with the reversed shard4 slice
+        $reversed = array_reverse($backward);
+        $this->assertEqual($forward, $reversed,
+            "ReverseIndexShard returns a flipped version off a forward one");
+        // The remaining checks only run on machines where a NewsFeed index
+        // bundle is present in the crawl cache directory
+        $word = "media:news";
+        list($hash_key, $shift) =  L\allCrawlHashPaths($word, true)[0];
+        $index_archive_name = "IndexDataNewsFeed";
+        $index_name = "NewsFeed";
+        $results_limit = 200;
+        if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) {
+            $info = IndexManager::getWordInfo($index_name, $hash_key, $shift,
+                -1, 0, -1);
+            $this->assertTrue(isset($info[0][4]));
+            if (isset($info[0][4])) {
+                $word_iterator = new WordIterator($info[0][4], 0,
+                    $index_name, true, null, $results_limit);
+                // walk the postings until the iterator reports -1 as its
+                // current offset
+                $offset = $word_iterator->currentGenDocOffsetWithWord();
+                while ($offset != -1) {
+                    $word_iterator->advance();
+                    $offset = $word_iterator->currentGenDocOffsetWithWord();
+                }
+            }
+            $info = IndexManager::getWordInfo($index_name, $hash_key, $shift,
+                -1, 0, -1);
+            $this->assertTrue(isset($info[0][4]));
+            if (isset($info[0][4])) {
+                $rev_iterator = new ReverseIterator($info[0][4], 0,
+                    $index_name, true, null, $results_limit);
+                // same walk with the reverse iterator, again until it
+                // reports -1 as its current offset
+                $offset = $rev_iterator->currentGenDocOffsetWithWord();
+                while ($offset != -1) {
+                    $rev_iterator->advance();
+                    $offset = $rev_iterator->currentGenDocOffsetWithWord();
+                }
+            }
+            // NOTE(review): the two traversals are not compared with each
+            // other yet; add an assertion once the expected relation between
+            // forward and reverse offsets is confirmed.
+        }
+    }
     /**
      * Check if can store link documents into an index shard and retrieve them
      */
@@ -568,4 +800,4 @@ class IndexShardTest extends UnitTest
         $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
             "Save without dictionary test works");
     }
-}
+}
\ No newline at end of file

ViewGit