viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/configs/Config.php b/src/configs/Config.php index 895273dc7..b22405e87 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -158,7 +158,7 @@ function nsconddefine($constant, $value) * Version number for upgrade database function * @var int */ -nsdefine('DATABASE_VERSION', 67); +nsdefine('DATABASE_VERSION', 68); /** * Minimum Version fo Yioop for which keyword ad script * still works with this version diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index ee83b9250..8c5dcd5a0 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -282,6 +282,8 @@ class ArcTool implements CrawlConstants $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; if ($bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; + } else if ($bundle_name == "IndexDataNewsFeed") { + $index_timestamp = "NewsFeed"; } $hash_paths = L\allCrawlHashPaths($word, true); $found = false; @@ -371,6 +373,8 @@ class ArcTool implements CrawlConstants $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; if ($bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; + } else if ($bundle_name == "IndexDataNewsFeed") { + $index_timestamp = "NewsFeed"; } $index = IndexManager::getIndex($index_timestamp); $index->setCurrentShard($generation); @@ -489,6 +493,8 @@ class ArcTool implements CrawlConstants $index_timestamp = (isset($matches[0])) ? 
$matches[0] : 0; if ($bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; + } else if ($bundle_name == "IndexDataNewsFeed") { + $index_timestamp = "NewsFeed"; } $index = IndexManager::getIndex($index_timestamp); $index->setCurrentShard($generation, true); @@ -510,6 +516,7 @@ class ArcTool implements CrawlConstants if (!$tmp) { break; } + $documents = array_merge($documents, $shard->getPostingsSlice( $old_offset, $old_start, $old_end, 1)); $raw_postings[] = $tmp; diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index 09ecca301..75d41fcbd 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -60,6 +60,7 @@ interface CrawlConstants const robot_data_base_name = "RobotData"; const etag_expires_data_base_name = "EtagExpiresData"; const index_data_base_name = "IndexData"; + const feed_index_data_base_name = "IndexDataNewsFeed"; const double_index_base_name = "DoubleIndexData"; const network_base_name = "Network"; const network_crawllist_base_name = "NetworkCrawlList"; @@ -236,4 +237,5 @@ interface CrawlConstants const THUMB_URL = 'ec'; const IS_VR = 'ed'; const DURATION = 'ee'; + const PUBDATE = 'ef'; } diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index a294522f4..167d98b6e 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -294,7 +294,7 @@ class IndexArchiveBundle implements CrawlConstants * returns a reference to this shard * @return object last shard in the bundle */ - public function getActiveShard() + public function getActiveShard($forward = true) { if ($this->setCurrentShard($this->generation_info['ACTIVE'])) { return $this->getCurrentShard(); } @@ -317,7 +317,7 @@ class IndexArchiveBundle implements CrawlConstants * merge dictionary side effects * @return object the currently being index shard */ - public function getCurrentShard($force_read = false) + public function getCurrentShard($force_read = false, $forward = true) { if 
(!isset($this->current_shard)) { if (!isset($this->generation_info['CURRENT'])) { @@ -331,7 +331,7 @@ class IndexArchiveBundle implements CrawlConstants $this->current_shard = new IndexShard( $current_index_shard_file, $this->generation_info['CURRENT'], - $this->num_docs_per_generation, true); + $this->num_docs_per_generation, true, $forward); $this->current_shard->getShardHeader($force_read); $this->current_shard->read_only_from_disk = true; } else { @@ -346,7 +346,7 @@ class IndexArchiveBundle implements CrawlConstants } else { $this->current_shard = new IndexShard($current_index_shard_file, $this->generation_info['CURRENT'], - $this->num_docs_per_generation); + $this->num_docs_per_generation, $forward); } } return $this->current_shard; diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index adbf0a17d..95df96176 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -68,7 +68,7 @@ class IndexManager implements CrawlConstants * @param string $index_name timestamp of desired IndexArchiveBundle * @return object the desired IndexArchiveBundle reference */ - public static function getIndex($index_name) + public static function getIndex($index_name, $forward_direction = true) { $index_name = trim($index_name); //trim to fix postgres quirkiness if (empty(self::$indexes[$index_name]) || @@ -86,10 +86,16 @@ class IndexManager implements CrawlConstants return false; } } else { + if ($index_name == "NewsFeed") { + $index_archive_name = self::feed_index_data_base_name; + $index_name = 13; + } else { + $index_archive_name = self::index_data_base_name . $index_name; + } - $index_archive_name = self::index_data_base_name . $index_name; if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) { $tmp = new IndexArchiveBundle( - C\CRAWL_DIR.'/cache/' . $index_archive_name); + C\CRAWL_DIR.'/cache/' . 
$index_archive_name, null, + C\NUM_DOCS_PER_GENERATION, $forward_direction); if (!$tmp) { return false; } diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 0af758daf..49240695b 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -209,6 +209,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @var string */ public $word_postings; + /** + * Specifies which direction an IndexShard will be traversed through using + * WordIterator + * @var bool + */ + public $forward_direction; /** * Fraction of NUM_DOCS_PER_GENERATION document inserts before data * from the words array is flattened to word_postings. (It will @@ -287,7 +293,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants */ public function __construct($fname, $generation = 0, $num_docs_per_generation = C\NUM_DOCS_PER_GENERATION, - $read_only_from_disk = false) + $read_only_from_disk = false, $forward_direction = true) { parent::__construct($fname, -1); $this->hash_name = crawlHash($fname); @@ -310,6 +316,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants $this->read_only_from_disk = $read_only_from_disk; $this->word_docs_packed = false; $this->blocks_words= []; + $this->forward_direction = $forward_direction; } /** * Used to pack a list of description scores and user ranks as a @@ -654,8 +661,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @return array desired list of doc's and their info */ public function getPostingsSlice($start_offset, &$next_offset, $last_offset, - $len) + $len, $forward = true) { + $forward_dir = ($this->forward_direction && $forward); if (!$this->read_only_from_disk && !$this->word_docs_packed) { $this->mergeWordPostingsToString(); $this->packWords(null); @@ -663,6 +671,20 @@ class IndexShard extends PersistentStructure implements CrawlConstants } else if ($this->read_only_from_disk && empty($this->num_docs)) { $this->getShardHeader(); } + 
// Normal forward iterator + if ($forward_dir) { + return $this->postingsSliceForward($start_offset, $next_offset, $last_offset, + $len); + } + // Reverse direction iterator used for newsfeed + else { + return $this->postingsSliceBackward($start_offset, $next_offset, $last_offset, + $len); + } + } + public function postingsSliceForward($start_offset, &$next_offset, $last_offset, + $len) + { $num_docs_so_far = 0; $results = []; /* wd_len is a kludgy fix because word_docs_len can get out of sync @@ -697,6 +719,51 @@ class IndexShard extends PersistentStructure implements CrawlConstants $next_offset = $next << 2; return $results; } + public function postingsSliceBackward($start_offset, &$next_offset, $last_offset, + $len) + { + $num_docs_so_far = 0; + $results = []; + /* wd_len is a kludgy fix because word_docs_len can get out of sync + when things are file-based and am still tracking down why + */ + $wd_len = (isset($this->file_len)) ? + $this->file_len - $this->docids_len : $this->word_docs_len; + /* For a reverse shard, the arguments for start offset and + last offset are the same. It actually gets reversed here, + where end:=start and last:=start. 
+ */ + $end = $start_offset >> 2; + $last = $start_offset >> 2; + $next = $next_offset >> 2; + $posting_end = $next; + $total_posting_len = 0; + $num_postings_so_far = 0; + $stop = 0; + do { + if ($next < $end) { + break; + } + $posting_start = $next; + // getPostingAtOffset will modify both start and end to the value of next + // using addresses + $posting = $this->getPostingAtOffset( + $next, $posting_start, $posting_end); + $total_posting_len += strlen($posting); + $num_postings_so_far++; + $next = $posting_start - 1; + // getting the number of docs is the same forwards or backwards + $num_docs_or_links = + self::numDocsOrLinks($start_offset, $last_offset, + $total_posting_len / $num_postings_so_far); + list($doc_id, , $item) = + $this->makeItem($posting, $num_docs_or_links); + $results[$doc_id] = $item; + $num_docs_so_far += $posting_end - $next; + } while ($next >= $last && $num_docs_so_far < $len); + $next_offset = $next << 2; + return $results; + } /** * An upper bound on the number of docs or links represented by * the start and ending integer offsets into a posting list. 
@@ -1035,9 +1102,10 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @return array (int offset to next posting, doc_offset for this post) */ public function nextPostingOffsetDocOffset($start_offset, $end_offset, - $doc_offset) + $doc_offset, $forward = true) { $doc_index = $doc_offset >> 4; + $start = $start_offset >> 2; $end = $end_offset >> 2; $post_doc_index = $this->getDocIndexOfPostingAtOffset($end); if ($doc_index > $post_doc_index) { //fail fast @@ -1045,9 +1113,16 @@ class IndexShard extends PersistentStructure implements CrawlConstants } else if ($doc_index == $post_doc_index) { return [$end << 2, $post_doc_index << 4]; } - $current = $start_offset >> 2; - $post_doc_index = $this->gallopPostingOffsetDocOffset($current, + $current = 0; + if ($forward) { + $current = $start_offset >> 2; + $post_doc_index = $this->gallopPostingOffsetDocOffset($current, $doc_index, $end); + } else { + $current = $end_offset >> 2; + $post_doc_index = $this->gallopPostingOffsetDocOffset($current, + $doc_index, $start); + } if ($doc_index == $post_doc_index) { return [$current << 2, $post_doc_index << 4]; } @@ -1076,7 +1151,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants return [$current << 2, $post_doc_index << 4]; } } while($current <= $end); - return false; } /** * Performs a galloping search (double forward jump distance each failure @@ -1094,16 +1168,30 @@ class IndexShard extends PersistentStructure implements CrawlConstants public function gallopPostingOffsetDocOffset(&$current, $doc_index, $end) { $stride = 32; - do { - $post_doc_index = $this->getDocIndexOfPostingAtOffset($current); - if ($doc_index <= $post_doc_index) { - return $post_doc_index; - } - $current += $stride; - $stride <<= 1; - } while($current <= $end); - $current = $end; - return $post_doc_index; + if ($this->forward_direction) { + do { + $post_doc_index = $this->getDocIndexOfPostingAtOffset($current); + if ($doc_index <= $post_doc_index) { + return 
$post_doc_index; + } + $current += $stride; + $stride <<= 1; + } while($current <= $end); + $current = $end; + return $post_doc_index; + } else { + do { + $post_doc_index = $this->getDocIndexOfPostingAtOffset($current); + if ($doc_index >= $post_doc_index) { + return $post_doc_index; + } + $current -= $stride; + $stride <<= 1; + } while($current >= $end); + $current = $end; + return $post_doc_index; + } + } /** * Given an offset of a posting into the word_docs string, looks up @@ -1112,7 +1200,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @param int $offset byte/char offset into the word_docs string * @return int a document byte/char offset into the doc_infos string */ - public function docOffsetFromPostingOffset($offset) { + public function docOffsetFromPostingOffset($offset, $forward=true) { + $this->forward_direction = $forward; $doc_index = $this->getDocIndexOfPostingAtOffset($offset >> 2); return ($doc_index << 4); } @@ -1133,8 +1222,15 @@ class IndexShard extends PersistentStructure implements CrawlConstants if ($info !== false) { list($first_offset, $last_offset, $num_docs_or_links) = $info; - $results = $this->getPostingsSlice($first_offset, - $first_offset, $last_offset, $len); + if ($this->forward_direction) { + $results = $this->getPostingsSlice($first_offset, + $first_offset, $last_offset, $len); + } + else { + $results = $this->getPostingsSlice($first_offset, + $last_offset, $last_offset, $len); + } + } return $results; } @@ -1397,13 +1493,15 @@ class IndexShard extends PersistentStructure implements CrawlConstants } else if ($offset == self::NEEDS_OFFSET_FLAG && $missing_count < 100) { crawlLog("Index Shard Document:" . toHexString($id) . - " still needs offset"); + " still needs offset"); $missing_count++; } else if ($offset == self::NEEDS_OFFSET_FLAG && $missing_count == 100) { crawlLog("Index Shard: too many docs still need offset, " . 
"not logging rest"); $missing_count++; + } else { + crawlLog("Still wrong"); } } } @@ -1483,7 +1581,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants public function saveWithoutDictionary($with_logging = false) { $this->getShardHeader(true); - if($with_logging) { + if ($with_logging) { crawlLog("Opening without dictionary version of shard to write..."); } $fh = fopen($this->filename . "-tmp", "wb"); @@ -1497,7 +1595,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants $this->len_all_docs, $this->len_all_link_docs); fwrite($fh, $header); - if($with_logging) { + if ($with_logging) { crawlLog("..without dictionary version of shard header written"); } if (!$this->read_only_from_disk) { @@ -1513,7 +1611,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants $offset += $len; $remaining -= $len; } - if($with_logging) { + if ($with_logging) { crawlLog("..without dictionary version of shard word docs written"); } $remaining = $this->docids_len; @@ -1525,7 +1623,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants $offset += $len; $remaining -= $len; } - if($with_logging) { + if ($with_logging) { crawlLog("..without dictionary version of shard doc infos written"); } fclose($fh); @@ -1536,7 +1634,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants unlink($this->filename); rename($this->filename . 
"-tmp", $this->filename); } - if($with_logging) { + if ($with_logging) { crawlLog("done replacing version of shard."); } } @@ -1798,7 +1896,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants } /** * Reads 32 bit word as an unsigned int from the offset given in the - * word_docs string in the sahrd + * word_docs string in the shard * @param int $offset a byte offset into the word_docs string */ public function getWordDocsWord($offset) @@ -2090,4 +2188,4 @@ class IndexShard extends PersistentStructure implements CrawlConstants substr($value, self::WORD_KEY_LEN, self::WORD_DATA_LEN); } -} +} \ No newline at end of file diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php index 09ab43367..078aad92b 100644 --- a/src/library/VersionFunctions.php +++ b/src/library/VersionFunctions.php @@ -1850,3 +1850,14 @@ function upgradeDatabaseVersion67(&$db) $db->execute("ALTER TABLE SUBSEARCH ADD COLUMN " . "DEFAULT_QUERY VARCHAR(" . C\TITLE_LEN . ") DEFAULT ''"); } +/** + * Upgrades a Version 67 version of the Yioop database to a Version 68 version + * @param object $db datasource to use to upgrade. 
+ */ +function upgradeDatabaseVersion68(&$db) +{ + $db->execute("DELETE FROM MIX_COMPONENTS WHERE MIX_TIMESTAMP=4 + AND GROUP_ID=0"); + $db->execute("INSERT INTO MIX_COMPONENTS VALUES( + 4, 0, 13, 1, 'media:news')"); +} diff --git a/src/library/index_bundle_iterators/DisjointIterator.php b/src/library/index_bundle_iterators/DisjointIterator.php index 3ffab0f76..abff878c4 100644 --- a/src/library/index_bundle_iterators/DisjointIterator.php +++ b/src/library/index_bundle_iterators/DisjointIterator.php @@ -162,8 +162,12 @@ class DisjointIterator extends IndexBundleIterator } else if ($cur_gen_doc_offset == -1) { continue; } + $forward = true; + if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) { + $forward = false; + } $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset, - $least_gen_offset); + $least_gen_offset, $forward); if ($gen_doc_cmp < 0) { $least_gen_offset = $cur_gen_doc_offset; $this->least_offset_index = $i; @@ -187,8 +191,12 @@ class DisjointIterator extends IndexBundleIterator for ($i = 0; $i < $this->num_iterators; $i++) { $cur_gen_doc_offset = $this->index_bundle_iterators[ $i]->currentGenDocOffsetWithWord(); + $forward = true; + if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) { + $forward = false; + } if ($this->genDocOffsetCmp($cur_gen_doc_offset, - $gen_doc_offset) < 0) { + $gen_doc_offset, $forward) < 0) { if ($no_change) { $this->current_block_fresh = false; $this->seen_docs += 1; diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php index 185ad5ec4..74fb5ff34 100644 --- a/src/library/index_bundle_iterators/IndexBundleIterator.php +++ b/src/library/index_bundle_iterators/IndexBundleIterator.php @@ -149,19 +149,34 @@ abstract class IndexBundleIterator implements CrawlConstants * @param array $gen_doc2 second ordered pair * @return int -1,0,1 depending on which is bigger */ - public function genDocOffsetCmp($gen_doc1, $gen_doc2) + 
public function genDocOffsetCmp($gen_doc1, $gen_doc2, $forward=true) { - //less generation or greater - if ($gen_doc1[0] < $gen_doc2[0]) { - return -1; - } else if ($gen_doc1[0] > $gen_doc2[0]) { - return 1; - } - //less offset or greater - if ($gen_doc1[1] < $gen_doc2[1]) { - return -1; - } else if ($gen_doc1[1] > $gen_doc2[1]) { - return 1; + if ($forward) { + //less generation or greater + if ($gen_doc1[0] < $gen_doc2[0]) { + return -1; + } else if ($gen_doc1[0] > $gen_doc2[0]) { + return 1; + } + //less offset or greater + if ($gen_doc1[1] < $gen_doc2[1]) { + return -1; + } else if ($gen_doc1[1] > $gen_doc2[1]) { + return 1; + } + } else if (!$forward) { + //less generation or greater for reverse + if ($gen_doc1[0] < $gen_doc2[0]) { + return 1; + } else if ($gen_doc1[0] > $gen_doc2[0]) { + return -1; + } + //less offset or greater for reverse + if ($gen_doc1[1] < $gen_doc2[1]) { + return 1; + } else if ($gen_doc1[1] > $gen_doc2[1]) { + return -1; + } } //equal return 0; diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index 7c67f17d1..29d7316fc 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -31,6 +31,7 @@ namespace seekquarry\yioop\library\index_bundle_iterators; use seekquarry\yioop\configs as C; +use seekquarry\yioop\library as L; /** * Used to iterate over the documents which occur in all of a set of @@ -179,6 +180,7 @@ class IntersectIterator extends IndexBundleIterator return -1; } //next we finish computing BM25F + $retrieve_postings_time = microtime(true); $docs = $this->index_bundle_iterators[0]->currentDocsWithWord(); $weight = $this->weight; if (is_array($docs) && count($docs) == 1) { @@ -414,7 +416,12 @@ class IntersectIterator extends IndexBundleIterator } $gen_doc_offset[0] = $biggest_gen_offset; $all_same = true; + $forward = true; for ($i = 1; $i < $this->num_iterators; $i++) { + 
if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) { + $forward = false; + } + $retrieve_postings_time = microtime(true); if ((($cur_gen_doc_offset = $this->index_bundle_iterators[ $i]->currentGenDocOffsetWithWord()) == -1) || time() > $time_out) { @@ -422,7 +429,7 @@ class IntersectIterator extends IndexBundleIterator } $gen_doc_offset[$i] = $cur_gen_doc_offset; $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset, - $biggest_gen_offset); + $biggest_gen_offset, $forward); if ($gen_doc_cmp > 0) { $biggest_gen_offset = $cur_gen_doc_offset; $all_same = false; @@ -435,12 +442,13 @@ class IntersectIterator extends IndexBundleIterator } $last_changed = -1; $i = 0; + $j = 0; while($i != $last_changed) { if (time() > $time_out) { return -1; } if ($this->genDocOffsetCmp($gen_doc_offset[$i], - $biggest_gen_offset) < 0) { + $biggest_gen_offset, $forward) < 0) { $iterator = $this->index_bundle_iterators[$i]; $iterator->advance($biggest_gen_offset); if( ($cur_gen_doc_offset = @@ -449,7 +457,7 @@ class IntersectIterator extends IndexBundleIterator } $gen_doc_offset[$i] = $cur_gen_doc_offset; if ($this->genDocOffsetCmp($cur_gen_doc_offset, - $biggest_gen_offset) > 0) { + $biggest_gen_offset, $forward) > 0) { $last_changed = $i; $biggest_gen_offset = $cur_gen_doc_offset; } @@ -459,6 +467,7 @@ class IntersectIterator extends IndexBundleIterator $i = 0; $last_changed = max($last_changed, 0); } + $j++; } return 1; } diff --git a/src/library/index_bundle_iterators/ReverseIterator.php b/src/library/index_bundle_iterators/ReverseIterator.php new file mode 100644 index 000000000..5436146da --- /dev/null +++ b/src/library/index_bundle_iterators/ReverseIterator.php @@ -0,0 +1,543 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU 
General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @license https://www.gnu.org/licenses/ GPL3 + * @link https://www.seekquarry.com/ + * @copyright 2009 - 2019 + * @filesource + */ +namespace seekquarry\yioop\library\index_bundle_iterators; + +use seekquarry\yioop\configs as C; +use seekquarry\yioop\library as L; +use seekquarry\yioop\library\IndexShard; +use seekquarry\yioop\library\IndexManager; + +/** + * Used to iterate through the documents associated with a word in + * an IndexArchiveBundle. It also makes it easy to get the summaries + * of these documents. + * + * A description of how words and the documents containing them are stored + * is given in the documentation of IndexArchiveBundle. 
+ * + * @author Chris Pollett and Tim Chow + * @see IndexArchiveBundle + */ +class ReverseIterator extends IndexBundleIterator +{ + /** + * hash of word or phrase that the iterator iterates over + * @var string + */ + public $word_key; + /** + * Position from end of key that doesn't have to be an exact match + * (for phrases as using suffix tree) + * @var int + */ + public $shift; + /** + * The timestamp of the index is associated with this iterator + * @var string + */ + public $index_name; + /** + * First shard generation that word info was obtained for + * @var int + */ + public $start_generation; + /** + * Used to keep track of whether getWordInfo might still get more + * data on the search terms as advance generations + * @var bool + */ + public $no_more_generations; + /** + * The next byte offset in the IndexShard + * @var int + */ + public $next_offset; + /** + * An array of shard generation and posting list offsets, lengths, and + * numbers of documents + * @var array + */ + public $dictionary_info; + /** + * File name (including path) of the feed shard for news items + * @var string + */ + public $feed_shard_name; + /** + * Structure used to hold posting list start and stops for the query + * in the feed shard + * @var array + */ + public $feed_info; + /** + * The total number of shards that have data for this word + * @var int + */ + public $num_generations; + /** + * Index into dictionary_info corresponding to the current shard + * @var int + */ + public $generation_pointer; + /** + * Numeric number of current shard + * @var int + */ + public $current_generation; + /** + * The current byte offset in the IndexShard + * @var int + */ + public $current_offset; + /** + * Starting Offset of word occurence in the IndexShard + * @var int + */ + public $start_offset; + /** + * Last Offset of word occurence in the IndexShard + * @var int + */ + public $last_offset; + /** + * Keeps track of whether the word_iterator list is empty because the + * word does not 
appear in the index shard + * @var int + */ + public $empty; + /** + * Keeps track of whether the word_iterator list is empty because the + * word does not appear in the index shard + * @var int + */ + public $filter; + /** + * The current value of the doc_offset of current posting if known + * @var int + */ + public $current_doc_offset; + /** Host Key position + 1 (first char says doc, inlink or eternal link)*/ + const HOST_KEY_POS = 17; + /** Length of a doc key*/ + const KEY_LEN = 8; + /** If the $limit_feeds constructor input is true then limit the number + * of items coming from the feed shard to this count. + */ + const LIMIT_FEEDS_COUNT = 25; + /** + * Creates a word iterator with the given parameters. + * + * @param string $word_key hash of word or phrase to iterate docs of + * @param string $shift up to what point in key should be a match + * when do dictionary look up (for phrases because using suffix tree) + * @param string $index_name time_stamp of the to use + * @param bool $raw whether the $word_key is our variant of base64 encoded + * @param array $filter an array of hashes of domains to filter from + * results + * @param int $results_per_block the maximum number of results that can + * be returned by a findDocsWithWord call + * @param bool $limit_feeds feed results appear before all others when + * gotten out of this iterator (may be reordered later). 
This flag + * controls whether an upper bound of self::LIMIT_FEEDS_COUNT is + * imposed on the number of feed results returned + */ + public function __construct($word_key, $shift, $index_name, $raw = false, + $filter = null, + $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK, + $limit_feeds = false) + { + if ($raw == false) { + //get rid of out modified base64 encoding + $word_key = L\unbase64Hash($word_key); + } + $this->filter = $filter; + $this->word_key = $word_key; + $this->shift = $shift; + // 13 is somewhat of a magic number right now + if($index_name == 13) { + $index_name = "NewsFeed"; + } + $this->index_name = $index_name; + list($estimated_total, $this->dictionary_info) = + IndexManager::getWordInfo($index_name, $word_key, $shift, + -1, -1, C\NUM_DISTINCT_GENERATIONS, true); + $this->feed_shard_name = C\WORK_DIRECTORY . "/feeds/index"; + if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) + && file_exists($this->feed_shard_name)) { + $this->use_feeds = true; + } else { + $this->use_feeds = false; + } + if ($this->use_feeds) { + if (!isset($this->dictionary_info[-1])) { + $this->feed_info = false; + $this->feed_empty = true; + } else { + $this->feed_info = $this->dictionary_info[-1]; + unset($this->dictionary_info[-1]); + $this->feed_empty = false; + } + } else { + $this->feed_info = false; + $this->feed_empty = true; + } + if (is_array($this->feed_info)) { + list(,$this->feed_start, $this->feed_end, $this->feed_count,) = + $this->feed_info; + $this->feed_info = [$this->feed_start, $this->feed_end, + $this->feed_count]; + } else { + $this->feed_start = 0; + $this->feed_end = 0; + $this->feed_count = 0; + } + if ($this->feed_count > 0) { + $this->using_feeds = true; + } else { + $this->using_feeds = false; + } + if ($limit_feeds && $this->feed_count > self::LIMIT_FEEDS_COUNT) { + $this->feed_count = self::LIMIT_FEEDS_COUNT; + $this->feed_end = $this->feed_start + + IndexShard::POSTING_LEN * (self::LIMIT_FEEDS_COUNT - 1); + } + $this->num_docs = 
$this->feed_count + $estimated_total; + if ($this->dictionary_info === false) { + $this->empty = true; + } else { + ksort($this->dictionary_info); + $this->dictionary_info = array_values($this->dictionary_info); + $this->num_generations = count($this->dictionary_info); + if ($this->num_generations == 0) { + $this->empty = true; + } else { + $this->empty = false; + } + } + $this->no_more_generations = + ($this->num_generations < C\NUM_DISTINCT_GENERATIONS); + $this->current_doc_offset = null; + $this->results_per_block = $results_per_block; + $this->current_block_fresh = false; + $this->start_generation = $this->num_generations-1; + if ($this->dictionary_info !== false || $this->feed_info !== false) { + $this->reset(); + } + } + /** + * Resets the iterator to the first document block that it could iterate + * over + * Reversed + */ + public function reset() + { + if ($this->feed_count > 0) { + $this->using_feeds = true; + } else { + $this->using_feeds = false; + } + $no_feeds = $this->feed_empty || !$this->use_feeds; + if (!$this->empty) {//we shouldn't be called when empty - but to be safe + if ($this->start_generation < $this->num_generations-1) { + list($estimated_total, $this->dictionary_info) = + IndexManager::getWordInfo($this->index_name, + $this->word_key, 0, -1, 0, C\NUM_DISTINCT_GENERATIONS, + true); + $this->num_docs = $this->feed_count + $estimated_total; + ksort($this->dictionary_info); + $this->dictionary_info = array_values($this->dictionary_info); + $this->num_generations = count($this->dictionary_info); + $this->no_more_generations = + ($this->num_generations < C\NUM_DISTINCT_GENERATIONS); + } + list($this->current_generation, $this->start_offset, + $this->last_offset, ) + = $this->dictionary_info[$this->num_generations-1]; + # if the feed isn't empty + } else { + $this->start_offset = 0; + $this->last_offset = -1; + $this->num_generations = -1; + } + $this->current_offset = $this->last_offset; + // reset pointer to the number of gens, which in 
reverse is the first one we want + $this->generation_pointer = $this->num_generations-1; + $this->count_block = 0; + $this->seen_docs = 0; + $this->current_doc_offset = null; + } + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + public function findDocsWithWord() + { + if ($this->empty) { + return -1; + } + if (($this->generation_pointer>=$this->num_generations) + || ($this->generation_pointer == 0 && + $this->current_offset < $this->start_offset)) { + return -1; + } + $pre_results = []; + if (!$this->empty) { + $this->next_offset = $this->current_offset; + $index = IndexManager::getIndex($this->index_name, false); + $index->setCurrentShard($this->current_generation, true); + //the next call also updates next offset + $shard = $index->getCurrentShard(false, false); + $pre_results = $shard->getPostingsSlice( + $this->start_offset, + $this->next_offset, $this->last_offset, + $this->results_per_block, false); + if($this->index_name == "NewsFeed") { + $time = time(); + foreach ($pre_results as $keys => $pre_result) { + $page = $index->getPage($pre_result[self::SUMMARY_OFFSET], + $this->current_generation); + $delta = $time - $page[self::PUBDATE]; + $pre_results[$keys][self::DOC_RANK] = 720000 / + max($delta, 1); + } + } + } + $results = []; + $doc_key_len = IndexShard::DOC_KEY_LEN; + foreach ($pre_results as $keys => $data) { + $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN); + if (!empty($this->filter) && $this->filter->isFiltered($host_key)) { + continue; + } + $data[self::KEY] = $keys; + // inlinks is the domain of the inlink + $key_parts = str_split($keys, $doc_key_len); + if (isset($key_parts[2])) { + list($hash_url, $data[self::HASH], $data[self::INLINKS]) = + $key_parts; + } else { + continue; + } + if (!empty($data[self::IS_FEED])) { + $data[self::CRAWL_TIME] = "feed"; + } else { + $data[self::CRAWL_TIME] 
= $this->index_name; + } + $results[$keys] = $data; + } + $this->count_block = count($results); + if ($this->generation_pointer == $this->num_generations - 1 && + $results == []) { + $results = null; + } + $this->pages = $results; + return $results; + } + /** + * Updates the seen_docs count during an advance() call + * For a reverse shard, instead of adding to the offset, we subtract by a block instead. + */ + public function advanceSeenDocs() + { + if ($this->current_block_fresh != true) { + $total_guess = IndexShard::numDocsOrLinks($this->next_offset, + $this->start_offset); + $num_docs = $total_guess % $this->results_per_block; + if ($num_docs == 0) { + $num_docs = $this->results_per_block; + } else { + $num_docs = IndexShard::numDocsOrLinks($this->start_offset, + $this->last_offset)%$this->results_per_block; + } + $this->next_offset = $this->current_offset; + $this->next_offset -= IndexShard::POSTING_LEN * $num_docs; + if ($num_docs <= 0) { + return; + } + } else { + $num_docs = $this->count_block; + } + $this->current_block_fresh = false; + $this->seen_docs += $num_docs; + } + /** + * Forwards the iterator one group of docs + * @param array $gen_doc_offset a generation, doc_offset pair. 
If set, + * the must be of greater than or equal generation, and if equal the + * next block must all have $doc_offsets larger than or equal to + * this value + */ + public function advance($gen_doc_offset = null) + { + if ($gen_doc_offset == null) { + $this->plainAdvance(); + return; + } + $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord(); + if ($cur_gen_doc_offset == -1 || + $this->genDocOffsetCmp($cur_gen_doc_offset, + $gen_doc_offset) < 0) { + return; + } + $this->plainAdvance(); + if ($this->current_generation > $gen_doc_offset[0]) { + $this->advanceGeneration($gen_doc_offset[0]); + $this->next_offset = $this->current_offset; + } + $using_feeds = $this->using_feeds && $this->use_feeds; + if ($using_feeds) { + $shard = IndexManager::getIndex("feed"); + $last = $this->feed_end; + } else { + $index = IndexManager::getIndex($this->index_name, false); + $index->setCurrentShard($this->current_generation, true); + $shard = $index->getCurrentShard(false, false); + $start = $this->start_offset; + } + if ($this->current_generation == $gen_doc_offset[0]) { + $offset_pair = $shard->nextPostingOffsetDocOffset( + $start, $this->next_offset, $gen_doc_offset[1], false); + if ($offset_pair === false) { + $this->advanceGeneration(); + $this->next_offset = $this->current_offset; + } else { + list($this->current_offset, $this->current_doc_offset) = + $offset_pair; + } + } + $this->seen_docs = 0; + $this->seen_docs += ($this->current_offset - $this->start_offset) / + IndexShard::POSTING_LEN; + } + /** + * Forwards the iterator one group of docs. 
This is what's called + * by @see advance($gen_doc_offset) if $gen_doc_offset is null + * Reversed + */ + public function plainAdvance() + { + $this->advanceSeenDocs(); + $this->current_doc_offset = null; + # RC if the current offset is greater than the next + if ($this->current_offset > $this->next_offset) { + $this->current_offset = $this->next_offset; + } else { + $this->advanceGeneration(); + $this->next_offset = $this->current_offset; + } + # if the current offset is smaller, then we need to get next + # generation + if ($this->current_offset < $this->start_offset) { + $this->advanceGeneration(); + $this->next_offset = $this->current_offset; + } + } + /** + * Switches which index shard is being used to return occurrences of + * the word to the next shard containing the word + * Reversed + * + * @param int $generation generation to advance beyond + */ + public function advanceGeneration($generation = null) + { + if ($this->using_feeds && $this->use_feeds) { + $this->using_feeds = false; + $this->generation_pointer = -1; + } + if ($generation === null) { + $generation = $this->current_generation; + } + do { + # RC if the pointer is greater than the total generations, subtract + if ($this->generation_pointer >= 0) { + $this->generation_pointer--; + } + # RC if the generation pointer is still more than the number of generations + if ($this->generation_pointer >= 0) { + list($this->current_generation, $this->start_offset, + $this->last_offset, ) + = $this->dictionary_info[$this->generation_pointer]; + #set the current offset to the last one of the dictionary + $this->current_offset = $this->last_offset; + } + # if there are more generations and + if (!$this->no_more_generations && + $this->current_generation > $generation && + $this->generation_pointer <= 0) { + list($estimated_remaining_total, $info) = + IndexManager::getWordInfo($this->index_name, + $this->word_key, 0, -1, $this->num_generations, + C\NUM_DISTINCT_GENERATIONS, true); + if (count($info) > 0) { + 
$this->num_docs = $this->seen_docs + + $estimated_remaining_total; + ksort($info); + $this->dictionary_info = array_merge($this->dictionary_info, + array_values($info)); + $this->num_generations = count($this->dictionary_info); + $this->no_more_generations = + count($info) < C\NUM_DISTINCT_GENERATIONS; + //will increment back to where were next loop + $this->generation_pointer++; + } + } + # while the current generation is greater than supplied argument + } while($this->current_generation > $generation && + # or if we haven't hit the zeroth generation + $this->generation_pointer >= 0); + } + /** + * Gets the doc_offset and generation for the next document that + * would be returned by this iterator + * + * @return mixed an array with the desired document offset + * and generation; -1 on fail + */ + public function currentGenDocOffsetWithWord() { + if ($this->current_doc_offset !== null) { + return [$this->current_generation, $this->current_doc_offset]; + } + # if the current offset is before the first one, or if gen pointer is less than 0 + # we are in an impossible position + if ($this->current_offset < $this->start_offset|| + $this->generation_pointer <= -1) { + return -1; + } + $index = IndexManager::getIndex($this->index_name); + $index->setCurrentShard($this->current_generation, true); + $this->current_doc_offset = $index->getCurrentShard( + )->docOffsetFromPostingOffset($this->current_offset, false); + return [$this->current_generation, $this->current_doc_offset]; + } +} \ No newline at end of file diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index a571286da..333dfb6df 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -180,6 +180,9 @@ class WordIterator extends IndexBundleIterator $this->filter = $filter; $this->word_key = $word_key; $this->shift = $shift; + if($index_name == 13) { + $index_name = "NewsFeed"; + } 
$this->index_name = $index_name; list($estimated_total, $this->dictionary_info) = IndexManager::getWordInfo($index_name, $word_key, $shift, @@ -544,6 +547,7 @@ class WordIterator extends IndexBundleIterator } $index = IndexManager::getIndex($this->index_name); $index->setCurrentShard($this->current_generation, true); + $index->setCurrentShard($this->current_generation, true); $this->current_doc_offset = $index->getCurrentShard( )->docOffsetFromPostingOffset($this->current_offset); return [$this->current_generation, $this->current_doc_offset]; diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php index e4fb863fc..478f4cdc6 100644 --- a/src/library/media_jobs/FeedsUpdateJob.php +++ b/src/library/media_jobs/FeedsUpdateJob.php @@ -36,6 +36,7 @@ use seekquarry\yioop\library as L; use seekquarry\yioop\library\CrawlConstants; use seekquarry\yioop\library\FetchUrl; use seekquarry\yioop\library\IndexShard; +use seekquarry\yioop\library\IndexArchiveBundle; use seekquarry\yioop\library\PhraseParser; use seekquarry\yioop\library\UrlParser; @@ -602,10 +603,16 @@ class FeedsUpdateJob extends MediaJob public function rebuildFeedShard($age) { $time = time(); - $feed_shard_name = C\WORK_DIRECTORY . "/feeds/index"; $prune_shard_name = C\WORK_DIRECTORY . "/feeds/prune_index"; + $dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name; + $info['DESCRIPTION'] = "NewsFeed"; + $info['FORWARD_DIRECTION'] = false; + $this->index_archive = new IndexArchiveBundle($dir, false, + serialize($info), C\NUM_DOCS_PER_GENERATION, false); + $this->db->setWorldPermissionsRecursive($dir); $prune_shard = new IndexShard($prune_shard_name); $too_old = $time - $age; + $num_sites = 0; if (!$prune_shard) { return false; } @@ -623,7 +630,9 @@ class FeedsUpdateJob extends MediaJob $db = $this->db; // we now rebuild the inverted index with the remaining items $sql = "SELECT * FROM FEED_ITEM WHERE PUBDATE >= ? " . 
- "ORDER BY PUBDATE DESC"; + "ORDER BY PUBDATE ASC"; + $seen_url_count = 0; + $seen_sites = []; $result = $db->execute($sql, [$too_old]); if ($result) { $completed = true; @@ -665,36 +674,68 @@ class FeedsUpdateJob extends MediaJob $meta_ids[] = "safe:false"; $meta_ids[] = "safe:all"; } - $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], + $prune_shard->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_and_qa_lists["WORD_LIST"], $meta_ids, true, false); $this->updateTrendingTermCounts($term_counts, $phrase_string, $word_and_qa_lists["WORD_LIST"], $media_category, $source_name, $lang, $item['PUBDATE']); + $seen_url_count += 1; + $page = []; + $page[self::TITLE] = $item['TITLE']; + $page[self::DESCRIPTION] = $item['DESCRIPTION']; + $page[self::URL] = $item['LINK']; + $page[self::HASH] = $item['GUID']; + $page[self::SOURCE_NAME] = $item['SOURCE_NAME']; + $page[self::IMAGE_LINK] = $item['IMAGE_LINK']; + $page[self::PUBDATE] = $item['PUBDATE']; + $seen_sites[] = $page; } unset($term_counts['seen']); $this->addTermCountsTrendingTable($db, $term_counts); } - $prune_shard->save(); + L\crawlLog("----..deleting old feed items"); + $sql = " DELETE FROM FEED_ITEM "; + $db->execute($sql); + L\crawlLog("----..done deleting old items"); + // 1. check if indexshard is full or not. if it is, new gen + $generation = $this->index_archive->initGenerationToAdd( + $prune_shard->num_docs, null); + if ($generation != -1) { + $summary_offsets = []; + if (!empty($seen_sites)) { + // 2. 
add pages, get summary_offset + $this->index_archive->addPages($generation, self::SUMMARY_OFFSET, + $seen_sites, $seen_url_count); + // keeping track of duplicates + $sql = " INSERT INTO FEED_ITEM (GUID) VALUES (?)"; + foreach ($seen_sites as $site) { + $result = $db->execute($sql, [$site[self::HASH]]); + $site_url = str_replace('|', "%7C", $site[self::URL]); + $host = UrlParser::getHost($site_url); + $raw_guid = L\unbase64Hash($site[self::HASH]); + $hash = L\crawlHash($site[self::URL], true) . + $raw_guid . "d". substr(L\crawlHash( + UrlParser::getHost($site[self::URL]) . "/", true), 1); + $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET]; + } + unset($seen_sites); + } + $prune_string = $prune_shard->save(true, true); + $tmp_shard = IndexShard::load("news" , $prune_string); + if (!empty($summary_offsets)) { + $tmp_shard->changeDocumentOffsets($summary_offsets); + $this->index_archive->addIndexData($tmp_shard); + $this->index_dirty = true; + } + $this->index_archive->stopIndexingBundle(); + } + if (file_exists($prune_shard_name)) { + unlink($prune_shard_name); + } + unset($prune_shard); set_error_handler(null); - @chmod($prune_shard_name, 0777); - @chmod($feed_shard_name, 0777); - @rename($prune_shard_name, $feed_shard_name); - @chmod($feed_shard_name, 0777); set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); - L\crawlLog("----..deleting old feed items"); - $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?"; - $db->execute($sql, [$too_old]); - $base_dir = C\APP_DIR . "/resources"; - $subfolder = L\crawlHash( - 'group' . C\PUBLIC_GROUP_ID . C\AUTH_KEY); - $prefix_folder = substr($subfolder, 0, 3); - $subfolder = "t" . 
$subfolder; - $date = date('Y-m-d', $too_old - C\ONE_DAY); - $old_folder = "$base_dir/$prefix_folder/$subfolder/$date"; - L\crawlLog("----..deleting old feed image thumb folder"); - $db->unlinkRecursive($old_folder); - L\crawlLog("----..done deleting old items"); } /** * Updates trending term counts based on the string from the current @@ -707,7 +748,7 @@ class FeedsUpdateJob extends MediaJob * @param array $word_or_phrase_list associate array of * stemmed_word_or_phrase => positions in feed item of where occurs * @param string $media_category of feed source the item case from. We - * tredning counts grouped by mmedia categort + * tredning counts grouped by media category * @param string $source_name of feed source the item case from. We exclude * from counts the name of the feed source * @param string $lang locale_tag for this feed item diff --git a/src/models/CrawlModel.php b/src/models/CrawlModel.php index 01590feb3..cc6ce9ae5 100755 --- a/src/models/CrawlModel.php +++ b/src/models/CrawlModel.php @@ -1171,14 +1171,21 @@ EOT; $dirs = glob(C\CRAWL_DIR . '/cache/{' . self::index_data_base_name . ',' . self::double_index_base_name . '}*', GLOB_ONLYDIR | GLOB_BRACE); + $feed_dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name; foreach ($dirs as $dir) { $crawl = []; - preg_match('/(' .self::index_data_base_name . - '|'. self::double_index_base_name .')(\d+)$/', $dir, $matches); - $bundle_class_name = ($matches[1][0] == 'D') ? - C\NS_LIB . "DoubleIndexBundle" : - C\NS_LIB . "IndexArchiveBundle"; - $crawl['CRAWL_TIME'] = $matches[2]; + if ($dir != $feed_dir) { + preg_match('/(' .self::index_data_base_name . + '|'. self::double_index_base_name . + ')(\d+)$/', $dir, $matches); + $bundle_class_name = ($matches[1][0] == 'D') ? + C\NS_LIB . "DoubleIndexBundle" : + C\NS_LIB . "IndexArchiveBundle"; + $crawl['CRAWL_TIME'] = $matches[2]; + } else { + $bundle_class_name = C\NS_LIB . 
"IndexArchiveBundle"; + $crawl['CRAWL_TIME'] = 13; + } $info = $bundle_class_name::getArchiveInfo($dir); if (isset($info['DESCRIPTION'])) { set_error_handler(null); diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 8e83a83e6..d90732381 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -35,6 +35,7 @@ use seekquarry\yioop\library as L; use seekquarry\yioop\library\AnalyticsManager; use seekquarry\yioop\library\IndexManager; use seekquarry\yioop\library\PhraseParser; +use seekquarry\yioop\library\IndexArchiveBundle; use seekquarry\yioop\library\index_bundle_iterators as I; /** @@ -1693,10 +1694,35 @@ class PhraseModel extends ParallelModel $distinct_key[1] : 0; $distinct_key_id = L\unbase64Hash( $distinct_key[0]); - $tmp_word_iterators[$m] = - new I\WordIterator($distinct_key_id, $shift, - $index_name, true, $filter, $to_retrieve, - $limit_feeds); + // 13 is somewhat of a magic number right now + if ($index_name == 13) { + $dir_name = C\CRAWL_DIR."/cache/" + .self::index_data_base_name.$index_name; + } else { + $dir_name = C\CRAWL_DIR."/cache/" + .self::index_data_base_name.$index_name; + } + $index = IndexManager::getIndex($index_name); + $archive_info = $index->getArchiveInfo($dir_name); + $description = unserialize($archive_info['DESCRIPTION']); + if (isset($description['FORWARD_DIRECTION'])) { + $forward_direction = $description['FORWARD_DIRECTION']; + } else { + $forward_direction = 1; + } + // will have to change index name for checking iterator + if ($forward_direction) { + $tmp_word_iterators[$m] = + new I\WordIterator($distinct_key_id, $shift, + $index_name, true, $filter, $to_retrieve, + $limit_feeds); + } + else { + $tmp_word_iterators[$m] = + new I\ReverseIterator($distinct_key_id, $shift, + $index_name, true, $filter, $to_retrieve, + $limit_feeds); + } $sum += $tmp_word_iterators[$m]->num_docs; if ($tmp_word_iterators[$m]->dictionary_info != [] || diff --git a/src/views/helpers/FeedsHelper.php 
b/src/views/helpers/FeedsHelper.php index ee12c750e..51c0cc104 100644 --- a/src/views/helpers/FeedsHelper.php +++ b/src/views/helpers/FeedsHelper.php @@ -78,7 +78,7 @@ class FeedsHelper extends Helper implements CrawlConstants $time = time(); foreach ($feed_pages as $page) { if ($not_news) { - $pub_date = $page[self::SUMMARY_OFFSET][0][4]; + $pub_date = $page[self::PUBDATE]; $encode_source = urlencode( urlencode($page[self::SOURCE_NAME])); $pub_date = $this->getPubdateString($time, $pub_date); @@ -120,7 +120,7 @@ class FeedsHelper extends Helper implements CrawlConstants $query_array = (empty($csrf_token)) ? [] : [C\CSRF_TOKEN => $csrf_token]; $delim = (C\REDIRECTS_ON) ? "?" : "&"; - $pub_date = $page[self::SUMMARY_OFFSET][0][4]; + $pub_date = $page[self::PUBDATE]; $encode_source = urlencode( urlencode($page[self::SOURCE_NAME])); $time = time(); diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php index 8df9dd3c4..124b270da 100644 --- a/tests/IndexShardTest.php +++ b/tests/IndexShardTest.php @@ -36,6 +36,9 @@ use seekquarry\yioop\library\CrawlConstants; use seekquarry\yioop\library\IndexShard; use seekquarry\yioop\library\LinearAlgebra as LA; use seekquarry\yioop\library\UnitTest; +use seekquarry\yioop\library\index_bundle_iterators\WordIterator; +use seekquarry\yioop\library\index_bundle_iterators\ReverseIterator; +use seekquarry\yioop\library\IndexManager; /** * Used to test that the IndexShard class can properly add new documents @@ -57,6 +60,8 @@ class IndexShardTest extends UnitTest "/shard2.txt", 0); $this->test_objects['shard3'] = new IndexShard(C\WORK_DIRECTORY. "/shard3.txt", 0); + $this->test_objects['shard4'] = new IndexShard(C\WORK_DIRECTORY. 
+ "/shard4.txt", 0, C\NUM_DOCS_PER_GENERATION, false, false); } /** * Deletes any index shard files we may have created @@ -67,6 +72,7 @@ class IndexShardTest extends UnitTest @unlink(C\WORK_DIRECTORY."/shard.txt"); @unlink(C\WORK_DIRECTORY."/shard2.txt"); @unlink(C\WORK_DIRECTORY."/shard3.txt"); + @unlink(C\WORK_DIRECTORY."/shard4.txt"); set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); } /** @@ -144,6 +150,232 @@ class IndexShardTest extends UnitTest $this->assertEqual(count($c_data), 2, "Doc lookup by meta word works has correct count"); } + /** + * Check if can store documents into a reverse index shard and retrieve them + * Shard is just a normal regular IndexShard, while Shard4 sets the additional + * flag which makes everything go in reverse + */ + public function addDocumentsGetPostingsSliceReverseTestCase() + { + $docid = "AAAAAAAA"; + $doc_hash = "BBBBBBBB"; + $doc_hosts_url = "CCCCCCCC"; + $docid .= $doc_hash . $doc_hosts_url; + $offset = 5; + $word_counts = [ + 'BBBBBBBB' => [1, 3], + 'CCCCCCCC' => [4, 9, 16], + 'DDDDDDDD' => [5, 25, 125], + ]; + $meta_ids = ["EEEEEEEE", "FFFFFFFF"]; + $this->test_objects['shard']->addDocumentWords($docid, + $offset, $word_counts, $meta_ids, ["EEEEEEEE"], true); + $this->test_objects['shard4']->addDocumentWords($docid, + $offset, $word_counts, $meta_ids, ["EEEEEEEE"], true); + $this->assertEqual($this->test_objects['shard']->len_all_docs, 8, + "Len All Docs Correctly Counts Length of First Doc"); + // add a second document and check + $docid = "HHHHHHHH"; + $doc_hash = "IIIIIIII"; + $doc_hosts_url = "JJJJJJJJ"; + $docid .= $doc_hash. 
$doc_hosts_url; + $offset = 7; + $word_counts = [ + 'CCCCCCCC' => [1, 4, 9], + 'GGGGGGGG' => [6], + ]; + $meta_ids = ["YYYYYYYY"]; + $this->test_objects['shard']->addDocumentWords($docid, + $offset, $word_counts, $meta_ids, ['FFFFFFFF'], true); + $this->test_objects['shard4']->addDocumentWords($docid, + $offset, $word_counts, $meta_ids, ['FFFFFFFF'], true); + // add a third document + $docid = "ABABABAB"; + $doc_hash = "IJIJIJIJ"; + $doc_hosts_url = "KLKLKLKL"; + $docid .= $doc_hash. $doc_hosts_url; + $offset = 50; + $word_counts = [ + 'the' => [1,9,12,17,19,42,52,95,103], + 'mineral' => [2], + 'known' => [3], + 'as' => [4,132], + 'kryptonite' => [5,32,74,112,114,129], + 'was' => [6,33,55,86,121,130], + 'introduced' => [7,34,131], + 'in' => [8,16,24,58,91,102], + 'radio' => [10,53], + 'serial' => [11,54], + 'adventures' => [13], + 'of' => [14], + 'superman' => [15,69,107,137], + 'story' => [18,29], + 'meteor' => [20,104], + 'from' => [21,105], + 'krypton' => [22,106], + 'broadcast' => [23], + 'june' => [25], + '19433' => [26], + 'an' => [27,59], + 'apocryphal' => [28], + 'claims' => [30], + 'that' => [31,120], + 'to' => [35,44,67,111,117,138], + 'give' => [36], + 'supermans' => [37], + 'voice' => [38,78], + 'actor' => [39,79], + 'bud' => [40], + 'collyer' => [41,62,116], + 'possibility' => [43], + 'take' => [45,118], + 'a' => [46,49,76,122,133], + 'vacation' => [47], + 'at' => [48], + 'time' => [50], + 'when' => [51], + 'performed' => [56], + 'live' => [57], + 'episode' => [60], + 'where' => [61], + 'would' => [63,70,80], + 'not' => [64], + 'be' => [65,71], + 'present' => [66], + 'perform' => [68], + 'incapacitated' => [72], + 'by' => [73,88], + 'and' => [75], + 'substitute' => [77], + 'make' => [81], + 'groaning' => [82], + 'sounds' => [83], + 'this' => [84,101], + 'tale' => [85], + 'recounted' => [87], + 'julius' => [89], + 'schwartz' => [90], + 'his' => [92,140], + 'memoir4' => [93], + 'however' => [94], + 'historian' => [96], + 'michael' => [97], + 'j' => [98], 
+ 'hayde' => [99], + 'disputes' => [100], + 'is' => [108], + 'never' => [109], + 'exposed' => [110], + 'if' => [113], + 'allowed' => [115], + 'vacations' => [119], + 'fringe' => [123], + 'benefit' => [124], + 'discovered' => [125], + 'later' => [126], + 'more' => [127], + 'likely' => [128], + 'plot' => [134], + 'device' => [135], + 'for' => [136], + 'discover' => [139], + 'origin' => [141], + ]; + $meta_ids = ["ZZZZZZZZ"]; + $this->test_objects['shard']->addDocumentWords($docid, + $offset, $word_counts, $meta_ids, ['GGGGGGGG'], true); + $this->test_objects['shard4']->addDocumentWords($docid, + $offset, $word_counts, $meta_ids, ['GGGGGGGG'], true); + $forward = $this->test_objects['shard']->getPostingsSliceById( + L\crawlHashWord('the', true), 5); + $this->assertTrue(isset($forward[$docid]), + "Doc lookup by word works for shard"); + $backward = $this->test_objects['shard4']->getPostingsSliceById( + L\crawlHashWord('the', true), 5); + $this->assertTrue(isset($backward[$docid]), + "Doc lookup by word works for shard4"); + $this->assertEqual($forward, $backward, + "Both only have one document with this word"); + $info = $this->test_objects['shard']->getWordInfo( + L\crawlHashWord('CCCCCCCC', true), true); + list($first_offset, $last_offset, + $num_docs_or_links) = $info; + $this->assertEqual($first_offset, 36, + "First offset set correctly"); + $this->assertEqual($last_offset, 40, + "Second offset set correctly"); + $forward = $this->test_objects['shard']->nextPostingOffsetDocOffset($first_offset, $last_offset, 5); + //print_r($forward); + $backward = $this->test_objects['shard4']->nextPostingOffsetDocOffset($first_offset, $last_offset, 5); + //print_r($backward); + $forward = $this->test_objects['shard']->getPostingsSlice($first_offset, + $first_offset, $last_offset, 5); + # have to reset offset values, since getPostingsSlice modifies by ref + $info = $this->test_objects['shard4']->getWordInfo( + L\crawlHashWord('CCCCCCCC', true), true); + list($first_offset, 
$last_offset, + $num_docs_or_links) = $info; + $backward = $this->test_objects['shard4']->getPostingsSlice($first_offset, + $last_offset, $last_offset, 5, false); + $reversed = array_reverse($backward); + $this->assertEqual($forward, $backward, + "ReverseIndexShard returns a flipped version off a forward one"); + $word = "media:news"; + list($hash_key, $shift) = L\allCrawlHashPaths($word, true)[0]; + $index_name = 1573453725; + $index_name = 1575422839; + $index_archive_name = "IndexData" . $index_name; + $index_archive_name = "IndexDataNewsFeed"; + $index_name = "NewsFeed"; + $results_limit = 200; + $total_results = 0; + if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) { + $info = IndexManager::getWordInfo($index_name, $hash_key, $shift, -1, 0, -1); + $this->assertTrue(isset($info[0][4])); + $forward = []; + if (isset($info[0][4])) { + $word_iterator = new WordIterator($info[0][4], 0, $index_name, true, null, $results_limit); + // $norm_docs = $word_iterator->findDocsWithWord(); + $forward_offsets = []; + $offset = $word_iterator->currentGenDocOffsetWithWord(); + array_push($forward_offsets, $offset); + while($offset != -1){ + $word_iterator->advance(); + $offset = $word_iterator->currentGenDocOffsetWithWord(); + array_push($forward_offsets, $offset); + } + foreach ($norm_docs as $k => $v) { + $item['bn'] = $v['bn']; + $item['U'] = $v['U']; + $forward[] = $item; + } + $for_results = count($forward_offsets); + } + $backward = []; + $info = IndexManager::getWordInfo($index_name, $hash_key, $shift, -1, 0, -1); + $this->assertTrue(isset($info[0][4])); + if (isset($info[0][4])) { + $word_rev_iterator = new ReverseIterator($info[0][4], 0, $index_name, true, null, $results_limit); + // $rev_docs = $word_rev_iterator->findDocsWithWord(); + $backward_offsets = []; + $offset = $word_rev_iterator->currentGenDocOffsetWithWord(); + array_push($backward_offsets, $offset); + while($offset != -1){ + $word_rev_iterator->advance(); + $offset = 
$word_rev_iterator->currentGenDocOffsetWithWord(); + array_push($backward_offsets, $offset); + } + $reversed = array_reverse($backward_offsets); + foreach ($rev_docs as $k => $v) { + $item['bn'] = $v['bn']; + $item['U'] = $v['U']; + $backward[] = $item; + } + $backward = array_reverse($backward); + $back_results = count($reversed); + } + } + } /** * Check if can store link documents into an index shard and retrieve them */ @@ -568,4 +800,4 @@ class IndexShardTest extends UnitTest $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Save without dictionary test works"); } -} +} \ No newline at end of file