diff --git a/src/configs/Config.php b/src/configs/Config.php index 895273dc7..b04b9bdfe 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -158,7 +158,7 @@ function nsconddefine($constant, $value) * Version number for upgrade database function * @var int */ -nsdefine('DATABASE_VERSION', 67); +nsdefine('DATABASE_VERSION', 68); /** * Minimum Version fo Yioop for which keyword ad script * still works with this version @@ -766,6 +766,12 @@ nsdefine('PAGE_RANGE_REQUEST', 50000); * how many distinct generations to read in in one go */ nsconddefine('NUM_DISTINCT_GENERATIONS', 20); +/** + * Used in computing the DOC_RANK when going through an index in descending + * fashion. It represents an upper bound on the maximum number of + * generations an IndexArchiveBundle should have + */ +nsconddefine('MAX_GENERATIONS', 10000); /** * Max number of chars to extract for description from a page to index. * Only words in the description are indexed. -- this is the default value diff --git a/src/configs/Createdb.php b/src/configs/Createdb.php index 897333a13..161550c53 100755 --- a/src/configs/Createdb.php +++ b/src/configs/Createdb.php @@ -483,14 +483,14 @@ foreach ($media_sources as $media_source) { $db->execute("INSERT INTO CRAWL_MIXES VALUES (2, 'images', ".ROOT_ID.", -1)"); $db->execute("INSERT INTO MIX_FRAGMENTS VALUES(2, 0, 1)"); $db->execute("INSERT INTO MIX_COMPONENTS VALUES( - 2, 0, 1, 1, 'media:image')"); + 2, 0, 1, 1, 1, 'media:image')"); $db->execute("INSERT INTO CRAWL_MIXES VALUES (3, 'videos', ".ROOT_ID.", -1)"); $db->execute("INSERT INTO MIX_FRAGMENTS VALUES(3, 0, 1)"); $db->execute("INSERT INTO MIX_COMPONENTS VALUES( - 3, 0, 1, 1, 'media:video')"); + 3, 0, 1, 1, 1, 'media:video')"); $db->execute("INSERT INTO CRAWL_MIXES VALUES (4, 'news', ".ROOT_ID.", -1)"); $db->execute("INSERT INTO MIX_FRAGMENTS VALUES(4, 0, 1)"); -$db->execute("INSERT INTO MIX_COMPONENTS VALUES(4, 0, 100, 1, +$db->execute("INSERT INTO MIX_COMPONENTS VALUES(4, 0, 100, 1, -1, 
'media:news')"); $db->execute("INSERT INTO SUBSEARCH VALUES('db_subsearch_images', 'images','m:2', 50, '')"); diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index a2a9347e0..0ecc9e333 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -339,6 +339,12 @@ class CrawlComponent extends Component implements CrawlConstants 'social_component_weight:"'. tl('social_component_weight').'",'. 'social_component_name:"'.tl('social_component_name').'",'. + 'social_component_order:"'. + tl('social_component_order').'",'. + 'social_component_ascending:"'. + tl('social_component_ascending').'",'. + 'social_component_descending:"'. + tl('social_component_descending').'",'. 'social_component_add_keywords:"'. tl('social_component_add_keywords').'",'. 'social_component_actions:"'. @@ -379,6 +385,9 @@ class CrawlComponent extends Component implements CrawlConstants $component['CRAWL_TIMESTAMP'], "int"); $row['WEIGHT'] = $parent->clean( $component['WEIGHT'], "float"); + $row['DIRECTION'] = ($parent->clean( + $component['DIRECTION'], "int") > 0) ? 1 : + -1; $row['KEYWORDS'] = $parent->clean( $component['KEYWORDS'], "string"); @@ -433,7 +442,7 @@ class CrawlComponent extends Component implements CrawlConstants $crawl_ts = $component['CRAWL_TIMESTAMP']; $crawl_name = $data['available_crawls'][$crawl_ts]; $data['SCRIPT'] .= $comma." [$crawl_ts, '$crawl_name', ". - $component['WEIGHT'].", "; + $component['WEIGHT'].", ".$component['DIRECTION'].", "; $comma = ","; $keywords = (isset($component['KEYWORDS'])) ? 
$component['KEYWORDS'] : ""; diff --git a/src/data/public_default.db b/src/data/public_default.db index f9fea163c..61e09832d 100644 Binary files a/src/data/public_default.db and b/src/data/public_default.db differ diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index 1546388a9..8119879bc 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -75,8 +75,8 @@ interface CrawlConstants const mirror_table_name = "mirror_table.txt"; const local_ip_cache_file = "local_ip_cache.txt"; /** used for word iterator direction */ - const FORWARD = 1; - const BACKWARD = -1; + const ASCENDING = 1; + const DESCENDING = -1; /** media feed index archive bundle timestamp */ const FEED_CRAWL_TIME = 100; /** Used in priority queue*/ diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index 88736b1a0..7465f6d88 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -294,7 +294,7 @@ class IndexArchiveBundle implements CrawlConstants * returns a reference to this shard * @return object last shard in the bundle */ - public function getActiveShard($forward = true) + public function getActiveShard() { if ($this->setCurrentShard($this->generation_info['ACTIVE'])) { return $this->getCurrentShard(); @@ -317,8 +317,7 @@ class IndexArchiveBundle implements CrawlConstants * merge dictionary side effects * @return object the currently being index shard */ - public function getCurrentShard($force_read = false, - $direction = self::FORWARD) + public function getCurrentShard($force_read = false) { if (!isset($this->current_shard)) { if (!isset($this->generation_info['CURRENT'])) { @@ -332,7 +331,7 @@ class IndexArchiveBundle implements CrawlConstants $this->current_shard = new IndexShard( $current_index_shard_file, $this->generation_info['CURRENT'], - $this->num_docs_per_generation, true, $direction); + $this->num_docs_per_generation, true); 
$this->current_shard->getShardHeader($force_read); $this->current_shard->read_only_from_disk = true; } else { @@ -347,7 +346,7 @@ class IndexArchiveBundle implements CrawlConstants } else { $this->current_shard = new IndexShard($current_index_shard_file, $this->generation_info['CURRENT'], - $this->num_docs_per_generation, $forward); + $this->num_docs_per_generation); } } return $this->current_shard; diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index 1a7f87715..15a1dd96e 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -65,10 +65,9 @@ class IndexManager implements CrawlConstants * with a given timestamp or feed (for handling media feeds) * * @param string $index_name timestamp of desired IndexArchiveBundle - * @param int $direction * @return object the desired IndexArchiveBundle reference */ - public static function getIndex($index_name, $direction = self::FORWARD) + public static function getIndex($index_name) { $index_name = trim($index_name); //trim to fix postgres quirkiness if (empty(self::$indexes[$index_name]) || @@ -83,9 +82,8 @@ class IndexManager implements CrawlConstants $index_archive_name = self::index_data_base_name . $index_name; } if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) { - $tmp = new IndexArchiveBundle( - C\CRAWL_DIR . '/cache/' . $index_archive_name, null, - C\NUM_DOCS_PER_GENERATION, $direction); + $tmp = new IndexArchiveBundle(C\CRAWL_DIR . '/cache/' . + $index_archive_name); if (!$tmp) { return false; } @@ -160,6 +158,8 @@ class IndexManager implements CrawlConstants */ public static function getVersion($index_name) { + $index_name = (empty($index_name) || $index_name[0] != '-') ? 
+ $index_name : substr($index_name, 1); if (intval($index_name) < C\VERSION_0_TIMESTAMP) { return 0; } @@ -195,10 +195,6 @@ class IndexManager implements CrawlConstants $threshold = -1, $start_generation = -1, $num_distinct_generations = -1, $with_remaining_total = false) { - if ($index_name == self::FEED_CRAWL_TIME) { - $index_name = "feed"; - } - $id = "$index_name:$start_generation:$num_distinct_generations"; $index = self::getIndex($index_name); $tmp = []; if (!empty($index->dictionary)) { @@ -213,10 +209,6 @@ class IndexManager implements CrawlConstants $total = 0; $info = []; } - if (isset($tmp[-1][3])) { - $total += $tmp[-1][3]; - $info = $tmp + $info; - } return ($with_remaining_total) ? [$total, $info] : $info; } /** diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 472972129..70ed6302b 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -209,12 +209,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @var string */ public $word_postings; - /** - * Specifies which direction an IndexShard will be traversed through using - * WordIterator - * @var bool - */ - public $forward_direction; /** * Fraction of NUM_DOCS_PER_GENERATION document inserts before data * from the words array is flattened to word_postings. 
(It will @@ -293,7 +287,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants */ public function __construct($fname, $generation = 0, $num_docs_per_generation = C\NUM_DOCS_PER_GENERATION, - $read_only_from_disk = false, $forward_direction = true) + $read_only_from_disk = false) { parent::__construct($fname, -1); $this->hash_name = crawlHash($fname); @@ -316,7 +310,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants $this->read_only_from_disk = $read_only_from_disk; $this->word_docs_packed = false; $this->blocks_words= []; - $this->forward_direction = $forward_direction; } /** * Used to pack a list of description scores and user ranks as a @@ -658,12 +651,14 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @param int& $next_offset where to start in word docs * @param int $last_offset offset at which to stop by * @param int $len number of documents desired + * @param int $direction which direction to iterate through elements + * of the posting slice (self::ASCENDING or self::DESCENDING) as + * compared to the order of when they were stored * @return array desired list of doc's and their info */ public function getPostingsSlice($start_offset, &$next_offset, $last_offset, - $len, $forward = true) + $len, $direction = self::ASCENDING) { - $forward_dir = ($this->forward_direction && $forward); if (!$this->read_only_from_disk && !$this->word_docs_packed) { $this->mergeWordPostingsToString(); $this->packWords(null); @@ -671,20 +666,20 @@ class IndexShard extends PersistentStructure implements CrawlConstants } else if ($this->read_only_from_disk && empty($this->num_docs)) { $this->getShardHeader(); } - // Normal forward iterator - if ($forward_dir) { - return $this->postingsSliceForward($start_offset, $next_offset, + // Normal ASCENDING iterator (same order as stored) + if ($direction == self::ASCENDING) { + return $this->postingsSliceAscending($start_offset, $next_offset, $last_offset, $len); } else { 
- // Reverse direction iterator used for newsfeed - return $this->postingsSliceBackward($start_offset, $next_offset, + // Reverse direction used most commonly for feeds + return $this->postingsSliceDescending($start_offset, $next_offset, $last_offset, $len); } } /** * */ - public function postingsSliceForward($start_offset, &$next_offset, + public function postingsSliceAscending($start_offset, &$next_offset, $last_offset, $len) { $num_docs_so_far = 0; @@ -714,7 +709,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants self::numDocsOrLinks($start_offset, $last_offset, $total_posting_len / $num_postings_so_far); list($doc_id, , $item) = - $this->makeItem($posting, $num_docs_or_links); + $this->makeItem($posting, $num_docs_or_links, self::ASCENDING); $results[$doc_id] = $item; $num_docs_so_far += $next - $posting_start; } while ($next <= $last && $num_docs_so_far < $len); @@ -724,7 +719,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants /** * */ - public function postingsSliceBackward($start_offset, &$next_offset, + public function postingsSliceDescending($start_offset, &$next_offset, $last_offset, $len) { $num_docs_so_far = 0; @@ -733,7 +728,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants when things are file-based and am still tracking down why */ $wd_len = (isset($this->file_len)) ? - $this->file_len - $this->docids_len : $this->word_docs_len; + $this->file_len - $this->docids_len : $this->word_docs_len; /* For a reverse shard, the arguments for start offset and last offset are the same. It actually gets reversed here, where end:=start and last:=start. 
@@ -741,7 +736,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants $end = $start_offset >> 2; $last = $start_offset >> 2; $next = $next_offset >> 2; - $posting_end = $next; + $posting_start = $next; $total_posting_len = 0; $num_postings_so_far = 0; $stop = 0; @@ -749,7 +744,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants if ($next < $end) { break; } - $posting_start = $next; + $posting_end = $next; /* getPostingAtOffset will modify both start and end to the value of next using addresses */ @@ -758,12 +753,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants $total_posting_len += strlen($posting); $num_postings_so_far++; $next = $posting_start - 1; - // getting the number of docs is the same forwards or backwards + // getting the number of docs is the same ascending as descending $num_docs_or_links = self::numDocsOrLinks($start_offset, $last_offset, $total_posting_len / $num_postings_so_far); list($doc_id, , $item) = - $this->makeItem($posting, $num_docs_or_links); + $this->makeItem($posting, $num_docs_or_links, self::DESCENDING); $results[$doc_id] = $item; $num_docs_so_far += $posting_end - $next; } while ($next >= $last && $num_docs_so_far < $len); @@ -794,17 +789,27 @@ class IndexShard extends PersistentStructure implements CrawlConstants * * @param string $posting a posting entry from some words posting list * @param int $num_doc_or_links number of documents or links doc appears in + * @param int $direction whether to compute DOC_RANK based on the assumption + * the iterator is traversing the index in an ascending or descending + * fashion * @return array ($doc_id, posting_stats_array) for posting */ - public function makeItem($posting, $num_doc_or_links) + public function makeItem($posting, $num_doc_or_links, $direction = + self::ASCENDING) { $doc_key_len = self::DOC_KEY_LEN; $offset = 0; list($doc_index, $position_list) = unpackPosting($posting, $offset); $item = []; 
$item[self::POSITION_LIST] = $position_list; - $doc_depth = log(($doc_index + 1) + (C\AVG_LINKS_PER_PAGE + 1) * - $this->num_docs_per_generation * $this->generation, 10); + if ($direction == self::ASCENDING) { + $doc_depth = log(($doc_index + 1) + (C\AVG_LINKS_PER_PAGE + 1) * + $this->num_docs_per_generation * $this->generation, 10); + } else { + $doc_depth = log(($this->num_docs_per_generation - $doc_index + 1) + + (C\MAX_GENERATIONS - (C\AVG_LINKS_PER_PAGE + 1) * + $this->num_docs_per_generation * $this->generation), 10); + } $item[self::DOC_RANK] = number_format(10 - $doc_depth, C\PRECISION); $doc_loc = $doc_index << 4; $tmp = @@ -1103,61 +1108,101 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @param int $start_offset first posting to consider * @param int $end_offset last posting before give up * @param int $doc_offset document offset we want to be greater than or - * equal to - * + * equal to (when ASCENDING) or less equal to (DESCENDING) + * @param int $direction which direction to iterate through elements + * of the posting slice (self::ASCENDING or self::DESCENDING) as + * compared to the order of when they were stored * @return array (int offset to next posting, doc_offset for this post) */ public function nextPostingOffsetDocOffset($start_offset, $end_offset, - $doc_offset, $forward = true) + $doc_offset, $direction = self::ASCENDING) { + $is_ascending = ($direction == self::ASCENDING); $doc_index = $doc_offset >> 4; $start = $start_offset >> 2; $end = $end_offset >> 2; - $post_doc_index = $this->getDocIndexOfPostingAtOffset($end); - if ($doc_index > $post_doc_index) { //fail fast + $extrema = ($is_ascending) ? 
$end : $start; + $post_doc_index = $this->getDocIndexOfPostingAtOffset($extrema); + if (($is_ascending && $doc_index > $post_doc_index) || + (!$is_ascending && $doc_index < $post_doc_index)) { //fail fast return false; } else if ($doc_index == $post_doc_index) { - return [$end << 2, $post_doc_index << 4]; + return [$extrema << 2, $post_doc_index << 4]; } - $current = 0; - if ($forward) { - $current = $start_offset >> 2; + if ($is_ascending) { + $current = $start; $post_doc_index = $this->gallopPostingOffsetDocOffset($current, - $doc_index, $end); + $doc_index, $end, $direction); } else { - $current = $end_offset >> 2; + $current = $end; $post_doc_index = $this->gallopPostingOffsetDocOffset($current, - $doc_index, $start); + $doc_index, $start, $direction); } if ($doc_index == $post_doc_index) { return [$current << 2, $post_doc_index << 4]; } - $low = $start_offset >> 2; + return $this->binarySearchPostingOffsetDocOffset($start, $end, + $current, $doc_index, $direction); + } + /** + * Binary searches postings $start..$end for $doc_index in $direction order + */ + public function binarySearchPostingOffsetDocOffset($start, $end, + $current, $doc_index, $direction) + { + $low = $start; $high = $end; - do { - $post_doc_index = $this->getDocIndexOfPostingAtOffset($current); - if ($doc_index > $post_doc_index) { - $low = $current; - if ($current >= $end) { - return false; - } else { - if ($current + 1 == $high) { - $current++; - $low = $current; + if ($direction == self::ASCENDING) { + do { + $post_doc_index = $this->getDocIndexOfPostingAtOffset($current); + if ($doc_index > $post_doc_index) { + $low = $current; + if ($current >= $end) { + return false; + } else { + if ($current + 1 == $high) { + $current++; + $low = $current; + } + $current = (($low + $high) >> 1); } + } else if ($doc_index < $post_doc_index) { + if ($low == $current) { + return [$current << 2, $post_doc_index << 4]; + } + $high = $current; $current = (($low + $high) >> 1); + } else { + return [$current << 2, $post_doc_index << 4]; } - } else if ($doc_index < $post_doc_index) 
{ - if ($low == $current) { + } while($current <= $end); + } else { + do { + $post_doc_index = $this->getDocIndexOfPostingAtOffset($current); + if ($doc_index < $post_doc_index) { + $high = $current; + if ($current <= $start) { + return false; + } else { + if ($current - 1 == $low) { + $current--; + $high = $current; + } + $current = (($low + $high) >> 1); + } + } else if ($doc_index > $post_doc_index) { + if ($high == $current) { + return [$current << 2, $post_doc_index << 4]; + } + $low = $current; + $current = (($low + $high + 1) >> 1); + } else { return [$current << 2, $post_doc_index << 4]; } - $high = $current; - $current = (($low + $high) >> 1); - } else { - return [$current << 2, $post_doc_index << 4]; - } - } while($current <= $end); - } + } while($current >= $start); + } + return false; + } /** * Performs a galloping search (double forward jump distance each failure * step) forward in a posting list from @@ -1169,12 +1214,16 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @param int $end last index of posting list * @return int document index bigger than or equal to $doc_index. Since * $current points at the posting this occurs for if found, no success - * by whether $current > $end. 
+ * by whether $current > $end + * @param int $direction which direction to iterate through elements + * of the posting slice (self::ASCENDING or self::DESCENDING) as + * compared to the order of when they were stored */ - public function gallopPostingOffsetDocOffset(&$current, $doc_index, $end) + public function gallopPostingOffsetDocOffset(&$current, $doc_index, $end, + $direction) { $stride = 32; - if ($this->forward_direction) { + if ($direction == self::ASCENDING) { do { $post_doc_index = $this->getDocIndexOfPostingAtOffset($current); if ($doc_index <= $post_doc_index) { @@ -1183,8 +1232,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants $current += $stride; $stride <<= 1; } while($current <= $end); - $current = $end; - return $post_doc_index; } else { do { $post_doc_index = $this->getDocIndexOfPostingAtOffset($current); @@ -1194,10 +1241,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants $current -= $stride; $stride <<= 1; } while($current >= $end); - $current = $end; - return $post_doc_index; } - + $current = $end; + return $post_doc_index; } /** * Given an offset of a posting into the word_docs string, looks up @@ -1206,8 +1252,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @param int $offset byte/char offset into the word_docs string * @return int a document byte/char offset into the doc_infos string */ - public function docOffsetFromPostingOffset($offset, $forward=true) { - $this->forward_direction = $forward; + public function docOffsetFromPostingOffset($offset) + { $doc_index = $this->getDocIndexOfPostingAtOffset($offset >> 2); return ($doc_index << 4); } @@ -1221,19 +1267,20 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @param int $len number of documents * @return array desired list of doc's and their info */ - public function getPostingsSliceById($word_id, $len) + public function getPostingsSliceById($word_id, $len, + $direction = 
self::ASCENDING) { $results = []; $info = $this->getWordInfo($word_id, true); if ($info !== false) { list($first_offset, $last_offset, $num_docs_or_links) = $info; - if ($this->forward_direction) { + if ($direction == self::ASCENDING) { $results = $this->getPostingsSlice($first_offset, $first_offset, $last_offset, $len); } else { $results = $this->getPostingsSlice($first_offset, - $last_offset, $last_offset, $len, false); + $last_offset, $last_offset, $len, $direction); } } return $results; diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 1d479ab9d..d90f2fd08 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -53,10 +53,10 @@ class PhraseParser * A list of meta words that might be extracted from a query * @var array */ - public static $meta_words_list = ['\-', 'class:', 'class-score:', 'code:', - 'date:', 'dns:', 'duration:', 'elink:', 'filetype:', 'guid:', 'host:', - 'i:', 'info:', 'index:', 'ip:', 'link:', 'modified:', - 'lang:', 'media:', 'location:', 'numlinks:', 'os:', + public static $meta_words_list = ['\-i:', '\-index:', '\-', 'class:', + 'class-score:', 'code:', 'date:', 'dns:', 'duration:', 'elink:', + 'filetype:', 'guid:', 'host:', 'i:', 'info:', 'index:', 'ip:', 'link:', + 'modified:', 'lang:', 'media:', 'location:', 'numlinks:', 'os:', 'path:', 'robot:', 'safe:', 'server:', 'site:', 'size:', 'time:', 'u:', 'version:','weight:', 'w:' ]; @@ -128,6 +128,8 @@ class PhraseParser $index_name = null, $exact_match = false, $threshold = C\MIN_RESULTS_TO_GROUP) { + $index_name = (empty($index_name) || $index_name[0] != '-') ? + $index_name : substr($index_name, 1); $char_class = C\NS_LOCALE . $lang . "\\resources\\Tokenizer"; if (isset(self::$programming_language_map[$lang])) { $control_word = self::$programming_language_map[$lang] . 
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php index 09ab43367..1b5648230 100644 --- a/src/library/VersionFunctions.php +++ b/src/library/VersionFunctions.php @@ -1831,7 +1831,7 @@ EOD; } } /** - * Upgrades a Version 65 version of the Yioop database to a Version 64 version + * Upgrades a Version 65 version of the Yioop database to a Version 66 version * @param object $db datasource to use to upgrade. */ function upgradeDatabaseVersion66(&$db) @@ -1840,7 +1840,7 @@ function upgradeDatabaseVersion66(&$db) C\TITLE_LEN . ") DEFAULT ''"); } /** - * Upgrades a Version 65 version of the Yioop database to a Version 64 version + * Upgrades a Version 66 version of the Yioop database to a Version 67 version * @param object $db datasource to use to upgrade. */ function upgradeDatabaseVersion67(&$db) @@ -1850,3 +1850,16 @@ function upgradeDatabaseVersion67(&$db) $db->execute("ALTER TABLE SUBSEARCH ADD COLUMN " . "DEFAULT_QUERY VARCHAR(" . C\TITLE_LEN . ") DEFAULT ''"); } +/** + * Upgrades a Version 67 version of the Yioop database to a Version 68 version + * @param object $db datasource to use to upgrade. + */ +function upgradeDatabaseVersion68(&$db) +{ + $db->execute("ALTER TABLE MIX_COMPONENTS ADD COLUMN " . 
+ "DIRECTION INT DEFAULT 1"); + $db->execute("DELETE FROM MIX_COMPONENTS WHERE TIMESTAMP = 4 AND + FRAGMENT_ID = 0"); + $db->execute("INSERT INTO MIX_COMPONENTS VALUES(4, 0, 100, 1, -1, + 'media:news')"); +} diff --git a/src/library/index_bundle_iterators/DisjointIterator.php b/src/library/index_bundle_iterators/DisjointIterator.php index abff878c4..323ce306e 100644 --- a/src/library/index_bundle_iterators/DisjointIterator.php +++ b/src/library/index_bundle_iterators/DisjointIterator.php @@ -93,6 +93,16 @@ class DisjointIterator extends IndexBundleIterator } $this->leastGenDocOffsetsAmongstIterators(); } + /** + * + */ + public function getDirection() + { + if (!empty($this->index_bundle_iterators[0])) { + return $this->index_bundle_iterators[0]->getDirection(); + } + return self::ASCENDING; + } /** * Returns the iterators to the first document block that it could iterate * over @@ -151,6 +161,7 @@ class DisjointIterator extends IndexBundleIterator { $least_gen_offset = -1; $this->least_offset_index = 0; + $direction = $this->getDirection(); for ($i = 0; $i < $this->num_iterators; $i++) { $cur_gen_doc_offset = $this->index_bundle_iterators[ @@ -162,12 +173,8 @@ class DisjointIterator extends IndexBundleIterator } else if ($cur_gen_doc_offset == -1) { continue; } - $forward = true; - if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) { - $forward = false; - } $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset, - $least_gen_offset, $forward); + $least_gen_offset, $direction); if ($gen_doc_cmp < 0) { $least_gen_offset = $cur_gen_doc_offset; $this->least_offset_index = $i; @@ -188,15 +195,12 @@ class DisjointIterator extends IndexBundleIterator //num_docs can change when advance() called so that's why we recompute $total_num_docs = 0; if ($gen_doc_offset !== null) { + $direction = $this->getDirection(); for ($i = 0; $i < $this->num_iterators; $i++) { $cur_gen_doc_offset = $this->index_bundle_iterators[ $i]->currentGenDocOffsetWithWord(); - 
$forward = true; - if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) { - $forward = false; - } if ($this->genDocOffsetCmp($cur_gen_doc_offset, - $gen_doc_offset, $forward) < 0) { + $gen_doc_offset, $direction) < 0) { if ($no_change) { $this->current_block_fresh = false; $this->seen_docs += 1; @@ -218,7 +222,9 @@ class DisjointIterator extends IndexBundleIterator $this->seen_docs += 1; $this->seen_docs_unfiltered = 0; $least= $this->least_offset_index; - if (!isset($this->index_bundle_iterators[$least])) { return; } + if (!isset($this->index_bundle_iterators[$least])) { + return; + } $this->seen_docs_unfiltered += $this->index_bundle_iterators[$least]->seen_docs; $total_num_docs += $this->index_bundle_iterators[$least]->num_docs; diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php index 9c2d2cbd0..321f313b8 100755 --- a/src/library/index_bundle_iterators/DocIterator.php +++ b/src/library/index_bundle_iterators/DocIterator.php @@ -101,11 +101,13 @@ class DocIterator extends IndexBundleIterator * be returned by a findDocsWithWord call */ public function __construct($index_name, $filter = null, - $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK) + $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK, + $direction = self::ASCENDING) { $this->filter = $filter; $this->index_name = $index_name; - $index = IndexManager::getIndex($index_name); + $this->direction = $direction; + $index = IndexManager::getIndex($index_name, $direction); $info = $index->getArchiveInfo($index->dir_name); $this->num_docs = $info['COUNT']; $this->num_generations = (isset($index->generation_info['ACTIVE'])) ? 
@@ -141,7 +143,8 @@ class DocIterator extends IndexBundleIterator if (isset($this->shard_lens[$generation])) { $this->last_offset = $this->shard_lens[$generation]; } else { - $index = IndexManager::getIndex($this->index_name); + $index = IndexManager::getIndex($this->index_name, + $this->direction); $index->setCurrentShard($generation, true); $shard = $index->getCurrentShard(); $this->last_offset = $shard->docids_len; @@ -164,7 +167,7 @@ class DocIterator extends IndexBundleIterator } $pre_results = []; $this->next_offset = $this->current_offset; - $index = IndexManager::getIndex($this->index_name); + $index = IndexManager::getIndex($this->index_name, $this->direction); $index->setCurrentShard($this->current_generation, true); //the next call also updates next offset $shard = $index->getCurrentShard(); @@ -185,7 +188,7 @@ class DocIterator extends IndexBundleIterator } $this->next_offset += ($num_keys + 1) * $doc_key_len; $pre_results[$doc_id] = $item; - $num_docs_so_far ++; + $num_docs_so_far++; } while ($num_docs_so_far < $this->results_per_block); $results = []; $doc_key_len = IndexShard::DOC_KEY_LEN; diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php index 7e3c1da68..4bb2ee81b 100644 --- a/src/library/index_bundle_iterators/GroupIterator.php +++ b/src/library/index_bundle_iterators/GroupIterator.php @@ -127,9 +127,18 @@ class GroupIterator extends IndexBundleIterator $this->results_per_block /= ceil($num_iterators/2); $this->network_flag = $network_flag; $this->current_machine = $current_machine; - $this->is_feed = false; $this->reset(); } + /** + * Returns the traversal direction of the iterator being grouped; self::ASCENDING if none is set (NOTE(review): assumes the wrapped iterator is stored in $this->index_bundle_iterator - confirm) + */ + public function getDirection() + { + if (!empty($this->index_bundle_iterator)) { + return $this->index_bundle_iterator->getDirection(); + } + return self::ASCENDING; + } /** * Returns the iterators to the first document block that it could iterate * over @@ -233,11 +242,6 @@ class GroupIterator extends IndexBundleIterator continue; } 
$hash_url = substr($doc_key, 0, IndexShard::DOC_KEY_LEN); - if (isset($doc_info[self::IS_FEED])) { - $this->is_feed = true; - } else { - $this->is_feed = false; - } // initial aggregate domain score vector for given domain if ($doc_info[self::IS_DOC]) { if (!isset($pre_out_pages[$hash_url])) { diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php index 74fb5ff34..6959afb59 100644 --- a/src/library/index_bundle_iterators/IndexBundleIterator.php +++ b/src/library/index_bundle_iterators/IndexBundleIterator.php @@ -149,9 +149,10 @@ abstract class IndexBundleIterator implements CrawlConstants * @param array $gen_doc2 second ordered pair * @return int -1,0,1 depending on which is bigger */ - public function genDocOffsetCmp($gen_doc1, $gen_doc2, $forward=true) + public function genDocOffsetCmp($gen_doc1, $gen_doc2, $direction = + self::ASCENDING) { - if ($forward) { + if ($direction == self::ASCENDING) { //less generation or greater if ($gen_doc1[0] < $gen_doc2[0]) { return -1; @@ -164,7 +165,7 @@ abstract class IndexBundleIterator implements CrawlConstants } else if ($gen_doc1[1] > $gen_doc2[1]) { return 1; } - } else if (!$forward) { + } else { //less generation or greater for reverse if ($gen_doc1[0] < $gen_doc2[0]) { return 1; @@ -180,7 +181,14 @@ abstract class IndexBundleIterator implements CrawlConstants } //equal return 0; - } + } + /** + * Returns the direction in which this iterator traverses its index; this base implementation always returns self::ASCENDING + */ + public function getDirection() + { + return self::ASCENDING; + } /** * Gets the current block of doc ids and score associated with the * this iterators word diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index 29d7316fc..3d9936267 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -236,7 +236,7 @@ class IntersectIterator extends IndexBundleIterator } } } - $this->count_block = 
count($docs); + $this->count_block = (empty($docs)) ? 0 : count($docs); $this->pages = $docs; return $docs; } @@ -416,11 +416,8 @@ class IntersectIterator extends IndexBundleIterator } $gen_doc_offset[0] = $biggest_gen_offset; $all_same = true; - $forward = true; + $direction = $this->getDirection(); for ($i = 1; $i < $this->num_iterators; $i++) { - if ($this->index_bundle_iterators[$i] instanceof ReverseIterator) { - $forward = false; - } $retrieve_postings_time = microtime(true); if ((($cur_gen_doc_offset = $this->index_bundle_iterators[ $i]->currentGenDocOffsetWithWord()) == -1) || @@ -429,7 +426,7 @@ class IntersectIterator extends IndexBundleIterator } $gen_doc_offset[$i] = $cur_gen_doc_offset; $gen_doc_cmp = $this->genDocOffsetCmp($cur_gen_doc_offset, - $biggest_gen_offset, $forward); + $biggest_gen_offset, $direction); if ($gen_doc_cmp > 0) { $biggest_gen_offset = $cur_gen_doc_offset; $all_same = false; @@ -448,7 +445,7 @@ class IntersectIterator extends IndexBundleIterator return -1; } if ($this->genDocOffsetCmp($gen_doc_offset[$i], - $biggest_gen_offset, $forward) < 0) { + $biggest_gen_offset, $direction) < 0) { $iterator = $this->index_bundle_iterators[$i]; $iterator->advance($biggest_gen_offset); if( ($cur_gen_doc_offset = @@ -457,7 +454,7 @@ class IntersectIterator extends IndexBundleIterator } $gen_doc_offset[$i] = $cur_gen_doc_offset; if ($this->genDocOffsetCmp($cur_gen_doc_offset, - $biggest_gen_offset, $forward) > 0) { + $biggest_gen_offset, $direction) > 0) { $last_changed = $i; $biggest_gen_offset = $cur_gen_doc_offset; } @@ -471,6 +468,16 @@ class IntersectIterator extends IndexBundleIterator } return 1; } + /** + * Returns the traversal direction of this iterator, taken from its first child iterator; self::ASCENDING if it has none + */ + public function getDirection() + { + if (!empty($this->index_bundle_iterators[0])) { + return $this->index_bundle_iterators[0]->getDirection(); + } + return self::ASCENDING; + } /** * Forwards the iterator one group of docs * @param array $gen_doc_offset a generation, doc_offset pair. 
If set, diff --git a/src/library/index_bundle_iterators/NegationIterator.php b/src/library/index_bundle_iterators/NegationIterator.php index 2db44c705..a3ad9c82b 100644 --- a/src/library/index_bundle_iterators/NegationIterator.php +++ b/src/library/index_bundle_iterators/NegationIterator.php @@ -63,9 +63,11 @@ class NegationIterator extends IndexBundleIterator */ public function __construct($index_bundle_iterator) { + $direction = $index_bundle_iterator->getDirection(); $this->index_bundle_iterators[0] = new DocIterator( $index_bundle_iterator->index_name, - $index_bundle_iterator->filter); + $index_bundle_iterator->filter, + $index_bundle_iterator->results_per_block, $direction); $this->index_bundle_iterators[1] = $index_bundle_iterator; $this->num_iterators = 2; $this->num_docs = 0; @@ -73,6 +75,16 @@ class NegationIterator extends IndexBundleIterator $this->num_docs = $this->index_bundle_iterators[0]->num_docs; $this->reset(); } + /** + * Returns the traversal direction of the iterator being negated (index_bundle_iterators[1], the one the constructor reads the direction from); self::ASCENDING if it is not set + */ + public function getDirection() + { + if (!empty($this->index_bundle_iterators[1])) { + return $this->index_bundle_iterators[1]->getDirection(); + } + return self::ASCENDING; + } /** * Returns the iterators to the first document block that it could iterate * over diff --git a/src/library/index_bundle_iterators/NetworkIterator.php b/src/library/index_bundle_iterators/NetworkIterator.php index c2182164f..94d446b76 100644 --- a/src/library/index_bundle_iterators/NetworkIterator.php +++ b/src/library/index_bundle_iterators/NetworkIterator.php @@ -114,7 +114,7 @@ class NetworkIterator extends IndexBundleIterator $this->results_per_block = ceil(C\MIN_RESULTS_TO_GROUP); $this->next_results_per_block = $this->results_per_block; $this->hard_query = false; - $this->base_query = "q=".urlencode($query). + $this->base_query = "q=" . urlencode($query). 
"&f=serial&network=false&raw=1&its=$timestamp&guess=false"; if ($save_timestamp_name != "") { // used for archive crawls of crawl mixes diff --git a/src/library/index_bundle_iterators/ReverseIterator.php b/src/library/index_bundle_iterators/ReverseIterator.php deleted file mode 100644 index a11ac0810..000000000 --- a/src/library/index_bundle_iterators/ReverseIterator.php +++ /dev/null @@ -1,456 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. - * - * END LICENSE - * - * @author Chris Pollett chris@pollett.org - * @license https://www.gnu.org/licenses/ GPL3 - * @link https://www.seekquarry.com/ - * @copyright 2009 - 2019 - * @filesource - */ -namespace seekquarry\yioop\library\index_bundle_iterators; - -use seekquarry\yioop\configs as C; -use seekquarry\yioop\library as L; -use seekquarry\yioop\library\IndexShard; -use seekquarry\yioop\library\IndexManager; - -/** - * Used to iterate through the documents associated with a word in - * an IndexArchiveBundle. It also makes it easy to get the summaries - * of these documents. - * - * A description of how words and the documents containing them are stored - * is given in the documentation of IndexArchiveBundle. 
- * - * @author Chris Pollett and Tim Chow - * @see IndexArchiveBundle - */ -class ReverseIterator extends IndexBundleIterator -{ - /** - * hash of word or phrase that the iterator iterates over - * @var string - */ - public $word_key; - /** - * Position from end of key that doesn't have to be an exact match - * (for phrases as using suffix tree) - * @var int - */ - public $shift; - /** - * The timestamp of the index is associated with this iterator - * @var string - */ - public $index_name; - /** - * First shard generation that word info was obtained for - * @var int - */ - public $start_generation; - /** - * Used to keep track of whether getWordInfo might still get more - * data on the search terms as advance generations - * @var bool - */ - public $no_more_generations; - /** - * The next byte offset in the IndexShard - * @var int - */ - public $next_offset; - /** - * An array of shard generation and posting list offsets, lengths, and - * numbers of documents - * @var array - */ - public $dictionary_info; - /** - * The total number of shards that have data for this word - * @var int - */ - public $num_generations; - /** - * Index into dictionary_info corresponding to the current shard - * @var int - */ - public $generation_pointer; - /** - * Numeric number of current shard - * @var int - */ - public $current_generation; - /** - * The current byte offset in the IndexShard - * @var int - */ - public $current_offset; - /** - * Starting Offset of word occurence in the IndexShard - * @var int - */ - public $start_offset; - /** - * Last Offset of word occurence in the IndexShard - * @var int - */ - public $last_offset; - /** - * Keeps track of whether the word_iterator list is empty because the - * word does not appear in the index shard - * @var int - */ - public $empty; - /** - * Keeps track of whether the word_iterator list is empty because the - * word does not appear in the index shard - * @var int - */ - public $filter; - /** - * The current value of the 
doc_offset of current posting if known - * @var int - */ - public $current_doc_offset; - /** Host Key position + 1 (first char says doc, inlink or eternal link)*/ - const HOST_KEY_POS = 17; - /** Length of a doc key*/ - const KEY_LEN = 8; - /** - * Creates a word iterator with the given parameters. - * - * @param string $word_key hash of word or phrase to iterate docs of - * @param string $shift up to what point in key should be a match - * when do dictionary look up (for phrases because using suffix tree) - * @param string $index_name time_stamp of the to use - * @param bool $raw whether the $word_key is our variant of base64 encoded - * @param array $filter an array of hashes of domains to filter from - * results - * @param int $results_per_block the maximum number of results that can - * be returned by a findDocsWithWord call - */ - public function __construct($word_key, $shift, $index_name, $raw = false, - $filter = null, $results_per_block = - IndexBundleIterator::RESULTS_PER_BLOCK) - { - if ($raw == false) { - //get rid of out modified base64 encoding - $word_key = L\unbase64Hash($word_key); - } - $this->filter = $filter; - $this->word_key = $word_key; - $this->shift = $shift; - $this->index_name = $index_name; - list($this->num_docs, $this->dictionary_info) = - IndexManager::getWordInfo($index_name, $word_key, $shift, - -1, -1, C\NUM_DISTINCT_GENERATIONS, true); - if ($this->dictionary_info === false) { - $this->empty = true; - } else { - ksort($this->dictionary_info); - $this->dictionary_info = array_values($this->dictionary_info); - $this->num_generations = count($this->dictionary_info); - if ($this->num_generations == 0) { - $this->empty = true; - } else { - $this->empty = false; - } - } - $this->no_more_generations = - ($this->num_generations < C\NUM_DISTINCT_GENERATIONS); - $this->current_doc_offset = null; - $this->results_per_block = $results_per_block; - $this->current_block_fresh = false; - $this->start_generation = $this->num_generations-1; - if 
($this->dictionary_info !== false) { - $this->reset(); - } - } - /** - * Resets the iterator to the first document block that it could iterate - * over - * Reversed - */ - public function reset() - { - if (!$this->empty) {//we shouldn't be called when empty - but to be safe - if ($this->start_generation < $this->num_generations-1) { - list($this->num_docs, $this->dictionary_info) = - IndexManager::getWordInfo($this->index_name, - $this->word_key, 0, -1, 0, C\NUM_DISTINCT_GENERATIONS, - true); - ksort($this->dictionary_info); - $this->dictionary_info = array_values($this->dictionary_info); - $this->num_generations = count($this->dictionary_info); - $this->no_more_generations = - ($this->num_generations < C\NUM_DISTINCT_GENERATIONS); - } - list($this->current_generation, $this->start_offset, - $this->last_offset, ) - = $this->dictionary_info[$this->num_generations-1]; - } - $this->current_offset = $this->last_offset; - /* reset pointer to the number of gens, which in reverse is the - first one we want - */ - $this->generation_pointer = $this->num_generations - 1; - $this->count_block = 0; - $this->seen_docs = 0; - $this->current_doc_offset = null; - } - /** - * Hook function used by currentDocsWithWord to return the current block - * of docs if it is not cached - * - * @return mixed doc ids and score if there are docs left, -1 otherwise - */ - public function findDocsWithWord() - { - if ($this->empty) { - return -1; - } - if (($this->generation_pointer>=$this->num_generations) - || ($this->generation_pointer == 0 && - $this->current_offset < $this->start_offset)) { - return -1; - } - $pre_results = []; - if (!$this->empty) { - $this->next_offset = $this->current_offset; - $index = IndexManager::getIndex($this->index_name, false); - $index->setCurrentShard($this->current_generation, true); - //the next call also updates next offset - $shard = $index->getCurrentShard(false, false); - $pre_results = $shard->getPostingsSlice($this->start_offset, - $this->next_offset, 
$this->last_offset, - $this->results_per_block, false); - if($this->index_name == "feed") { - $time = time(); - foreach ($pre_results as $keys => $pre_result) { - $page = $index->getPage($pre_result[self::SUMMARY_OFFSET], - $this->current_generation); - $delta = $time - $page[self::PUBDATE]; - $pre_results[$keys][self::DOC_RANK] = 720000 / - max($delta, 1); - } - } - } - $results = []; - $doc_key_len = IndexShard::DOC_KEY_LEN; - foreach ($pre_results as $keys => $data) { - $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN); - if (!empty($this->filter) && $this->filter->isFiltered($host_key)) { - continue; - } - $data[self::KEY] = $keys; - // inlinks is the domain of the inlink - $key_parts = str_split($keys, $doc_key_len); - if (isset($key_parts[2])) { - list($hash_url, $data[self::HASH], $data[self::INLINKS]) = - $key_parts; - } else { - continue; - } - $data[self::CRAWL_TIME] = $this->index_name; - $results[$keys] = $data; - } - $this->count_block = count($results); - if ($this->generation_pointer == $this->num_generations - 1 && - $results == []) { - $results = null; - } - $this->pages = $results; - return $results; - } - /** - * Updates the seen_docs count during an advance() call - * For a reverse shard, instead of adding to the offset, we subtract by a - * block instead. 
- */ - public function advanceSeenDocs() - { - if ($this->current_block_fresh != true) { - $total_guess = IndexShard::numDocsOrLinks($this->next_offset, - $this->start_offset); - $num_docs = $total_guess % $this->results_per_block; - if ($num_docs == 0) { - $num_docs = $this->results_per_block; - } else { - $num_docs = IndexShard::numDocsOrLinks($this->start_offset, - $this->last_offset)%$this->results_per_block; - } - $this->next_offset = $this->current_offset; - $this->next_offset -= IndexShard::POSTING_LEN * $num_docs; - if ($num_docs <= 0) { - return; - } - } else { - $num_docs = $this->count_block; - } - $this->current_block_fresh = false; - $this->seen_docs += $num_docs; - } - /** - * Forwards the iterator one group of docs - * @param array $gen_doc_offset a generation, doc_offset pair. If set, - * the must be of greater than or equal generation, and if equal the - * next block must all have $doc_offsets larger than or equal to - * this value - */ - public function advance($gen_doc_offset = null) - { - if ($gen_doc_offset == null) { - $this->plainAdvance(); - return; - } - $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord(); - if ($cur_gen_doc_offset == -1 || - $this->genDocOffsetCmp($cur_gen_doc_offset, $gen_doc_offset) < 0) { - return; - } - $this->plainAdvance(); - if ($this->current_generation > $gen_doc_offset[0]) { - $this->advanceGeneration($gen_doc_offset[0]); - $this->next_offset = $this->current_offset; - } - $index = IndexManager::getIndex($this->index_name, false); - $index->setCurrentShard($this->current_generation, true); - $shard = $index->getCurrentShard(false, false); - $start = $this->start_offset; - if ($this->current_generation == $gen_doc_offset[0]) { - $offset_pair = $shard->nextPostingOffsetDocOffset( - $start, $this->next_offset, $gen_doc_offset[1], false); - if ($offset_pair === false) { - $this->advanceGeneration(); - $this->next_offset = $this->current_offset; - } else { - list($this->current_offset, 
$this->current_doc_offset) = - $offset_pair; - } - } - $this->seen_docs = 0; - $this->seen_docs += ($this->current_offset - $this->start_offset) / - IndexShard::POSTING_LEN; - } - /** - * Forwards the iterator one group of docs. This is what's called - * by @see advance($gen_doc_offset) if $gen_doc_offset is null - * Reversed - */ - public function plainAdvance() - { - $this->advanceSeenDocs(); - $this->current_doc_offset = null; - # RC if the current offset is greater than the next - if ($this->current_offset > $this->next_offset) { - $this->current_offset = $this->next_offset; - } else { - $this->advanceGeneration(); - $this->next_offset = $this->current_offset; - } - # if the current offset is smaller, then we need to get next - # generation - if ($this->current_offset < $this->start_offset) { - $this->advanceGeneration(); - $this->next_offset = $this->current_offset; - } - } - /** - * Switches which index shard is being used to return occurrences of - * the word to the next shard containing the word - * Reversed - * - * @param int $generation generation to advance beyond - */ - public function advanceGeneration($generation = null) - { - if ($generation === null) { - $generation = $this->current_generation; - } - do { - // RC if the pointer is greater than the total generations, subtract - if ($this->generation_pointer >= 0) { - $this->generation_pointer--; - } - /* RC if the generation pointer is still more than the number of - generations - */ - if ($this->generation_pointer >= 0) { - list($this->current_generation, $this->start_offset, - $this->last_offset, ) - = $this->dictionary_info[$this->generation_pointer]; - //set the current offset to the last one of the dictionary - $this->current_offset = $this->last_offset; - } - // if there are more generations and - if (!$this->no_more_generations && - $this->current_generation > $generation && - $this->generation_pointer <= 0) { - list($estimated_remaining_total, $info) = - 
IndexManager::getWordInfo($this->index_name, - $this->word_key, 0, -1, $this->num_generations, - C\NUM_DISTINCT_GENERATIONS, true); - if (count($info) > 0) { - $this->num_docs = $this->seen_docs + - $estimated_remaining_total; - ksort($info); - $this->dictionary_info = array_merge($this->dictionary_info, - array_values($info)); - $this->num_generations = count($this->dictionary_info); - $this->no_more_generations = - count($info) < C\NUM_DISTINCT_GENERATIONS; - //will increment back to where were next loop - $this->generation_pointer++; - } - } - # whle the current generation is greater than supplied argument - } while($this->current_generation > $generation && - # of if we haven't hit the zeroeth generation - $this->generation_pointer >= 0); - } - /** - * Gets the doc_offset and generation for the next document that - * would be return by this iterator - * - * @return mixed an array with the desired document offset - * and generation; -1 on fail - */ - public function currentGenDocOffsetWithWord() { - if ($this->current_doc_offset !== null) { - return [$this->current_generation, $this->current_doc_offset]; - } - /* if the current offset is before the first one, - or if gen pointer is less than 0 we are in an impossible position - */ - if ($this->current_offset < $this->start_offset|| - $this->generation_pointer <= -1) { - return -1; - } - $index = IndexManager::getIndex($this->index_name, false); - $index->setCurrentShard($this->current_generation, true); - $this->current_doc_offset = $index->getCurrentShard( - )->docOffsetFromPostingOffset($this->current_offset, false); - return [$this->current_generation, $this->current_doc_offset]; - } -} diff --git a/src/library/index_bundle_iterators/UnionIterator.php b/src/library/index_bundle_iterators/UnionIterator.php index b20a63a06..84d63c576 100644 --- a/src/library/index_bundle_iterators/UnionIterator.php +++ b/src/library/index_bundle_iterators/UnionIterator.php @@ -105,6 +105,16 @@ class UnionIterator extends 
IndexBundleIterator } $doc_block = $this->currentDocsWithWord(); } + /** + * Returns the traversal direction of this iterator, taken from its first child iterator; self::ASCENDING if it has none + */ + public function getDirection() + { + if (!empty($this->index_bundle_iterators[0])) { + return $this->index_bundle_iterators[0]->getDirection(); + } + return self::ASCENDING; + } /** * Returns the iterators to the first document block that it could iterate * over diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index bc80db2c1..53f9d1f78 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -107,12 +107,12 @@ class WordIterator extends IndexBundleIterator */ public $current_offset; /** - * Starting Offset of word occurence in the IndexShard + * Starting Offset of word occurrence in the IndexShard * @var int */ public $start_offset; /** - * Last Offset of word occurence in the IndexShard + * Last Offset of word occurrence in the IndexShard * @var int */ public $last_offset; @@ -148,19 +148,26 @@ class WordIterator extends IndexBundleIterator * of edited and deleted search results * @param int $results_per_block the maximum number of results that can * be returned by a findDocsWithWord call + * @param int $direction the order in which results accessed from + * $index_name should be presented. self::ASCENDING is from first + * added to last added, self::DESCENDING is from last added to first + * added. Note: this value is not saved permanently. 
So you + * could in theory open two read only versions of the same bundle but + * reading the results in different directions */ public function __construct($word_key, $shift, $index_name, $raw = false, $filter = null, $results_per_block = - IndexBundleIterator::RESULTS_PER_BLOCK) + IndexBundleIterator::RESULTS_PER_BLOCK, $direction=self::ASCENDING) { if ($raw == false) { //get rid of out modified base64 encoding $word_key = L\unbase64Hash($word_key); } + $this->direction = $direction; $this->filter = $filter; $this->word_key = $word_key; $this->shift = $shift; - $this->index_name = $index_name; + $this->index_name = $index_name; list($this->num_docs, $this->dictionary_info) = IndexManager::getWordInfo($index_name, $word_key, $shift, -1, -1, C\NUM_DISTINCT_GENERATIONS, true); @@ -181,11 +188,20 @@ class WordIterator extends IndexBundleIterator $this->current_doc_offset = null; $this->results_per_block = $results_per_block; $this->current_block_fresh = false; - $this->start_generation = 0; + // ascending starts at generation 0, descending at the last generation + $this->start_generation = ($direction == self::ASCENDING) ? 0 : + $this->num_generations - 1; if ($this->dictionary_info !== false) { $this->reset(); } } + /** + * Returns the direction (self::ASCENDING or self::DESCENDING) that was passed to this iterator's constructor + */ + public function getDirection() + { + return $this->direction; + } /** * Resets the iterator to the first document block that it could iterate * over @@ -204,16 +220,26 @@ class WordIterator extends IndexBundleIterator $this->no_more_generations = ($this->num_generations < C\NUM_DISTINCT_GENERATIONS); } + $info = ($this->direction == self::ASCENDING) ? 
+ $this->dictionary_info[0] : $this->dictionary_info[ + $this->num_generations - 1]; list($this->current_generation, $this->start_offset, - $this->last_offset, ) - = $this->dictionary_info[0]; + $this->last_offset, ) = $info; } else { $this->start_offset = 0; $this->last_offset = -1; $this->num_generations = -1; } - $this->current_offset = $this->start_offset; - $this->generation_pointer = 0; + if ($this->direction == self::ASCENDING) { + $this->current_offset = $this->start_offset; + $this->generation_pointer = 0; + } else { + $this->current_offset = $this->last_offset; + /* reset pointer to the number of gens, which in reverse is the + first one we want + */ + $this->generation_pointer = $this->num_generations - 1; + } $this->count_block = 0; $this->seen_docs = 0; $this->current_doc_offset = null; @@ -229,9 +255,18 @@ class WordIterator extends IndexBundleIterator if ($this->empty) { return -1; } - if ($this->generation_pointer == $this->num_generations - 1 && - $this->current_offset > $this->last_offset) { - return -1; + $ascending = ($this->direction == self::ASCENDING); + if ($ascending) { + if ($this->generation_pointer == $this->num_generations - 1 && + $this->current_offset > $this->last_offset) { + return -1; + } + } else { + if (($this->generation_pointer >= $this->num_generations) + || ($this->generation_pointer == 0 && + $this->current_offset < $this->start_offset)) { + return -1; + } } $pre_results = []; if (!$this->empty) { @@ -242,7 +277,7 @@ class WordIterator extends IndexBundleIterator $shard = $index->getCurrentShard(); $pre_results = $shard->getPostingsSlice($this->start_offset, $this->next_offset, $this->last_offset, - $this->results_per_block); + $this->results_per_block, $this->direction); } $results = []; $doc_key_len = IndexShard::DOC_KEY_LEN; @@ -277,12 +312,27 @@ class WordIterator extends IndexBundleIterator public function advanceSeenDocs() { if ($this->current_block_fresh != true) { - $num_docs = min($this->results_per_block, - 
IndexShard::numDocsOrLinks($this->next_offset, - $this->last_offset)); + if ($this->direction == self::ASCENDING) { + $num_docs = min($this->results_per_block, + IndexShard::numDocsOrLinks($this->next_offset, + $this->last_offset)); + $delta_sign = 1; + } else { + $total_guess = IndexShard::numDocsOrLinks($this->next_offset, + $this->start_offset); + $num_docs = $total_guess % $this->results_per_block; + if ($num_docs == 0) { + $num_docs = $this->results_per_block; + } else { + $num_docs = IndexShard::numDocsOrLinks($this->start_offset, + $this->last_offset) % $this->results_per_block; + } + $delta_sign = -1; + } $this->next_offset = $this->current_offset; - $this->next_offset += IndexShard::POSTING_LEN * $num_docs; - if ($num_docs < 0) { + $this->next_offset += $delta_sign * + IndexShard::POSTING_LEN * $num_docs; + if ($num_docs <= 0) { return; } } else { @@ -304,24 +354,33 @@ class WordIterator extends IndexBundleIterator $this->plainAdvance(); return; } + $is_ascending = ($this->direction == self::ASCENDING); $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord(); if ($cur_gen_doc_offset == -1 || $this->genDocOffsetCmp($cur_gen_doc_offset, - $gen_doc_offset) >= 0) { + $gen_doc_offset, $this->direction) >= 0) { return; } $this->plainAdvance(); - if ($this->current_generation < $gen_doc_offset[0]) { + $advance_check = ($is_ascending) ? 
+ ($this->current_generation < $gen_doc_offset[0]) : + ($this->current_generation > $gen_doc_offset[0]); + if ($advance_check) { $this->advanceGeneration($gen_doc_offset[0]); $this->next_offset = $this->current_offset; } $index = IndexManager::getIndex($this->index_name); $index->setCurrentShard($this->current_generation, true); $shard = $index->getCurrentShard(); - $last = $this->last_offset; + if ($is_ascending) { + $end_point = $this->last_offset; + } else { + $end_point = $this->start_offset; + } if ($this->current_generation == $gen_doc_offset[0]) { $offset_pair = $shard->nextPostingOffsetDocOffset( - $this->next_offset, $last, $gen_doc_offset[1]); + $this->next_offset, $end_point, $gen_doc_offset[1], + $this->direction); if ($offset_pair === false) { $this->advanceGeneration(); $this->next_offset = $this->current_offset; @@ -341,13 +400,20 @@ class WordIterator extends IndexBundleIterator { $this->advanceSeenDocs(); $this->current_doc_offset = null; - if ($this->current_offset < $this->next_offset) { + $is_ascending = ($this->direction == self::ASCENDING); + $update_check = ($is_ascending) ? + ($this->current_offset < $this->next_offset) : + ($this->current_offset > $this->next_offset); + if ($update_check) { $this->current_offset = $this->next_offset; } else { $this->advanceGeneration(); $this->next_offset = $this->current_offset; } - if ($this->current_offset > $this->last_offset) { + $update_check = ($is_ascending) ? + ($this->current_offset > $this->last_offset) : + ($this->current_offset < $this->start_offset); + if ($update_check) { $this->advanceGeneration(); $this->next_offset = $this->current_offset; } @@ -363,38 +429,63 @@ class WordIterator extends IndexBundleIterator if ($generation === null) { $generation = $this->current_generation; } + $is_ascending = ($this->direction == self::ASCENDING); do { - if ($this->generation_pointer < $this->num_generations) { - $this->generation_pointer++; + $gen_check = ($is_ascending) ? 
+ ($this->generation_pointer < $this->num_generations) : + ($this->generation_pointer >= 0); + if ($gen_check) { + if ($is_ascending) { + $this->generation_pointer++; + } else { + $this->generation_pointer--; + } } - if ($this->generation_pointer < $this->num_generations) { + $gen_check = ($is_ascending) ? + $this->generation_pointer < $this->num_generations : + $this->generation_pointer >= 0; + if ($gen_check) { list($this->current_generation, $this->start_offset, $this->last_offset, ) = $this->dictionary_info[$this->generation_pointer]; - $this->current_offset = $this->start_offset; + $this->current_offset = ($is_ascending) ? $this->start_offset: + $this->last_offset; } - if (!$this->no_more_generations && - $this->current_generation < $generation && - $this->generation_pointer >= $this->num_generations) { - list($estimated_remaining_total, $info) = - IndexManager::getWordInfo($this->index_name, - $this->word_key, 0, -1, $this->num_generations, - C\NUM_DISTINCT_GENERATIONS, true); - if (count($info) > 0) { - $this->num_docs = $this->seen_docs + - $estimated_remaining_total; - ksort($info); - $this->dictionary_info = array_merge($this->dictionary_info, - array_values($info)); - $this->num_generations = count($this->dictionary_info); - $this->no_more_generations = - count($info) < C\NUM_DISTINCT_GENERATIONS; - //will increment back to where were next loop - $this->generation_pointer--; + if (!$this->no_more_generations) { + $gen_check = ($is_ascending) ? 
+ ($this->current_generation < $generation && + $this->generation_pointer >= $this->num_generations) : + ($this->current_generation > $generation && + $this->generation_pointer <= 0); + if ($gen_check) { + list($estimated_remaining_total, $info) = + IndexManager::getWordInfo($this->index_name, + $this->word_key, 0, -1, $this->num_generations, + C\NUM_DISTINCT_GENERATIONS, true); + if (count($info) > 0) { + $this->num_docs = $this->seen_docs + + $estimated_remaining_total; + ksort($info); + $this->dictionary_info = array_merge( + $this->dictionary_info, array_values($info)); + $this->num_generations = count($this->dictionary_info); + $this->no_more_generations = + count($info) < C\NUM_DISTINCT_GENERATIONS; + //will increment back to where were next loop + if ($is_ascending) { + $this->generation_pointer--; + } else { + $this->generation_pointer++; + } + } } } - } while($this->current_generation < $generation && - $this->generation_pointer < $this->num_generations); + $gen_check = ($is_ascending) ? + ($this->current_generation < $generation && + $this->generation_pointer < $this->num_generations) : + ($this->current_generation > $generation && + $this->generation_pointer >= 0); + } while($gen_check); } /** * Gets the doc_offset and generation for the next document that @@ -408,8 +499,13 @@ class WordIterator extends IndexBundleIterator if ($this->current_doc_offset !== null) { return [$this->current_generation, $this->current_doc_offset]; } - if ($this->current_offset > $this->last_offset || - $this->generation_pointer >= $this->num_generations) { + $is_ascending = ($this->direction == self::ASCENDING); + $offset_check = ($is_ascending) ? 
+ ($this->current_offset > $this->last_offset || + $this->generation_pointer >= $this->num_generations) : + ($this->current_offset < $this->start_offset|| + $this->generation_pointer <= -1); + if ($offset_check) { return -1; } $index = IndexManager::getIndex($this->index_name); diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php index ad067906e..b08b57346 100644 --- a/src/library/media_jobs/FeedsUpdateJob.php +++ b/src/library/media_jobs/FeedsUpdateJob.php @@ -606,9 +606,8 @@ class FeedsUpdateJob extends MediaJob $prune_shard_name = C\WORK_DIRECTORY . "/feeds/prune_index"; $dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name; $info['DESCRIPTION'] = "feed"; - $info[self::DIRECTION] = self::BACKWARD; $index_archive = new IndexArchiveBundle($dir, false, - serialize($info), C\NUM_DOCS_PER_GENERATION); + serialize($info), C\NUM_DOCS_PER_GENERATION, self::DESCENDING); $this->db->setWorldPermissionsRecursive($dir); $prune_shard = new IndexShard($prune_shard_name); $too_old = $time - $age; diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini index bb0bec899..c7dd4168a 100755 --- a/src/locale/ar/configure.ini +++ b/src/locale/ar/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "عدد من النتائج" social_component_del_frag = "" social_component_weight = "الوزن" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "إجراءات" social_component_add_query = "إضافة استعلام" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "يمزج المتاحة" mixcrawls_view_name = "الاسم" mixcrawls_view_definition = "تعريف" mixcrawls_view_actions = "إجراءات" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "مزيج لم أية مكونات" mixcrawls_view_edit = "تحرير" mixcrawls_set_index = "تعيين كفهرس" diff --git a/src/locale/bn/configure.ini 
b/src/locale/bn/configure.ini index f0e4d09b9..35362a452 100755 --- a/src/locale/bn/configure.ini +++ b/src/locale/bn/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini index e828a61a7..4ecc9b7e2 100755 --- a/src/locale/de/configure.ini +++ b/src/locale/de/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini index d0a4a5dd4..5eb6404a3 100644 --- a/src/locale/en_US/configure.ini +++ b/src/locale/en_US/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "Results Shown" social_component_del_frag = "Remove" social_component_weight = "Weight" social_component_name = "Name" +social_component_order = "Order" +social_component_ascending = "Ascending" 
+social_component_descending = "Descending" social_component_add_keywords = "Keywords" social_component_actions = "Actions" social_component_add_query = "Add Query" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "Mixes" mixcrawls_view_name = "Name" mixcrawls_view_definition = "Definition" mixcrawls_view_actions = "Actions" +mixcrawls_view_ascending = "ASC" +mixcrawls_view_descending = "DESC" mixcrawls_view_no_components = "Mix has no components yet" mixcrawls_view_edit = "Edit" mixcrawls_set_index = "Set as Index" diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini index 1319a8d0f..416ab87a1 100755 --- a/src/locale/es/configure.ini +++ b/src/locale/es/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "Número de Resultados" social_component_del_frag = "" social_component_weight = "Tamaño" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "Acciones" social_component_add_query = "Agregar consulta" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini index 3d404ca7f..1e30f1801 100755 --- a/src/locale/fa/configure.ini +++ b/src/locale/fa/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "تعداد نتایج" social_component_del_frag = "" social_component_weight = "وزن" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "فرمان‌ها" social_component_add_query = "پُرسمان اضافه کن" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "ترکیب‌های 
موجود" mixcrawls_view_name = "نام" mixcrawls_view_definition = "تعریف" mixcrawls_view_actions = "فرمان‌ها" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "ترکیب هنوز هیچ جزئی ندارد" mixcrawls_view_edit = "ویرایش" mixcrawls_set_index = "برای نمایه قرار بده" diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini index 4aea36a46..8ef5dfd51 100755 --- a/src/locale/fr_FR/configure.ini +++ b/src/locale/fr_FR/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini index a96de60e3..d1c15118a 100755 --- a/src/locale/he/configure.ini +++ b/src/locale/he/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini index 
a04b25daf..646a844de 100755 --- a/src/locale/hi/configure.ini +++ b/src/locale/hi/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/in_ID/configure.ini b/src/locale/in_ID/configure.ini index 80cee686d..b1d16423c 100755 --- a/src/locale/in_ID/configure.ini +++ b/src/locale/in_ID/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini index bc58f102e..0198f96cf 100755 --- a/src/locale/it/configure.ini +++ b/src/locale/it/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "Numero di risultati" social_component_del_frag = "" social_component_weight = "Peso" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = 
"" social_component_actions = "Azioni" social_component_add_query = "Aggiungi Ricerca" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "Unioni disponibili" mixcrawls_view_name = "Nome" mixcrawls_view_definition = "Definizione" mixcrawls_view_actions = "Azioni" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "L'Unione non ha ancora componenti" mixcrawls_view_edit = "Modifica" mixcrawls_set_index = "Usa come Indice" diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini index cf0b0ce37..8afb4539e 100755 --- a/src/locale/ja/configure.ini +++ b/src/locale/ja/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini index 278eb5121..8ca931740 100755 --- a/src/locale/kn/configure.ini +++ b/src/locale/kn/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "ಫಲಿತಾಂಶಗಳ ಸಂಖ್ಯೆ" social_component_del_frag = "" social_component_weight = "ಗೌರವ" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "ಕ್ರಿಯೆಗಳು" social_component_add_query = "ಪ್ರಶ್ನೆಯನ್ನು ಸೇರಿಸು" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending 
= "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini index 45105c7cd..57f653260 100755 --- a/src/locale/ko/configure.ini +++ b/src/locale/ko/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini index 53b7146e8..1d19b99d8 100644 --- a/src/locale/nl/configure.ini +++ b/src/locale/nl/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "resultaten getoond" social_component_del_frag = "verwijderen" social_component_weight = "gewicht" social_component_name = "naam" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "trefwoorden" social_component_actions = "acties" social_component_add_query = "Zoekopdracht toevoegen" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "beschikbaar Mixes" mixcrawls_view_name = "naam" mixcrawls_view_definition = "definitie" mixcrawls_view_actions = "acties" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "Mix heeft nog geen componenten" mixcrawls_view_edit = "uitgeven" mixcrawls_set_index = "Instellen als Index" diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini index f59364fbb..8fd5db1dc 100755 --- 
a/src/locale/pl/configure.ini +++ b/src/locale/pl/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini index eaf93a394..7ed4a1ef6 100755 --- a/src/locale/pt/configure.ini +++ b/src/locale/pt/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini index e6306a324..548273d2a 100755 --- a/src/locale/ru/configure.ini +++ b/src/locale/ru/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ 
-1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini index fb32832d9..111582bde 100644 --- a/src/locale/te/configure.ini +++ b/src/locale/te/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "ఫలితాలను చూపించా social_component_del_frag = "తొలగించు" social_component_weight = "బరువు" social_component_name = "పేరు " +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "కీ పదాలు" social_component_actions = "యాక్సన్ లు" social_component_add_query = "క్వెరి జోడించుము" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "లభ్యంఅయ్యే మిక్ mixcrawls_view_name = "పేరు" mixcrawls_view_definition = "నిర్వచనం" mixcrawls_view_actions = "చర్యలు" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "ఎడిట్" mixcrawls_set_index = "" diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini index 2bb9935f9..d12b2ea56 100755 --- a/src/locale/th/configure.ini +++ b/src/locale/th/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff 
--git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini index 8c95ebd85..8e70eef27 100755 --- a/src/locale/tr/configure.ini +++ b/src/locale/tr/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "" social_component_del_frag = "" social_component_weight = "" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "" social_component_add_query = "" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini index 1eb5db270..8b708a342 100755 --- a/src/locale/vi_VN/configure.ini +++ b/src/locale/vi_VN/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "Số kết quả" social_component_del_frag = "" social_component_weight = "Trọng lượng" social_component_name = "" +social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "Hành động" social_component_add_query = "Cộng thêm truy vấn" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini index 631f5d3a3..104a5b131 100755 --- a/src/locale/zh_CN/configure.ini +++ b/src/locale/zh_CN/configure.ini @@ -335,6 +335,9 @@ social_component_num_results = "結果數量" social_component_del_frag = "" social_component_weight = "元素重量" social_component_name = "" 
+social_component_order = "" +social_component_ascending = "" +social_component_descending = "" social_component_add_keywords = "" social_component_actions = "元素活動" social_component_add_query = "增加查詢" @@ -1605,6 +1608,8 @@ mixcrawls_element_available_mixes = "" mixcrawls_view_name = "" mixcrawls_view_definition = "" mixcrawls_view_actions = "" +mixcrawls_view_ascending = "" +mixcrawls_view_descending = "" mixcrawls_view_no_components = "" mixcrawls_view_edit = "" mixcrawls_set_index = "" diff --git a/src/models/CrawlModel.php b/src/models/CrawlModel.php index be5060fa3..665f45e15 100755 --- a/src/models/CrawlModel.php +++ b/src/models/CrawlModel.php @@ -278,7 +278,7 @@ class CrawlModel extends ParallelModel $mix['FRAGMENTS'][$row['FRAGMENT_ID']]['RESULT_BOUND'] = $row['RESULT_BOUND']; } - $sql = "SELECT CRAWL_TIMESTAMP, WEIGHT, KEYWORDS ". + $sql = "SELECT CRAWL_TIMESTAMP, WEIGHT, DIRECTION, KEYWORDS ". " FROM MIX_COMPONENTS WHERE ". " TIMESTAMP=:timestamp AND FRAGMENT_ID=:fragment_id"; $params = [":timestamp" => $timestamp]; @@ -386,10 +386,10 @@ class CrawlModel extends ParallelModel $db->execute($sql, [$timestamp, $fid, $fragment_data['RESULT_BOUND']]); foreach ($fragment_data['COMPONENTS'] as $component) { - $sql = "INSERT INTO MIX_COMPONENTS VALUES (?, ?, ?, ?, ?)"; + $sql = "INSERT INTO MIX_COMPONENTS VALUES (?, ?, ?, ?, ?, ?)"; $db->execute($sql, [$timestamp, $fid, $component['CRAWL_TIMESTAMP'], $component['WEIGHT'], - $component['KEYWORDS']]); + $component['DIRECTION'], $component['KEYWORDS']]); } $fid++; } @@ -422,7 +422,7 @@ class CrawlModel extends ParallelModel $search_controller->clearQuerySavepoint($timestamp); $archive_dir = C\WORK_DIRECTORY."/schedules/". - self::name_archive_iterator.$timestamp; + self::name_archive_iterator . 
$timestamp; if (file_exists($archive_dir)) { $this->db->unlinkRecursive($archive_dir); } diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php index 51269b15d..04c37e079 100755 --- a/src/models/ParallelModel.php +++ b/src/models/ParallelModel.php @@ -305,7 +305,7 @@ class ParallelModel extends Model list($machine, $key, $index_name, $generation, $summary_offset) = $lookup_item; } - if (strcmp($index_name, "feed") != 0) { + if (strcmp($index_name, "db") != 0) { $index = IndexManager::getIndex($index_name); if (is_integer($summary_offset) && is_integer($generation)) { diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 2b5cdb604..34f377aa9 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -158,8 +158,11 @@ class PhraseModel extends ParallelModel if (isset($component['KEYWORDS'])) { $disjunct_string .= " ".$component['KEYWORDS']; } - $stamp = ($component['CRAWL_TIMESTAMP'] <= 1) ? - "" : " i:".$component['CRAWL_TIMESTAMP']; + $sign = ($component['DIRECTION'] == + self::ASCENDING) ? "" : "-"; + $stamp = ($component['CRAWL_TIMESTAMP'] <= 1) ?"" : + " " . $sign . "i:" . + $component['CRAWL_TIMESTAMP']; $rewrite .= $pipe2.$disjunct_string." w:". 
($component['WEIGHT'] * $base_weight).$stamp; $pipe2 = ' | '; @@ -722,24 +725,31 @@ class PhraseModel extends ParallelModel foreach ($meta_words as $meta_word) { $pattern = "/(\s)($meta_word(\S)+)/"; preg_match_all($pattern, $phrase, $matches); + if (empty($matches[2])) { + continue; + } if (!in_array($meta_word, - ['i:', 'index:', 'w:', 'weight:', '\-'])) { + ['i:', 'index:', '\-i:', '\-index:', 'w:', 'weight:', '\-'])) { $matches = $matches[2]; $found_metas = array_merge($found_metas, $matches); + } elseif ($meta_word == 'i:' || $meta_word == 'index:' || + $meta_word == '\-i:' || $meta_word == '\-index:') { + if (substr($meta_word, 0, 2) == '\-') { + $index_name = substr($matches[2][0], + strlen($meta_word) - 1); + $index_name = "-$index_name"; + } else { + $index_name = substr($matches[2][0], strlen($meta_word)); + } } elseif ($meta_word == '\-') { - if (count($matches[0]) > 0) { + if (count($matches[0]) > 0 && + strpos($matches[0][0], ':') === false) { foreach ($matches[2] as $disallowed) { $disallow_phrases[] = substr($disallowed, 1); } } - } elseif ($meta_word == 'i:' || $meta_word == 'index:') { - if (isset($matches[2][0])) { - $index_name = substr($matches[2][0], strlen($meta_word)); - } } elseif ($meta_word == 'w:' || $meta_word == 'weight:') { - if (isset($matches[2][0])) { - $weight = substr($matches[2][0], strlen($meta_word)); - } + $weight = substr($matches[2][0], strlen($meta_word)); } if (!empty($matches[0]) && is_string($matches[0]) && substr($matches[0], 0, 11) == 'media:image') { @@ -1668,8 +1678,15 @@ class PhraseModel extends ParallelModel } if (in_array(substr($current_key, 0, 9), $doc_iterate_hashes)) { + $actual_index_name = $index_name; + $direction = self::ASCENDING; + if (($index_name[0] == "-")) { + $actual_index_name = substr($index_name, 1); + $direction = self::DESCENDING; + } $word_iterators[$i] = new I\DocIterator( - $index_name, $filter, $to_retrieve); + $actual_index_name, $filter, $to_retrieve, + $direction); $min_group_override = 
true; } else { //can happen if exact phrase search suffix approach used @@ -1687,27 +1704,16 @@ class PhraseModel extends ParallelModel $distinct_key[1] : 0; $distinct_key_id = L\unbase64Hash( $distinct_key[0]); - $index = IndexManager::getIndex($index_name); - $archive_info = $index->getArchiveInfo( - $index->dir_name); - $description = unserialize( - $archive_info['DESCRIPTION']); - if (isset($description[self::DIRECTION])) { - $direction = - $description[self::DIRECTION]; - } else { - $direction = self::FORWARD; - } - // have to change index name for checking iterator - if ($direction == self::FORWARD) { - $tmp_word_iterators[$m] = - new I\WordIterator($distinct_key_id, $shift, - $index_name, true, $filter, $to_retrieve); - } else { - $tmp_word_iterators[$m] = new I\ReverseIterator( - $distinct_key_id, $shift, $index_name, true, - $filter, $to_retrieve); + $actual_index_name = $index_name; + $direction = self::ASCENDING; + if (($index_name[0] == "-")) { + $actual_index_name = substr($index_name, 1); + $direction = self::DESCENDING; } + $tmp_word_iterators[$m] = + new I\WordIterator($distinct_key_id, $shift, + $actual_index_name, true, $filter, $to_retrieve, + $direction); $sum += $tmp_word_iterators[$m]->num_docs; if ($tmp_word_iterators[$m]->dictionary_info !=[]) { $min_group_override = true; @@ -1736,11 +1742,19 @@ class PhraseModel extends ParallelModel $num_disallow_keys = count($disallow_keys); if ($num_disallow_keys > 0) { for ($i = 0; $i < $num_disallow_keys; $i++) { + $actual_index_name = $index_name; + $direction = self::ASCENDING; + if (($index_name[0] == "-")) { + $actual_index_name = substr($index_name, 1); + $direction = self::DESCENDING; + } /* notice for now shift always 0 - you can't disallow phrases */ $disallow_iterator = new I\WordIterator($disallow_keys[$i], 0, - $index_name, false, $filter); + $actual_index_name, false, $filter, + I\IndexBundleIterator::RESULTS_PER_BLOCK, + $direction); $word_iterators[$num_word_keys + $i] = new 
I\NegationIterator($disallow_iterator); } diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php index c6b2d3e50..24127d02a 100755 --- a/src/models/ProfileModel.php +++ b/src/models/ProfileModel.php @@ -300,7 +300,7 @@ class ProfileModel extends Model TIMESTAMP NUMERIC(" . C\TIMESTAMP_LEN . "), FRAGMENT_ID INTEGER, CRAWL_TIMESTAMP NUMERIC(".C\TIMESTAMP_LEN."), WEIGHT FLOAT, - KEYWORDS VARCHAR(" . C\TITLE_LEN . "), + DIRECTION INT, KEYWORDS VARCHAR(" . C\TITLE_LEN . "), PRIMARY KEY(TIMESTAMP, FRAGMENT_ID, CRAWL_TIMESTAMP) )", "MIX_FRAGMENTS" => "CREATE TABLE MIX_FRAGMENTS ( TIMESTAMP NUMERIC(" . C\TIMESTAMP_LEN . "),FRAGMENT_ID INTEGER, diff --git a/src/scripts/mix.js b/src/scripts/mix.js index fd2fa6da5..eb1017fc1 100644 --- a/src/scripts/mix.js +++ b/src/scripts/mix.js @@ -49,7 +49,7 @@ function drawFragments() var rcnt = 0; for (var ckey in fragment['components']) { var comp = fragment['components'][ckey]; - drawCrawl(fcnt, rcnt, comp[0], comp[1], comp[2], comp[3]); + drawCrawl(fcnt, rcnt, comp[0], comp[1], comp[2], comp[3], comp[4]); rcnt++; } fcnt++; @@ -110,7 +110,7 @@ function drawFragment(fragment_num, num_results) */ function makeBlankMixTable(tbl, num_fragments, num_results) { - var tdata = "<tr><td colspan=\"2\"><label for=\"add-crawls-"+num_fragments + + var tdata = "<tr><td colspan=\"3\"><label for=\"add-crawls-"+num_fragments + "\">"+tl['social_component_add_crawls']+"</label>"+ drawCrawlSelect(num_fragments)+"</td><td><label for=\"num-results-"+ num_fragments+"\">"+tl['social_component_num_results']+"</label>"+ @@ -119,6 +119,7 @@ function makeBlankMixTable(tbl, num_fragments, num_results) tl['social_component_del_frag']+'</a></td></tr>'+ "<tr><th>"+tl['social_component_weight']+'</th>'+ "<th>"+tl['social_component_name']+'</th>'+ + "<th>"+tl['social_component_order']+'</th>'+ "<th>"+tl['social_component_add_keywords']+'</th>'+ "<th>"+tl['social_component_actions']+"</th></tr>"; tbl.innerHTML = tdata; @@ -153,7 +154,7 @@ function 
addCrawlHandler(i) var name = ac.options[sel].text; var ts = ac.options[sel].value; ac.selectedIndex = 0; - addCrawl(i, ts, name, 1, ""); + addCrawl(i, ts, name, 1, 1, ""); } } /* @@ -162,15 +163,16 @@ function addCrawlHandler(i) * @param int i crawl fragment to add to * @param int ts timestamp of crawl that is being added * @param String name name of crawl - * @param float weight the crawl should ahve within fragment + * @param float weight the crawl should have within fragment + * @param int order iteration order for the crawl: 1 ascending, -1 descending * @param String keywords words to add to search when using this crawl */ -function addCrawl(i, ts, name, weight, keywords) +function addCrawl(i, ts, name, weight, order, keywords) { var frg = fragments[i]['components']; var j = frg.length; - fragments[i]['components'][j] = [ts, name, weight, keywords]; - drawCrawl(i, j, ts, name, weight, keywords) + fragments[i]['components'][j] = [ts, name, weight, order, keywords]; + drawCrawl(i, j, ts, name, weight, order, keywords) } /* * Draws a single crawl within a crawl fragment according to the passed @@ -180,16 +182,19 @@ function addCrawl(i, ts, name, weight, keywords) * @param int j index of crawl that is being added * @param int ts timestamp of crawl that is being drawn * @param String name name of crawl + * @param int order iteration order for the crawl: 1 ascending, -1 descending * @param float weight the crawl should ahve within fragment + * @param String keywords words to add to search when using this crawl * */ -function drawCrawl(i, j, ts, name, weight, keywords) +function drawCrawl(i, j, ts, name, weight, order, keywords) { var tr =document.createElement("tr"); tr.id = i+"-"+j; elt("mix-table-"+i).appendChild(tr); tr.innerHTML += "<td>"+drawWeightSelect(i, j, weight)+"</td><td>"+name+ + "</td><td>"+drawOrderSelect(i, j, order)+ "</td><td><input type='hidden' name= \"mix[FRAGMENTS]["+i+ "][COMPONENTS]["+j+"][CRAWL_TIMESTAMP]\"' value=\""+ts+"\" />"+ "<input
title=\""+tl['social_component_add_query']+"\" "+ @@ -257,6 +262,29 @@ function drawWeightSelect(i, j, selected_weight) select += "</select>"; return select; } +/* + * Used to draw the select drop down to allow users to select a order of + * a given crawl within a crawl fragment should be iterated through at search + * time + * + * @param int i which crawl fragment the crawl belongs to + * @param int j which crawl index within the fragment to draw this weight select + * for + * @param int selected_order -1 == descending, 1 == ascending + */ +function drawOrderSelect(i, j, selected_order) +{ + var asc_select = (selected_order == 1) ? " selected='selected' " : ""; + var desc_select = (selected_order == -1) ? " selected='selected' " : ""; + var select = + "<select name=\'mix[FRAGMENTS]["+i+"][COMPONENTS]["+j+"][DIRECTION]\'>"; + select += "<option value=\'1\' " + asc_select +">" + + tl['social_component_ascending'] + "</option>"; + select += "<option value=\'-1\' " + desc_select + ">" + + tl['social_component_descending'] + "</option>"; + select += "</select>"; + return select; +} /* * Used to draw the select drop down to allow users to select a crawl to be * added to a crawl fragment diff --git a/src/views/elements/MixcrawlsElement.php b/src/views/elements/MixcrawlsElement.php index e36b3bb37..a63ac2370 100644 --- a/src/views/elements/MixcrawlsElement.php +++ b/src/views/elements/MixcrawlsElement.php @@ -117,19 +117,21 @@ class MixcrawlsElement extends Element count($fragment_data['COMPONENTS']) == 0) { continue; } - e(" #".$fragment_data['RESULT_BOUND']."["); + e(" #".$fragment_data['RESULT_BOUND']."{"); $plus = ""; foreach ($fragment_data['COMPONENTS'] as $component) { - $crawl_timestamp = - $component['CRAWL_TIMESTAMP']; + $crawl_timestamp = $component['CRAWL_TIMESTAMP']; + $order = ($component['DIRECTION'] > 0) ? + tl('mixcrawls_view_ascending') : + tl('mixcrawls_view_descending'); e($plus . $component['WEIGHT']." * (". 
$data['available_crawls'][ - $crawl_timestamp]." + K:". + $crawl_timestamp]."[$order] + K:". $component['KEYWORDS'].")"); $plus = "<br /> + "; } - e("]<br />"); + e("}<br />"); } } else { e(tl('mixcrawls_view_no_components')); diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php index 2d1cf11b3..5b8279838 100644 --- a/tests/IndexShardTest.php +++ b/tests/IndexShardTest.php @@ -37,7 +37,6 @@ use seekquarry\yioop\library\IndexShard; use seekquarry\yioop\library\LinearAlgebra as LA; use seekquarry\yioop\library\UnitTest; use seekquarry\yioop\library\index_bundle_iterators\WordIterator; -use seekquarry\yioop\library\index_bundle_iterators\ReverseIterator; use seekquarry\yioop\library\IndexManager; /** @@ -61,7 +60,7 @@ class IndexShardTest extends UnitTest $this->test_objects['shard3'] = new IndexShard(C\WORK_DIRECTORY. "/shard3.txt", 0); $this->test_objects['shard4'] = new IndexShard(C\WORK_DIRECTORY. - "/shard4.txt", 0, C\NUM_DOCS_PER_GENERATION, false, false); + "/shard4.txt", 0); } /** * Deletes any index shard files we may have created @@ -151,9 +150,9 @@ class IndexShardTest extends UnitTest "Doc lookup by meta word works has correct count"); } /** - * Check if can store documents into a reverse index shard and retrieve them - * Shard is just a normal regular IndexShard, while Shard4 sets the - * additional flag which makes everything go in reverse + * Check if can iterate over posting slices in the reverse direction + * To do this we construct two identical shards. 
We go over 'shard' + * ascendingly, while we go over 'shard4' descendingly and compare */ public function addDocumentsGetPostingsSliceReverseTestCase() { @@ -291,92 +290,44 @@ class IndexShardTest extends UnitTest $this->assertTrue(isset($forward[$docid]), "Doc lookup by word works for shard"); $backward = $this->test_objects['shard4']->getPostingsSliceById( - L\crawlHashWord('the', true), 5); + L\crawlHashWord('the', true), 5, IndexShard::DESCENDING); $this->assertTrue(isset($backward[$docid]), "Doc lookup by word works for shard4"); $this->assertEqual($forward, $backward, "Both only have one document with this word"); $info = $this->test_objects['shard']->getWordInfo( L\crawlHashWord('CCCCCCCC', true), true); - list($first_offset, $last_offset, - $num_docs_or_links) = $info; + list($first_offset, $last_offset, $num_docs_or_links) = $info; $this->assertEqual($first_offset, 36, - "First offset set correctly"); + "First posting offset for CCCCCCCC set correctly"); $this->assertEqual($last_offset, 40, - "Second offset set correctly"); + "Second posting offset for CCCCCCCC set correctly"); + $this->assertEqual($num_docs_or_links, 2, + "Term CCCCCCCC appears in the correct number of documents"); $forward = $this->test_objects['shard']->nextPostingOffsetDocOffset( $first_offset, $last_offset, 5); + $this->assertEqual($forward[0], 36, + "Search ascending finds correct next posting offset"); $backward = $this->test_objects['shard4']->nextPostingOffsetDocOffset( - $first_offset, $last_offset, 5); + $first_offset, $last_offset, 37, IndexShard::DESCENDING); + $this->assertEqual($forward[0], 36, + "Search descending finds correct next posting offset"); + /* + Now we check posting slices between ascending descending are + reversed + */ $forward = $this->test_objects['shard']->getPostingsSlice($first_offset, - $first_offset, $last_offset, 5); + $first_offset, $last_offset, $num_docs_or_links); // have to reset offset values, since getPostingsSlice modifies by ref $info = 
$this->test_objects['shard4']->getWordInfo( L\crawlHashWord('CCCCCCCC', true), true); - list($first_offset, $last_offset, - $num_docs_or_links) = $info; - $backward = $this->test_objects['shard4']->getPostingsSlice($first_offset, - $last_offset, $last_offset, 5, false); - $reversed = array_reverse($backward); - $this->assertEqual($forward, $backward, - "ReverseIndexShard returns a flipped version off a forward one"); - $word = "media:news"; - list($hash_key, $shift) = L\allCrawlHashPaths($word, true)[0]; - $index_name = 1573453725; - $index_name = 1575422839; - $index_archive_name = "IndexData" . $index_name; - $index_archive_name = "IndexDataFeed"; - $index_name = "feed"; - $results_limit = 200; - $total_results = 0; - if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) { - $info = IndexManager::getWordInfo($index_name, $hash_key, $shift, - -1, 0, -1); - $this->assertTrue(isset($info[0][4])); - $forward = []; - if (isset($info[0][4])) { - $word_iterator = new WordIterator($info[0][4], 0, $index_name, - true, null, $results_limit); - $forward_offsets = []; - $offset = $word_iterator->currentGenDocOffsetWithWord(); - array_push($forward_offsets, $offset); - while($offset != -1){ - $word_iterator->advance(); - $offset = $word_iterator->currentGenDocOffsetWithWord(); - array_push($forward_offsets, $offset); - } - foreach ($norm_docs as $k => $v) { - $item['bn'] = $v['bn']; - $item['U'] = $v['U']; - $forward[] = $item; - } - $for_results = count($forward_offsets); - } - $backward = []; - $info = IndexManager::getWordInfo($index_name, $hash_key, $shift, - -1, 0, -1); - $this->assertTrue(isset($info[0][4])); - if (isset($info[0][4])) { - $word_rev_iterator = new ReverseIterator($info[0][4], 0, - $index_name, true, null, $results_limit); - $backward_offsets = []; - $offset = $word_rev_iterator->currentGenDocOffsetWithWord(); - array_push($backward_offsets, $offset); - while($offset != -1){ - $word_rev_iterator->advance(); - $offset = 
$word_rev_iterator->currentGenDocOffsetWithWord(); - array_push($backward_offsets, $offset); - } - $reversed = array_reverse($backward_offsets); - foreach ($rev_docs as $k => $v) { - $item['bn'] = $v['bn']; - $item['U'] = $v['U']; - $backward[] = $item; - } - $backward = array_reverse($backward); - $back_results = count($reversed); - } - } + list($first_offset, $last_offset, $num_docs_or_links) = $info; + $backward = $this->test_objects['shard4']->getPostingsSlice( + $first_offset, $last_offset, $last_offset, $num_docs_or_links, + IndexShard::DESCENDING); + $this->assertEqual(array_keys($forward), + array_reverse(array_keys($backward)), + "DESCENDING Slice returns a reversed version off a ASCENDING one"); } /** * Check if can store link documents into an index shard and retrieve them