viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/configs/Config.php b/src/configs/Config.php index b22405e87..895273dc7 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -158,7 +158,7 @@ function nsconddefine($constant, $value) * Version number for upgrade database function * @var int */ -nsdefine('DATABASE_VERSION', 68); +nsdefine('DATABASE_VERSION', 67); /** * Minimum Version fo Yioop for which keyword ad script * still works with this version diff --git a/src/configs/Createdb.php b/src/configs/Createdb.php index a3c2918b2..897333a13 100755 --- a/src/configs/Createdb.php +++ b/src/configs/Createdb.php @@ -153,9 +153,9 @@ $sql ="INSERT INTO USERS VALUES (".PUBLIC_USER_ID.", 'all', 'all','public', $db->execute($sql); //default public group with group id 1 $creation_time = L\microTimestamp(); -$sql = "INSERT INTO GROUPS VALUES(".PUBLIC_GROUP_ID.",'Public','". - $creation_time."','".ROOT_ID."', '".PUBLIC_JOIN."', '".GROUP_READ. - "', ".NON_VOTING_GROUP.", " . FOREVER . ", 0)"; +$sql = "INSERT INTO GROUPS VALUES(".PUBLIC_GROUP_ID.",'Public','" . + $creation_time . "','".ROOT_ID."', '" .PUBLIC_JOIN . "', '" . GROUP_READ . + "', " . NON_VOTING_GROUP.", " . FOREVER . ", 0)"; $db->execute($sql); $now = time(); $db->execute("INSERT INTO ROLE VALUES (" . ADMIN_ROLE . ", 'Admin' )"); @@ -164,9 +164,9 @@ $db->execute("INSERT INTO ROLE VALUES (".BOT_ROLE.", 'Bot User' )"); $db->execute("INSERT INTO USER_ROLE VALUES (" . ROOT_ID . ", " . ADMIN_ROLE . ")"); $db->execute("INSERT INTO USER_GROUP VALUES (" . ROOT_ID . ", ". - PUBLIC_GROUP_ID.", ".ACTIVE_STATUS.", $now)"); + PUBLIC_GROUP_ID.", " . ACTIVE_STATUS . ", $now)"); $db->execute("INSERT INTO USER_GROUP VALUES (".PUBLIC_USER_ID.", ". - PUBLIC_GROUP_ID.", ".ACTIVE_STATUS.", $now)"); + PUBLIC_GROUP_ID.", " . ACTIVE_STATUS . ", $now)"); //Create a Group for Wiki HELP. $sql = "INSERT INTO GROUPS VALUES (" . HELP_GROUP_ID . ",'Help','" . $creation_time . "','" . ROOT_ID . "', @@ -490,7 +490,7 @@ $db->execute("INSERT INTO MIX_COMPONENTS VALUES( 3, 0, 1, 1, 'media:video')"); $db->execute("INSERT INTO CRAWL_MIXES VALUES (4, 'news', ".ROOT_ID.", -1)"); $db->execute("INSERT INTO MIX_FRAGMENTS VALUES(4, 0, 1)"); -$db->execute("INSERT INTO MIX_COMPONENTS VALUES(4, 0, 1, 1, +$db->execute("INSERT INTO MIX_COMPONENTS VALUES(4, 0, 100, 1, 'media:news')"); $db->execute("INSERT INTO SUBSEARCH VALUES('db_subsearch_images', 'images','m:2', 50, '')"); diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 8c5dcd5a0..73846ca5e 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -282,8 +282,8 @@ class ArcTool implements CrawlConstants $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; if ($bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; - } else if ($bundle_name == "IndexDataNewsFeed") { - $index_timestamp = "NewsFeed"; + } else if ($bundle_name == "IndexDataFeed") { + $index_timestamp = "feed"; } $hash_paths = L\allCrawlHashPaths($word, true); $found = false; @@ -373,8 +373,8 @@ class ArcTool implements CrawlConstants $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; if ($bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; - } else if ($bundle_name == "IndexDataNewsFeed") { - $index_timestamp = "NewsFeed"; + } else if ($bundle_name == "IndexDataFeed") { + $index_timestamp = "feed"; } $index = IndexManager::getIndex($index_timestamp); $index->setCurrentShard($generation); @@ -493,8 +493,8 @@ class ArcTool implements CrawlConstants $index_timestamp = (isset($matches[0])) ? $matches[0] : 0; if ($bundle_num >= 0) { $index_timestamp .= "-$bundle_num"; - } else if ($bundle_num = "IndexDataNewsFeed") { - $index_timestamp = "NewsFeed"; + } else if ($bundle_num = "IndexDataFeed") { + $index_timestamp = "feed"; } $index = IndexManager::getIndex($index_timestamp); $index->setCurrentShard($generation, true); diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index 75d41fcbd..1546388a9 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -60,7 +60,7 @@ interface CrawlConstants const robot_data_base_name = "RobotData"; const etag_expires_data_base_name = "EtagExpiresData"; const index_data_base_name = "IndexData"; - const feed_index_data_base_name = "IndexDataNewsFeed"; + const feed_index_data_base_name = "IndexDataFeed"; const double_index_base_name = "DoubleIndexData"; const network_base_name = "Network"; const network_crawllist_base_name = "NetworkCrawlList"; @@ -74,6 +74,11 @@ interface CrawlConstants const robot_table_name = "robot_table.txt"; const mirror_table_name = "mirror_table.txt"; const local_ip_cache_file = "local_ip_cache.txt"; + /** used for word iterator direction */ + const FORWARD = 1; + const BACKWARD = -1; + /** media feed index archive bundle timestamp */ + const FEED_CRAWL_TIME = 100; /** Used in priority queue*/ const MAX = 1; const MIN = -1; @@ -113,6 +118,7 @@ interface CrawlConstants const WIDTH = 'C'; const ROBOTS_TXT = 'D'; const DEBUG = "E"; + const DIRECTION = "F"; // codes available here const DOC_DEPTH = 'M'; const DOC_RANK = 'N'; diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index 167d98b6e..88736b1a0 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -317,7 +317,8 @@ class IndexArchiveBundle implements CrawlConstants * merge dictionary side effects * @return object the currently being index shard */ - public function getCurrentShard($force_read = false, $forward = true) + public function getCurrentShard($force_read = false, + $direction = self::FORWARD) { if (!isset($this->current_shard)) { if (!isset($this->generation_info['CURRENT'])) { @@ -331,7 +332,7 @@ class IndexArchiveBundle implements CrawlConstants $this->current_shard = new IndexShard( $current_index_shard_file, $this->generation_info['CURRENT'], - $this->num_docs_per_generation, true, $forward); + $this->num_docs_per_generation, true, $direction); $this->current_shard->getShardHeader($force_read); $this->current_shard->read_only_from_disk = true; } else { @@ -415,7 +416,9 @@ class IndexArchiveBundle implements CrawlConstants public function countWordKeys($word_keys) { $words_array = []; - if (!is_array($word_keys) || count($word_keys) < 1) { return null;} + if (!is_array($word_keys) || count($word_keys) < 1) { + return null; + } foreach ($word_keys as $word_key) { $tmp = $this->dictionary->getWordInfo($word_key); if ($tmp === false) { diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index 95df96176..1a7f87715 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -62,13 +62,13 @@ class IndexManager implements CrawlConstants const INDEX_CACHE_SIZE = 1000; /** * Returns a reference to the managed copy of an IndexArchiveBundle object - * with a given timestamp or an IndexShard in the case where - * $index_name == "feed" (for handling media feeds) + * with a given timestamp or feed (for handling media feeds) * * @param string $index_name timestamp of desired IndexArchiveBundle + * @param int $direction * @return object the desired IndexArchiveBundle reference */ - public static function getIndex($index_name, $forward_direction = true) + public static function getIndex($index_name, $direction = self::FORWARD) { $index_name = trim($index_name); //trim to fix postgres quirkiness if (empty(self::$indexes[$index_name]) || @@ -76,65 +76,53 @@ class IndexManager implements CrawlConstants ($index_name == "feed" || php_sapi_name() == 'cli') && (time() - self::$index_times[$index_name]) > C\MIN_QUERY_CACHE_TIME) ) { - if ($index_name == "feed") { - $index_file = C\WORK_DIRECTORY . "/feeds/index"; - if (file_exists($index_file)) { - self::$indexes[$index_name] = new IndexShard( - $index_file, 0, C\NUM_DOCS_PER_GENERATION, true); - self::$index_times["feed"] = time(); - } else { + if ($index_name == "feed" || $index_name == self::FEED_CRAWL_TIME) { + $index_archive_name = self::feed_index_data_base_name; + $index_name = "feed"; + } else { + $index_archive_name = self::index_data_base_name . $index_name; + } + if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) { + $tmp = new IndexArchiveBundle( + C\CRAWL_DIR . '/cache/' . $index_archive_name, null, + C\NUM_DOCS_PER_GENERATION, $direction); + if (!$tmp) { return false; } } else { - if ($index_name == "NewsFeed") { - $index_archive_name = self::feed_index_data_base_name; - $index_name = 13; - } else { - $index_archive_name = self::index_data_base_name . $index_name; + $tmp = false; + $use_name = $index_name; + $serve_archive = -1; + if (preg_match("/\-\d$/", $index_name)) { + $serve_archive = substr($index_name, -1); + $use_name = substr($index_name, 0, -2); } - $index_archive_name = self::index_data_base_name . $index_name; - if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) { - $tmp = new IndexArchiveBundle( - C\CRAWL_DIR.'/cache/' . $index_archive_name, null, - C\NUM_DOCS_PER_GENERATION, $forward_direction); - if (!$tmp) { - return false; - } - } else { - $tmp = false; - $use_name = $index_name; - $serve_archive = -1; - if (preg_match("/\-\d$/", $index_name)) { - $serve_archive = substr($index_name, -1); - $use_name = substr($index_name, 0, -2); - } - $index_archive_name = self::double_index_base_name . - $use_name; - $status_file = C\CRAWL_DIR . '/cache/' . - $index_archive_name . "/status.txt"; - if ($serve_archive < 0 && file_exists($status_file)) { - $status = unserialize(file_get_contents($status_file)); - $active_archive = (empty($status["swap_count"])) ? 1 : - $status["swap_count"] % 2; - $serve_archive = 1 - $active_archive; - } + $index_archive_name = self::double_index_base_name . + $use_name; + $status_file = C\CRAWL_DIR . '/cache/' . + $index_archive_name . "/status.txt"; + if ($serve_archive < 0 && file_exists($status_file)) { + $status = unserialize(file_get_contents($status_file)); + $active_archive = (empty($status["swap_count"])) ? 1 : + $status["swap_count"] % 2; + $serve_archive = 1 - $active_archive; + } + $tmp = new IndexArchiveBundle( + C\CRAWL_DIR . '/cache/' . $index_archive_name . + "/bundle$serve_archive"); + if (!$tmp) { + $serve_archive = ($serve_archive == 0) ? 1 : 0; $tmp = new IndexArchiveBundle( C\CRAWL_DIR . '/cache/' . $index_archive_name . "/bundle$serve_archive"); - if (!$tmp) { - $serve_archive = ($serve_archive == 0) ? 1 : 0; - $tmp = new IndexArchiveBundle( - C\CRAWL_DIR . '/cache/' . $index_archive_name . - "/bundle$serve_archive"); - } - if (!$tmp) { - return false; - } } - self::$indexes[$index_name] = $tmp; - self::$indexes[$index_name]->setCurrentShard(0, true); - self::$index_times[$index_name] = time(); + if (!$tmp) { + return false; + } } + self::$indexes[$index_name] = $tmp; + self::$indexes[$index_name]->setCurrentShard(0, true); + self::$index_times[$index_name] = time(); /* If too many cached discard oldest 1/3 of cached indices */ @@ -207,20 +195,12 @@ class IndexManager implements CrawlConstants $threshold = -1, $start_generation = -1, $num_distinct_generations = -1, $with_remaining_total = false) { + if ($index_name == self::FEED_CRAWL_TIME) { + $index_name = "feed"; + } $id = "$index_name:$start_generation:$num_distinct_generations"; $index = self::getIndex($index_name); $tmp = []; - if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) && - $start_generation < 0 - && file_exists(C\WORK_DIRECTORY . "/feeds/index")) { - $use_feeds = true; - $feed_shard = self::getIndex("feed"); - $feed_info = $feed_shard->getWordInfo($hash, true, $shift); - if (is_array($feed_info)) { - $tmp[-1] = [-1, $feed_info[0], - $feed_info[1], $feed_info[2], $feed_info[3]]; - } - } if (!empty($index->dictionary)) { $pre_info = $index->dictionary->getWordInfo($hash, true, $shift, diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 49240695b..472972129 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -283,7 +283,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants * * @param string $fname filename to store the index shard with * @param int $generation when returning documents from the shard - * pretend there ar ethis many earlier documents + * pretend there are this many earlier documents * @param int $num_docs_per_generation the number of documents that a * given shard can hold. * @param bool $read_only_from_disk used to determined if this shard is @@ -673,17 +673,19 @@ class IndexShard extends PersistentStructure implements CrawlConstants } // Normal forward iterator if ($forward_dir) { - return $this->postingsSliceForward($start_offset, $next_offset, $last_offset, - $len); - } - // Reverse direction iterator used for newsfeed - else { - return $this->postingsSliceBackward($start_offset, $next_offset, $last_offset, - $len); + return $this->postingsSliceForward($start_offset, $next_offset, + $last_offset, $len); + } else { + // Reverse direction iterator used for newsfeed + return $this->postingsSliceBackward($start_offset, $next_offset, + $last_offset, $len); } } - public function postingsSliceForward($start_offset, &$next_offset, $last_offset, - $len) + /** + * + */ + public function postingsSliceForward($start_offset, &$next_offset, + $last_offset, $len) { $num_docs_so_far = 0; $results = []; @@ -719,8 +721,11 @@ class IndexShard extends PersistentStructure implements CrawlConstants $next_offset = $next << 2; return $results; } - public function postingsSliceBackward($start_offset, &$next_offset, $last_offset, - $len) + /** + * + */ + public function postingsSliceBackward($start_offset, &$next_offset, + $last_offset, $len) { $num_docs_so_far = 0; $results = []; @@ -745,8 +750,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants break; } $posting_start = $next; - // getPostingAtOffset will modify both start and end to the value of next - // using addresses + /* getPostingAtOffset will modify both start and end to the value of + next using addresses + */ $posting = $this->getPostingAtOffset( $next, $posting_start, $posting_end); $total_posting_len += strlen($posting); @@ -1225,12 +1231,10 @@ class IndexShard extends PersistentStructure implements CrawlConstants if ($this->forward_direction) { $results = $this->getPostingsSlice($first_offset, $first_offset, $last_offset, $len); - } - else { + } else { $results = $this->getPostingsSlice($first_offset, - $last_offset, $last_offset, $len); + $last_offset, $last_offset, $len, false); } - } return $results; } @@ -2188,4 +2192,4 @@ class IndexShard extends PersistentStructure implements CrawlConstants substr($value, self::WORD_KEY_LEN, self::WORD_DATA_LEN); } -} \ No newline at end of file +} diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php index 078aad92b..09ab43367 100644 --- a/src/library/VersionFunctions.php +++ b/src/library/VersionFunctions.php @@ -1850,14 +1850,3 @@ function upgradeDatabaseVersion67(&$db) $db->execute("ALTER TABLE SUBSEARCH ADD COLUMN " . "DEFAULT_QUERY VARCHAR(" . C\TITLE_LEN . ") DEFAULT ''"); } -/** - * Upgrades a Version 67 version of the Yioop database to a Version 68 version - * @param object $db datasource to use to upgrade. - */ -function upgradeDatabaseVersion68(&$db) -{ - $db->execute("DELETE FROM MIX_COMPONENTS WHERE MIX_TIMESTAMP=4 - AND GROUP_ID=0"); - $db->execute("INSERT INTO MIX_COMPONENTS VALUES( - 4, 0, 13, 1, 'media:news')"); -} diff --git a/src/library/index_bundle_iterators/ReverseIterator.php b/src/library/index_bundle_iterators/ReverseIterator.php index 5436146da..a11ac0810 100644 --- a/src/library/index_bundle_iterators/ReverseIterator.php +++ b/src/library/index_bundle_iterators/ReverseIterator.php @@ -86,17 +86,6 @@ class ReverseIterator extends IndexBundleIterator * @var array */ public $dictionary_info; - /** - * File name (including path) of the feed shard for news items - * @var string - */ - public $feed_shard_name; - /** - * Structure used to hold posting list start and stops for the query - * in the feed shard - * @var array - */ - public $feed_info; /** * The total number of shards that have data for this word * @var int @@ -148,10 +137,6 @@ class ReverseIterator extends IndexBundleIterator const HOST_KEY_POS = 17; /** Length of a doc key*/ const KEY_LEN = 8; - /** If the $limit_feeds constructor input is true then limit the number - * of items coming from the feed shard to this count. - */ - const LIMIT_FEEDS_COUNT = 25; /** * Creates a word iterator with the given parameters. * @@ -164,15 +149,10 @@ class ReverseIterator extends IndexBundleIterator * results * @param int $results_per_block the maximum number of results that can * be returned by a findDocsWithWord call - * @param bool $limit_feeds feed results appear before all others when - * gotten out of this iterator (may be reordered later). This flag - * controls whether an upper bound of self::LIMIT_FEEDS_COUNT is - * imposed on the number of feed results returned */ public function __construct($word_key, $shift, $index_name, $raw = false, - $filter = null, - $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK, - $limit_feeds = false) + $filter = null, $results_per_block = + IndexBundleIterator::RESULTS_PER_BLOCK) { if ($raw == false) { //get rid of out modified base64 encoding @@ -181,55 +161,10 @@ class ReverseIterator extends IndexBundleIterator $this->filter = $filter; $this->word_key = $word_key; $this->shift = $shift; - // 13 is somewhat of a magic number right now - if($index_name == 13) { - $index_name = "NewsFeed"; - } $this->index_name = $index_name; - list($estimated_total, $this->dictionary_info) = + list($this->num_docs, $this->dictionary_info) = IndexManager::getWordInfo($index_name, $word_key, $shift, -1, -1, C\NUM_DISTINCT_GENERATIONS, true); - $this->feed_shard_name = C\WORK_DIRECTORY . "/feeds/index"; - if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) - && file_exists($this->feed_shard_name)) { - $this->use_feeds = true; - } else { - $this->use_feeds = false; - } - if ($this->use_feeds) { - if (!isset($this->dictionary_info[-1])) { - $this->feed_info = false; - $this->feed_empty = true; - } else { - $this->feed_info = $this->dictionary_info[-1]; - unset($this->dictionary_info[-1]); - $this->feed_empty = false; - } - } else { - $this->feed_info = false; - $this->feed_empty = true; - } - if (is_array($this->feed_info)) { - list(,$this->feed_start, $this->feed_end, $this->feed_count,) = - $this->feed_info; - $this->feed_info = [$this->feed_start, $this->feed_end, - $this->feed_count]; - } else { - $this->feed_start = 0; - $this->feed_end = 0; - $this->feed_count = 0; - } - if ($this->feed_count > 0) { - $this->using_feeds = true; - } else { - $this->using_feeds = false; - } - if ($limit_feeds && $this->feed_count > self::LIMIT_FEEDS_COUNT) { - $this->feed_count = self::LIMIT_FEEDS_COUNT; - $this->feed_end = $this->feed_start + - IndexShard::POSTING_LEN * (self::LIMIT_FEEDS_COUNT - 1); - } - $this->num_docs = $this->feed_count + $estimated_total; if ($this->dictionary_info === false) { $this->empty = true; } else { @@ -248,7 +183,7 @@ class ReverseIterator extends IndexBundleIterator $this->results_per_block = $results_per_block; $this->current_block_fresh = false; $this->start_generation = $this->num_generations-1; - if ($this->dictionary_info !== false || $this->feed_info !== false) { + if ($this->dictionary_info !== false) { $this->reset(); } } @@ -259,19 +194,12 @@ class ReverseIterator extends IndexBundleIterator */ public function reset() { - if ($this->feed_count > 0) { - $this->using_feeds = true; - } else { - $this->using_feeds = false; - } - $no_feeds = $this->feed_empty || !$this->use_feeds; if (!$this->empty) {//we shouldn't be called when empty - but to be safe if ($this->start_generation < $this->num_generations-1) { - list($estimated_total, $this->dictionary_info) = + list($this->num_docs, $this->dictionary_info) = IndexManager::getWordInfo($this->index_name, $this->word_key, 0, -1, 0, C\NUM_DISTINCT_GENERATIONS, true); - $this->num_docs = $this->feed_count + $estimated_total; ksort($this->dictionary_info); $this->dictionary_info = array_values($this->dictionary_info); $this->num_generations = count($this->dictionary_info); @@ -281,15 +209,12 @@ class ReverseIterator extends IndexBundleIterator list($this->current_generation, $this->start_offset, $this->last_offset, ) = $this->dictionary_info[$this->num_generations-1]; - # if the feed isn't empty - } else { - $this->start_offset = 0; - $this->last_offset = -1; - $this->num_generations = -1; } $this->current_offset = $this->last_offset; - // reset pointer to the number of gens, which in reverse is the first one we want - $this->generation_pointer = $this->num_generations-1; + /* reset pointer to the number of gens, which in reverse is the + first one we want + */ + $this->generation_pointer = $this->num_generations - 1; $this->count_block = 0; $this->seen_docs = 0; $this->current_doc_offset = null; @@ -317,20 +242,19 @@ class ReverseIterator extends IndexBundleIterator $index->setCurrentShard($this->current_generation, true); //the next call also updates next offset $shard = $index->getCurrentShard(false, false); - $pre_results = $shard->getPostingsSlice( - $this->start_offset, + $pre_results = $shard->getPostingsSlice($this->start_offset, $this->next_offset, $this->last_offset, $this->results_per_block, false); - if($this->index_name == "NewsFeed") { + if($this->index_name == "feed") { $time = time(); foreach ($pre_results as $keys => $pre_result) { - $page = $index->getPage($pre_result[self::SUMMARY_OFFSET], + $page = $index->getPage($pre_result[self::SUMMARY_OFFSET], $this->current_generation); $delta = $time - $page[self::PUBDATE]; $pre_results[$keys][self::DOC_RANK] = 720000 / max($delta, 1); } - } + } } $results = []; $doc_key_len = IndexShard::DOC_KEY_LEN; @@ -348,11 +272,7 @@ class ReverseIterator extends IndexBundleIterator } else { continue; } - if (!empty($data[self::IS_FEED])) { - $data[self::CRAWL_TIME] = "feed"; - } else { - $data[self::CRAWL_TIME] = $this->index_name; - } + $data[self::CRAWL_TIME] = $this->index_name; $results[$keys] = $data; } $this->count_block = count($results); @@ -365,10 +285,11 @@ class ReverseIterator extends IndexBundleIterator } /** * Updates the seen_docs count during an advance() call - * For a reverse shard, instead of adding to the offset, we subtract by a block instead. + * For a reverse shard, instead of adding to the offset, we subtract by a + * block instead. */ public function advanceSeenDocs() - { + { if ($this->current_block_fresh != true) { $total_guess = IndexShard::numDocsOrLinks($this->next_offset, $this->start_offset); @@ -405,8 +326,7 @@ class ReverseIterator extends IndexBundleIterator } $cur_gen_doc_offset = $this->currentGenDocOffsetWithWord(); if ($cur_gen_doc_offset == -1 || - $this->genDocOffsetCmp($cur_gen_doc_offset, - $gen_doc_offset) < 0) { + $this->genDocOffsetCmp($cur_gen_doc_offset, $gen_doc_offset) < 0) { return; } $this->plainAdvance(); @@ -414,16 +334,10 @@ class ReverseIterator extends IndexBundleIterator $this->advanceGeneration($gen_doc_offset[0]); $this->next_offset = $this->current_offset; } - $using_feeds = $this->using_feeds && $this->use_feeds; - if ($using_feeds) { - $shard = IndexManager::getIndex("feed"); - $last = $this->feed_end; - } else { - $index = IndexManager::getIndex($this->index_name, false); - $index->setCurrentShard($this->current_generation, true); - $shard = $index->getCurrentShard(false, false); - $start = $this->start_offset; - } + $index = IndexManager::getIndex($this->index_name, false); + $index->setCurrentShard($this->current_generation, true); + $shard = $index->getCurrentShard(false, false); + $start = $this->start_offset; if ($this->current_generation == $gen_doc_offset[0]) { $offset_pair = $shard->nextPostingOffsetDocOffset( $start, $this->next_offset, $gen_doc_offset[1], false); @@ -471,27 +385,25 @@ class ReverseIterator extends IndexBundleIterator */ public function advanceGeneration($generation = null) { - if ($this->using_feeds && $this->use_feeds) { - $this->using_feeds = false; - $this->generation_pointer = -1; - } if ($generation === null) { $generation = $this->current_generation; } do { - # RC if the pointer is greater than the total generations, subtract + // RC if the pointer is greater than the total generations, subtract if ($this->generation_pointer >= 0) { $this->generation_pointer--; } - # RC if the generation pointer is still more than the number of generations + /* RC if the generation pointer is still more than the number of + generations + */ if ($this->generation_pointer >= 0) { list($this->current_generation, $this->start_offset, $this->last_offset, ) = $this->dictionary_info[$this->generation_pointer]; - #set the current offset to the last one of the dictionary + //set the current offset to the last one of the dictionary $this->current_offset = $this->last_offset; } - # if there are more generations and + // if there are more generations and if (!$this->no_more_generations && $this->current_generation > $generation && $this->generation_pointer <= 0) { @@ -528,16 +440,17 @@ class ReverseIterator extends IndexBundleIterator if ($this->current_doc_offset !== null) { return [$this->current_generation, $this->current_doc_offset]; } - # if the current offset is before the first one, or if gen pointer is less than 0 - # we are in an impossible position + /* if the current offset is before the first one, + or if gen pointer is less than 0 we are in an impossible position + */ if ($this->current_offset < $this->start_offset|| $this->generation_pointer <= -1) { return -1; } - $index = IndexManager::getIndex($this->index_name); + $index = IndexManager::getIndex($this->index_name, false); $index->setCurrentShard($this->current_generation, true); $this->current_doc_offset = $index->getCurrentShard( )->docOffsetFromPostingOffset($this->current_offset, false); return [$this->current_generation, $this->current_doc_offset]; } -} \ No newline at end of file +} diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 333dfb6df..bc80db2c1 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -86,17 +86,6 @@ class WordIterator extends IndexBundleIterator * @var array */ public $dictionary_info; - /** - * File name (including path) of the feed shard for news items - * @var string - */ - public $feed_shard_name; - /** - * Structure used to hold posting list start and stops for the query - * in the feed shard - * @var array - */ - public $feed_info; /** * The total number of shards that have data for this word * @var int @@ -147,10 +136,6 @@ class WordIterator extends IndexBundleIterator const HOST_KEY_POS = 17; /** Length of a doc key*/ const KEY_LEN = 8; - /** If the $limit_feeds constructor input is true then limit the number - * of items coming from the feed shard to this count. - */ - const LIMIT_FEEDS_COUNT = 25; /** * Creates a word iterator with the given parameters. * @@ -163,15 +148,10 @@ class WordIterator extends IndexBundleIterator * of edited and deleted search results * @param int $results_per_block the maximum number of results that can * be returned by a findDocsWithWord call - * @param bool $limit_feeds feed results appear before all others when - * gotten out of this iterator (may be reordered later). This flag - * controls whether an upper bound of self::LIMIT_FEEDS_COUNT is - * imposed on the number of feed results returned */ public function __construct($word_key, $shift, $index_name, $raw = false, - $filter = null, - $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK, - $limit_feeds = false) + $filter = null, $results_per_block = + IndexBundleIterator::RESULTS_PER_BLOCK) { if ($raw == false) { //get rid of out modified base64 encoding @@ -180,54 +160,10 @@ class WordIterator extends IndexBundleIterator $this->filter = $filter; $this->word_key = $word_key; $this->shift = $shift; - if($index_name == 13) { - $index_name = "NewsFeed"; - } $this->index_name = $index_name; - list($estimated_total, $this->dictionary_info) = + list($this->num_docs, $this->dictionary_info) = IndexManager::getWordInfo($index_name, $word_key, $shift, -1, -1, C\NUM_DISTINCT_GENERATIONS, true); - $this->feed_shard_name = C\WORK_DIRECTORY . "/feeds/index"; - if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) - && file_exists($this->feed_shard_name)) { - $this->use_feeds = true; - } else { - $this->use_feeds = false; - } - if ($this->use_feeds) { - if (!isset($this->dictionary_info[-1])) { - $this->feed_info = false; - $this->feed_empty = true; - } else { - $this->feed_info = $this->dictionary_info[-1]; - unset($this->dictionary_info[-1]); - $this->feed_empty = false; - } - } else { - $this->feed_info = false; - $this->feed_empty = true; - } - if (is_array($this->feed_info)) { - list(,$this->feed_start, $this->feed_end, $this->feed_count,) = - $this->feed_info; - $this->feed_info = [$this->feed_start, $this->feed_end, - $this->feed_count]; - } else { - $this->feed_start = 0; - $this->feed_end = 0; - $this->feed_count = 0; - } - if ($this->feed_count > 0) { - $this->using_feeds = true; - } else { - $this->using_feeds = false; - } - if ($limit_feeds && $this->feed_count > self::LIMIT_FEEDS_COUNT) { - $this->feed_count = self::LIMIT_FEEDS_COUNT; - $this->feed_end = $this->feed_start + - IndexShard::POSTING_LEN * (self::LIMIT_FEEDS_COUNT - 1); - } - $this->num_docs = $this->feed_count + $estimated_total; if ($this->dictionary_info === false) { $this->empty = true; } else { @@ -246,7 +182,7 @@ class WordIterator extends IndexBundleIterator $this->results_per_block = $results_per_block; $this->current_block_fresh = false; $this->start_generation = 0; - if ($this->dictionary_info !== false || $this->feed_info !== false) { + if ($this->dictionary_info !== false) { $this->reset(); } } @@ -256,19 +192,12 @@ class WordIterator extends IndexBundleIterator */ public function reset() { - if ($this->feed_count > 0) { - $this->using_feeds = true; - } else { - $this->using_feeds = false; - } - $no_feeds = $this->feed_empty || !$this->use_feeds; if (!$this->empty) {//we shouldn't be called when empty - but to be safe if ($this->start_generation > 0) { - list($estimated_total, $this->dictionary_info) = + list($this->num_docs, $this->dictionary_info) = IndexManager::getWordInfo($this->index_name, $this->word_key, 0, -1, 0, C\NUM_DISTINCT_GENERATIONS, true); - $this->num_docs = $this->feed_count + $estimated_total; ksort($this->dictionary_info); $this->dictionary_info = array_values($this->dictionary_info); $this->num_generations = count($this->dictionary_info); @@ -283,12 +212,7 @@ class WordIterator extends IndexBundleIterator $this->last_offset = -1; $this->num_generations = -1; } - if (!$no_feeds) { - $this->current_offset = $this->feed_start; - $this->current_generation = -1; - } else { - $this->current_offset = $this->start_offset; - } + $this->current_offset = $this->start_offset; $this->generation_pointer = 0; $this->count_block = 0; $this->seen_docs = 0; @@ -302,41 +226,21 @@ class WordIterator extends IndexBundleIterator */ public function findDocsWithWord() { - $no_feeds = $this->feed_empty || !$this->use_feeds; - $feed_in_use = $this->using_feeds && !$no_feeds; - if ($this->empty && $no_feeds) { + if ($this->empty) { return -1; } - if (!$feed_in_use &&(($this->generation_pointer>=$this->num_generations) - || ($this->generation_pointer == $this->num_generations - 1 && - $this->current_offset > $this->last_offset))) { + if ($this->generation_pointer == $this->num_generations - 1 && + $this->current_offset > $this->last_offset) { return -1; } $pre_results = []; - if ($feed_in_use) { - $this->next_offset = $this->current_offset; - $feed_shard = IndexManager::getIndex("feed"); - if ($feed_shard) { - $pre_results = $feed_shard->getPostingsSlice( - $this->feed_start, - $this->next_offset, $this->feed_end, - $this->results_per_block); - $time = time(); - foreach ($pre_results as $keys => $pre_result) { - $pre_results[$keys][self::IS_FEED] = true; - $delta = $time - $pre_result[self::SUMMARY_OFFSET]; - $pre_results[$keys][self::DOC_RANK] = 720000 / - max($delta, 1); - } - } - } else if (!$this->empty) { + if (!$this->empty) { $this->next_offset = $this->current_offset; $index = IndexManager::getIndex($this->index_name); $index->setCurrentShard($this->current_generation, true); //the next call also updates next offset $shard = $index->getCurrentShard(); - $pre_results = $shard->getPostingsSlice( - $this->start_offset, + $pre_results = $shard->getPostingsSlice($this->start_offset, $this->next_offset, $this->last_offset, $this->results_per_block); } @@ -356,11 +260,7 @@ class WordIterator extends IndexBundleIterator } else { continue; } - if (!empty($data[self::IS_FEED])) { - $data[self::CRAWL_TIME] = "feed"; - } else { - $data[self::CRAWL_TIME] = $this->index_name; - } + $data[self::CRAWL_TIME] = $this->index_name; $results[$keys] = $data; } $this->count_block = count($results); @@ -377,15 +277,9 @@ class WordIterator extends IndexBundleIterator public function advanceSeenDocs() { if ($this->current_block_fresh != true) { - if ($this->using_feeds && $this->use_feeds) { - $num_docs = min($this->results_per_block, - IndexShard::numDocsOrLinks($this->next_offset, - $this->feed_end)); - } else { - $num_docs = min($this->results_per_block, - IndexShard::numDocsOrLinks($this->next_offset, - $this->last_offset)); - } + $num_docs = min($this->results_per_block, + IndexShard::numDocsOrLinks($this->next_offset, + $this->last_offset)); $this->next_offset = $this->current_offset; $this->next_offset += IndexShard::POSTING_LEN * $num_docs; if ($num_docs < 0) { @@ -421,16 +315,10 @@ class WordIterator extends IndexBundleIterator $this->advanceGeneration($gen_doc_offset[0]); $this->next_offset = $this->current_offset; } - $using_feeds = $this->using_feeds && $this->use_feeds; - if ($using_feeds) { - $shard = IndexManager::getIndex("feed"); - $last = $this->feed_end; - } else { - $index = IndexManager::getIndex($this->index_name); - $index->setCurrentShard($this->current_generation, true); - $shard = $index->getCurrentShard(); - $last = $this->last_offset; - } + $index = IndexManager::getIndex($this->index_name); + $index->setCurrentShard($this->current_generation, true); + $shard = $index->getCurrentShard(); + $last = $this->last_offset; if ($this->current_generation == $gen_doc_offset[0]) { $offset_pair = $shard->nextPostingOffsetDocOffset( $this->next_offset, $last, $gen_doc_offset[1]); @@ -442,14 +330,8 @@ class WordIterator extends IndexBundleIterator $offset_pair; } } - if ($this->current_generation == -1) { - $this->seen_docs = ($this->current_offset - $this->feed_start) / - IndexShard::POSTING_LEN; - } else { - $this->seen_docs = ($using_feeds) ? $this->feed_count : 0; - $this->seen_docs += ($this->current_offset - $this->start_offset) / - IndexShard::POSTING_LEN; - } + $this->seen_docs = ($this->current_offset - $this->start_offset) / + IndexShard::POSTING_LEN; } /** * Forwards the iterator one group of docs. This is what's called @@ -465,10 +347,7 @@ class WordIterator extends IndexBundleIterator $this->advanceGeneration(); $this->next_offset = $this->current_offset; } - $using_feeds = $this->using_feeds && $this->use_feeds; - if (($using_feeds && - $this->current_offset > $this->feed_end) || (!$using_feeds && - $this->current_offset > $this->last_offset)) { + if ($this->current_offset > $this->last_offset) { $this->advanceGeneration(); $this->next_offset = $this->current_offset; } @@ -481,10 +360,6 @@ class WordIterator extends IndexBundleIterator */ public function advanceGeneration($generation = null) { - if ($this->using_feeds && $this->use_feeds) { - $this->using_feeds = false; - $this->generation_pointer = -1; - } if ($generation === null) { $generation = $this->current_generation; } @@ -518,7 +393,6 @@ class WordIterator extends IndexBundleIterator $this->generation_pointer--; } } - } while($this->current_generation < $generation && $this->generation_pointer < $this->num_generations); } @@ -529,22 +403,15 @@ class WordIterator extends IndexBundleIterator * @return mixed an array with the desired document offset * and generation; -1 on fail */ - public function currentGenDocOffsetWithWord() { + public function currentGenDocOffsetWithWord() + { if ($this->current_doc_offset !== null) { return [$this->current_generation, $this->current_doc_offset]; } - $feeds = $this->using_feeds && $this->use_feeds && !$this->feed_empty; - if ( ($feeds && $this->current_offset > $this->feed_end) || - (!$feeds && ($this->current_offset > $this->last_offset|| - $this->generation_pointer >= $this->num_generations))) { + if ($this->current_offset > $this->last_offset || + $this->generation_pointer >= $this->num_generations) { return -1; } - if ($feeds) { - $index = IndexManager::getIndex("feed"); - $this->current_doc_offset = - $index->docOffsetFromPostingOffset($this->current_offset); - return [-1, $this->current_doc_offset]; - } $index = IndexManager::getIndex($this->index_name); $index->setCurrentShard($this->current_generation, true); $index->setCurrentShard($this->current_generation, true); diff --git a/src/library/media_jobs/AnalyticsJob.php b/src/library/media_jobs/AnalyticsJob.php index b515b7cf1..c679360eb 100644 --- a/src/library/media_jobs/AnalyticsJob.php +++ b/src/library/media_jobs/AnalyticsJob.php @@ -168,7 +168,7 @@ class AnalyticsJob extends MediaJob $num_machines = count($machine_urls); if ($num_machines < 1 || ($num_machines == 1 && UrlParser::isLocalhostUrl($machine_urls[0]))) { - $machine_urls = null; + $machine_urls = []; } $queries = [ "CODE" => [100, 101, 102, 103, 122, 200, 201, 202, 203, 204, diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php index 478f4cdc6..ad067906e 100644 --- a/src/library/media_jobs/FeedsUpdateJob.php +++ b/src/library/media_jobs/FeedsUpdateJob.php @@ -605,12 +605,12 @@ class FeedsUpdateJob extends MediaJob $time = time(); $prune_shard_name = C\WORK_DIRECTORY . "/feeds/prune_index"; $dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name; - $info['DESCRIPTION'] = "NewsFeed"; - $info['FORWARD_DIRECTION'] = false; - $this->index_archive = new IndexArchiveBundle($dir, false, - serialize($info), C\NUM_DOCS_PER_GENERATION, false); + $info['DESCRIPTION'] = "feed"; + $info[self::DIRECTION] = self::BACKWARD; + $index_archive = new IndexArchiveBundle($dir, false, + serialize($info), C\NUM_DOCS_PER_GENERATION); $this->db->setWorldPermissionsRecursive($dir); - $prune_shard = new IndexShard($prune_shard_name); + $prune_shard = new IndexShard($prune_shard_name); $too_old = $time - $age; $num_sites = 0; if (!$prune_shard) { @@ -674,8 +674,9 @@ class FeedsUpdateJob extends MediaJob $meta_ids[] = "safe:false"; $meta_ids[] = "safe:all"; } - $prune_shard->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, - $word_and_qa_lists["WORD_LIST"], $meta_ids, true, false); + $prune_shard->addDocumentWords($doc_keys, + self::NEEDS_OFFSET_FLAG, $word_and_qa_lists["WORD_LIST"], + $meta_ids, true, false); $this->updateTrendingTermCounts($term_counts, $phrase_string, $word_and_qa_lists["WORD_LIST"], $media_category, $source_name, $lang, @@ -694,42 +695,39 @@ class FeedsUpdateJob extends MediaJob unset($term_counts['seen']); $this->addTermCountsTrendingTable($db, $term_counts); } - L\crawlLog("----..deleting old feed items"); + L\crawlLog("----..deleting old feed items"); $sql = " DELETE FROM FEED_ITEM "; $db->execute($sql); L\crawlLog("----..done deleting old items"); - // 1. check if indexshard is full or not. if it is, new gen - $generation = $this->index_archive->initGenerationToAdd( - $prune_shard->num_docs, null); - if ($generation != -1) { - $summary_offsets = []; - if (!empty($seen_sites)) { - // 2. add pages, get summary_offset - $this->index_archive->addPages($generation, self::SUMMARY_OFFSET, - $seen_sites, $seen_url_count); - // keeping track of duplicates - $sql = " INSERT INTO FEED_ITEM (GUID) VALUES (?)"; - foreach ($seen_sites as $site) { - $result = $db->execute($sql, [$site[self::HASH]]); - $site_url = str_replace('|', "%7C", $site[self::URL]); - $host = UrlParser::getHost($site_url); - $raw_guid = L\unbase64Hash($site[self::HASH]); - $hash = L\crawlHash($site[self::URL], true) . - $raw_guid . "d". substr(L\crawlHash( - UrlParser::getHost($site[self::URL]) . "/", true), 1); - $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET]; - } - unset($seen_sites); - } - $prune_string = $prune_shard->save(true, true); - $tmp_shard = IndexShard::load("news" , $prune_string); - if (!empty($summary_offsets)) { - $tmp_shard->changeDocumentOffsets($summary_offsets); - $this->index_archive->addIndexData($tmp_shard); - $this->index_dirty = true; + // 1. check if index shard is full or not. if it is, new gen + $generation = $index_archive->initGenerationToAdd( + $prune_shard->num_docs); + $summary_offsets = []; + if (!empty($seen_sites)) { + // 2. add pages, get summary_offset + $index_archive->addPages($generation, + self::SUMMARY_OFFSET, $seen_sites, $seen_url_count); + // keeping track of duplicates + $sql = " INSERT INTO FEED_ITEM (GUID) VALUES (?)"; + foreach ($seen_sites as $site) { + $result = $db->execute($sql, [$site[self::HASH]]); + $site_url = str_replace('|', "%7C", $site[self::URL]); + $host = UrlParser::getHost($site_url); + $raw_guid = L\unbase64Hash($site[self::HASH]); + $hash = L\crawlHash($site[self::URL], true) . + $raw_guid . "d". substr(L\crawlHash( + UrlParser::getHost($site[self::URL]) . "/", true), 1); + $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET]; } - $this->index_archive->stopIndexingBundle(); + unset($seen_sites); + } + $prune_string = $prune_shard->save(true, true); + $tmp_shard = IndexShard::load("news" , $prune_string); + if (!empty($summary_offsets)) { + $tmp_shard->changeDocumentOffsets($summary_offsets); + $index_archive->addIndexData($tmp_shard); } + $index_archive->stopIndexingBundle(); if (file_exists($prune_shard_name)) { unlink($prune_shard_name); } @@ -748,7 +746,7 @@ class FeedsUpdateJob extends MediaJob * @param array $word_or_phrase_list associate array of * stemmed_word_or_phrase => positions in feed item of where occurs * @param string $media_category of feed source the item case from. We - * tredning counts grouped by media category + * trending counts grouped by media category * @param string $source_name of feed source the item case from. We exclude * from counts the name of the feed source * @param string $lang locale_tag for this feed item @@ -853,7 +851,7 @@ class FeedsUpdateJob extends MediaJob * Updates TRENDING_TERM, hourly, daily, and weekly top term occurrences. * Removes entries older than a week * - * @param resource $db hadnle to database with TRENDING_TERM table + * @param resource $db handle to database with TRENDING_TERM table * @param array $term_counts for the most recent uupdate of the * FEED_ITEM table an array [$lang => [$term => $occurences]] * for the top NUM_TRENDING terms per language @@ -864,7 +862,7 @@ class FeedsUpdateJob extends MediaJob $update_intervals = [ C\ONE_HOUR => [24, C\ONE_DAY], C\ONE_DAY => [7, C\ONE_WEEK], - C\ONE_WEEK => [4, 4*C\ONE_WEEK], + C\ONE_WEEK => [4, 4 * C\ONE_WEEK], ]; $num_timestamp_sql = "SELECT COUNT(DISTINCT TIMESTAMP) AS NUM_TIMESTAMPS " . diff --git a/src/models/CrawlModel.php b/src/models/CrawlModel.php index cc6ce9ae5..be5060fa3 100755 --- a/src/models/CrawlModel.php +++ b/src/models/CrawlModel.php @@ -1184,7 +1184,7 @@ EOT; $crawl['CRAWL_TIME'] = $matches[2]; } else { $bundle_class_name = C\NS_LIB . "IndexArchiveBundle"; - $crawl['CRAWL_TIME'] = 13; + $crawl['CRAWL_TIME'] = self::FEED_CRAWL_TIME; } $info = $bundle_class_name::getArchiveInfo($dir); if (isset($info['DESCRIPTION'])) { @@ -1220,8 +1220,7 @@ EOT; } $sub_dir = opendir($sub_path); $i = 0; - while (($sub_name = readdir($sub_dir)) !== false && - $i < 5) { + while (($sub_name=readdir($sub_dir)) !== false && $i < 5) { if ($sub_name[0] == 'A' && $sub_name[1] == 't') { $crawl['RESUMABLE'] = true; break 2; diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index d90732381..2b5cdb604 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -198,9 +198,6 @@ class PhraseModel extends ParallelModel * @param int $save_timestamp if this timestamp is nonzero, then save * iterate position, so can resume on future queries that make * use of the timestamp - * @param bool $limit_feeds if true the number of feed shard items to - * allow in search results is limited to - * WordIterator::LIMIT_FEEDS_COUNT * * @return array an array of summary data */ @@ -208,7 +205,7 @@ class PhraseModel extends ParallelModel $input_phrase, $low = 0, $results_per_page = C\NUM_RESULTS_PER_PAGE, $format = true, $filter = null, $use_cache_if_allowed = true, $raw = 0, $queue_servers = [], $guess_semantics = true, - $save_timestamp = 0, $limit_feeds = true) + $save_timestamp = 0) { if (C\QUERY_STATISTICS) { $indent= " "; @@ -398,7 +395,7 @@ class PhraseModel extends ParallelModel $out_results = $this->getSummariesByHash($word_structs, $low, $phrase_num, $filter, $use_cache_if_allowed, $raw, $queue_servers, $phrase, $save_timestamp_name, - $limit_feeds, $format_words); + $format_words); if (isset($out_results['PAGES']) && count($out_results['PAGES']) != 0) { $out_count = 0; @@ -1062,8 +1059,6 @@ class PhraseModel extends ParallelModel * save iterate position, so can resume on future queries that make * use of the timestamp. If used then $limit ignored and get next $num * docs after $save_timestamp 's previous iterate position. - * @param bool $limit_feeds if true the number of feed shard items to - * allow in search results is limited to WordIterator::LIMIT_FEEDS_COUNT * @param array $format_words words which should be highlighted in * search snippets returned * @return array document summaries @@ -1071,7 +1066,7 @@ class PhraseModel extends ParallelModel public function getSummariesByHash($word_structs, $limit, $num, $filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = [], $original_query = "", $save_timestamp_name = "", - $limit_feeds = true, $format_words = null) + $format_words = null) { $indent= " "; $in2 = $indent . $indent; @@ -1170,7 +1165,7 @@ class PhraseModel extends ParallelModel $get_query_time = microtime(true); $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $to_retrieve, $queue_servers, $original_query, - $save_timestamp_name, $limit_feeds); + $save_timestamp_name); $get_query_time = L\changeInMicrotime($get_query_time); $num_retrieved = 0; $pages = []; @@ -1584,15 +1579,13 @@ class PhraseModel extends ParallelModel * @param string $save_timestamp_name if this timestamp is non empty, then * when making iterator get sub-iterators to advance to gen doc_offset * stored with respect to save_timestamp if exists. - * @param bool $limit_feeds if true the number of feed shard items to - * allow in search results is limited to WordIterator::LIMIT_FEEDS_COUNT * * @return &object an iterator for iterating through results to the * query */ public function getQueryIterator($word_structs, $filter, $raw, &$to_retrieve, $queue_servers = [], $original_query = "", - $save_timestamp_name = "", $limit_feeds = true) + $save_timestamp_name = "") { $iterators = []; $total_iterators = 0; @@ -1694,39 +1687,29 @@ class PhraseModel extends ParallelModel $distinct_key[1] : 0; $distinct_key_id = L\unbase64Hash( $distinct_key[0]); - // 13 is somewhat of a magic number right now - if ($index_name == 13) { - $dir_name = C\CRAWL_DIR."/cache/" - .self::index_data_base_name.$index_name; - } else { - $dir_name = C\CRAWL_DIR."/cache/" - .self::index_data_base_name.$index_name; - } $index = IndexManager::getIndex($index_name); - $archive_info = $index->getArchiveInfo($dir_name); - $description = unserialize($archive_info['DESCRIPTION']); - if (isset($description['FORWARD_DIRECTION'])) { - $forward_direction = $description['FORWARD_DIRECTION']; + $archive_info = $index->getArchiveInfo( + $index->dir_name); + $description = unserialize( + $archive_info['DESCRIPTION']); + if (isset($description[self::DIRECTION])) { + $direction = + $description[self::DIRECTION]; } else { - $forward_direction = 1; + $direction = self::FORWARD; } - // will have to change index name for checking iterator - if ($forward_direction) { + // have to change index name for checking iterator + if ($direction == self::FORWARD) { $tmp_word_iterators[$m] = new I\WordIterator($distinct_key_id, $shift, - $index_name, true, $filter, $to_retrieve, - $limit_feeds); - } - else { - $tmp_word_iterators[$m] = - new I\ReverseIterator($distinct_key_id, $shift, - $index_name, true, $filter, $to_retrieve, - $limit_feeds); + $index_name, true, $filter, $to_retrieve); + } else { + $tmp_word_iterators[$m] = new I\ReverseIterator( + $distinct_key_id, $shift, $index_name, true, + $filter, $to_retrieve); } $sum += $tmp_word_iterators[$m]->num_docs; - if ($tmp_word_iterators[$m]->dictionary_info != - [] || - $tmp_word_iterators[$m]->feed_count > 0) { + if ($tmp_word_iterators[$m]->dictionary_info !=[]) { $min_group_override = true; $m++; } else { diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php index 124b270da..2d1cf11b3 100644 --- a/tests/IndexShardTest.php +++ b/tests/IndexShardTest.php @@ -152,8 +152,8 @@ class IndexShardTest extends UnitTest } /** * Check if can store documents into a reverse index shard and retrieve them - * Shard is just a normal regular IndexShard, while Shard4 sets the additional - * flag which makes everything go in reverse + * Shard is just a normal regular IndexShard, while Shard4 sets the + * additional flag which makes everything go in reverse */ public function addDocumentsGetPostingsSliceReverseTestCase() { @@ -304,13 +304,13 @@ class IndexShardTest extends UnitTest "First offset set correctly"); $this->assertEqual($last_offset, 40, "Second offset set correctly"); - $forward = $this->test_objects['shard']->nextPostingOffsetDocOffset($first_offset, $last_offset, 5); - //print_r($forward); - $backward = $this->test_objects['shard4']->nextPostingOffsetDocOffset($first_offset, $last_offset, 5); - //print_r($backward); + $forward = $this->test_objects['shard']->nextPostingOffsetDocOffset( + $first_offset, $last_offset, 5); + $backward = $this->test_objects['shard4']->nextPostingOffsetDocOffset( + $first_offset, $last_offset, 5); $forward = $this->test_objects['shard']->getPostingsSlice($first_offset, - $first_offset, $last_offset, 5); - # have to reset offset values, since getPostingsSlice modifies by ref + $first_offset, $last_offset, 5); + // have to reset offset values, since getPostingsSlice modifies by ref $info = $this->test_objects['shard4']->getWordInfo( L\crawlHashWord('CCCCCCCC', true), true); list($first_offset, $last_offset, @@ -325,17 +325,18 @@ class IndexShardTest extends UnitTest $index_name = 1573453725; $index_name = 1575422839; $index_archive_name = "IndexData" . $index_name; - $index_archive_name = "IndexDataNewsFeed"; - $index_name = "NewsFeed"; + $index_archive_name = "IndexDataFeed"; + $index_name = "feed"; $results_limit = 200; $total_results = 0; if (file_exists(C\CRAWL_DIR.'/cache/' . $index_archive_name)) { - $info = IndexManager::getWordInfo($index_name, $hash_key, $shift, -1, 0, -1); + $info = IndexManager::getWordInfo($index_name, $hash_key, $shift, + -1, 0, -1); $this->assertTrue(isset($info[0][4])); $forward = []; if (isset($info[0][4])) { - $word_iterator = new WordIterator($info[0][4], 0, $index_name, true, null, $results_limit); - // $norm_docs = $word_iterator->findDocsWithWord(); + $word_iterator = new WordIterator($info[0][4], 0, $index_name, + true, null, $results_limit); $forward_offsets = []; $offset = $word_iterator->currentGenDocOffsetWithWord(); array_push($forward_offsets, $offset); @@ -352,11 +353,12 @@ class IndexShardTest extends UnitTest $for_results = count($forward_offsets); } $backward = []; - $info = IndexManager::getWordInfo($index_name, $hash_key, $shift, -1, 0, -1); + $info = IndexManager::getWordInfo($index_name, $hash_key, $shift, + -1, 0, -1); $this->assertTrue(isset($info[0][4])); if (isset($info[0][4])) { - $word_rev_iterator = new ReverseIterator($info[0][4], 0, $index_name, true, null, $results_limit); - // $rev_docs = $word_rev_iterator->findDocsWithWord(); + $word_rev_iterator = new ReverseIterator($info[0][4], 0, + $index_name, true, null, $results_limit); $backward_offsets = []; $offset = $word_rev_iterator->currentGenDocOffsetWithWord(); array_push($backward_offsets, $offset); @@ -800,4 +802,4 @@ class IndexShardTest extends UnitTest $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Save without dictionary test works"); } -} \ No newline at end of file +}