diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 0ecc9e333..9e9ef8b80 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -2765,7 +2765,7 @@ class CrawlComponent extends Component implements CrawlConstants
             $feeds_update_job = new M\FeedsUpdateJob();
             $feeds_update_job->parseFeedAuxInfo($source);
             $data['FEED_TEST_RESULTS'] =
-                $feeds_update_job->updateFeedItemsOneGo([$source],
+                $feeds_update_job->updateFoundItemsOneGo([$source],
                 C\ONE_WEEK, true);
         } else if (in_array($source['TYPE'],
             ['feed_podcast', 'scrape_podcast'])) {
diff --git a/src/library/BloomFilterFile.php b/src/library/BloomFilterFile.php
index 50843d65d..71d91df37 100755
--- a/src/library/BloomFilterFile.php
+++ b/src/library/BloomFilterFile.php
@@ -51,6 +51,11 @@ class BloomFilterFile extends PersistentStructure
      * @var int
      */
     public $num_keys;
+    /**
+     * Number of items currently stored in this filter
+     * @var int
+     */
+    public $count;
     /**
      * Size in bits of the packed string array used to store the filter's
      * contents
@@ -82,6 +87,7 @@
      */
         $this->num_keys = ceil(log($num_values)/$log2);
         $this->filter_size = ceil( ($this->num_keys) * $num_values/$log2_sq);
+        $this->count = 0;
         $mem_before = memory_get_usage(true);
         $this->filter = pack("x". ceil(0.125 * $this->filter_size));
         // 1/8 =.125 = num bits/bytes, want to make things floats
@@ -100,6 +106,7 @@
         for ($i = 0; $i < $num_keys; $i++) {
             $this->setBit($pos_array[$i]);
         }
+        $this->count++;
         $this->checkSave();
     }
     /**
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index 7465f6d88..e5afb8a8b 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -148,6 +148,7 @@ class IndexArchiveBundle implements CrawlConstants
                 file_get_contents($this->dir_name . "/generation.txt"));
         } else if (!$read_only_archive) {
             $this->generation_info['ACTIVE'] = 0;
+            $this->generation_info['LAST_DICTIONARY_SHARD'] = -1;
             file_put_contents($this->dir_name .
                 "/generation.txt", serialize($this->generation_info));
         }
@@ -287,6 +288,8 @@ class IndexArchiveBundle implements CrawlConstants
             $current_index_shard_file, $this->generation_info['ACTIVE'],
             $this->num_docs_per_generation, true);
         $this->dictionary->addShardDictionary($this->current_shard, $callback);
+        $this->generation_info['LAST_DICTIONARY_SHARD'] =
+            $this->generation_info['ACTIVE'];
     }
     /**
      * Sets the current shard to be the active shard (the active shard is
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index 15a1dd96e..a559a538c 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -196,13 +196,42 @@ class IndexManager implements CrawlConstants
         $with_remaining_total = false)
     {
         $index = self::getIndex($index_name);
-        $tmp = [];
+        $added_active = false;
+        $pre_info = [];
         if (!empty($index->dictionary)) {
             $pre_info = $index->dictionary->getWordInfo($hash, true, $shift,
                 $threshold, $start_generation, $num_distinct_generations,
                 true);
         }
+        $last_desired_generation = $start_generation +
+            $num_distinct_generations;
+        if (!empty($index->generation_info)) {
+            $active_generation = $index->generation_info['ACTIVE'];
+            if ((empty($index->generation_info['LAST_DICTIONARY_SHARD']) ||
+                $index->generation_info['LAST_DICTIONARY_SHARD'] <
+                $active_generation) && $active_generation <
+                $last_desired_generation) {
+                $active_shard_file = $index->dir_name .
+                    "/posting_doc_shards/index" . $active_generation;
+                if (file_exists($active_shard_file)) {
+                    $active_shard = new IndexShard($active_shard_file, 0,
+                        C\NUM_DOCS_PER_GENERATION, true);
+                    $active_info = $active_shard->getWordInfo($hash, true,
+                        $shift);
+                    if (is_array($active_info)) {
+                        if (empty($pre_info)) {
+                            $pre_info[0] = 0;
+                            $pre_info[1] = [];
+                        }
+                        $pre_info[1][] = [$active_generation,
+                            $active_info[0], $active_info[1], $active_info[2],
+                            $active_info[3]];
+                        $pre_info[0] += $active_info[2];
+                    }
+                }
+            }
+        }
         if (!empty($pre_info[1])) {
             list($total, $info) = $pre_info;
         } else {
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index 70ed6302b..65cba7c81 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -596,7 +596,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $old_check_loc = $check_loc;
             $word_string = $this->getWordString($is_disk, $start, $check_loc,
                 $word_item_len);
-            if ($word_string == false) {return false;}
+            if ($word_string == false) {
+                return false;
+            }
             $id = substr($word_string, 0, $word_key_len);
             $cmp = compareWordHashes($word_id, $id, $shift);
             if ($cmp === 0) {
diff --git a/src/library/PersistentStructure.php b/src/library/PersistentStructure.php
index 41c8e6df2..b9960b3c2 100755
--- a/src/library/PersistentStructure.php
+++ b/src/library/PersistentStructure.php
@@ -85,7 +85,7 @@ class PersistentStructure
     public static function load($fname)
     {
         /* code to handle the fact that name space of object may not be the
-           modern nameepace
+           modern namespace
         */
         $obj_string = file_get_contents($fname);
         $name_length = intval(substr($obj_string, 2, 14));
diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php
index 321f313b8..052e33dfb 100755
--- a/src/library/index_bundle_iterators/DocIterator.php
+++ b/src/library/index_bundle_iterators/DocIterator.php
@@ -59,7 +59,7 @@ class DocIterator extends IndexBundleIterator
      */
     public $next_offset;
     /**
-     * Last Offset of a doc occurence in the IndexShard
+     * Last offset of a doc occurrence in the IndexShard
      * @var int
     */
     public $last_offset;
@@ -90,7 +90,7 @@
     public $filter;
     /** Host Key position + 1 (first char says doc, inlink or eternal link)*/
     const HOST_KEY_POS = 17;
-    /** Length of a doc key*/
+    /** Length of a doc key */
     const KEY_LEN = 8;
     /**
      * Creates a word iterator with the given parameters.
@@ -122,12 +122,15 @@
      */
     public function reset()
     {
-        $this->current_generation = 0;
+        $is_ascending = ($this->direction == self::ASCENDING);
+        $this->current_generation = ($is_ascending) ? 0 :
+            $this->num_generations - 1;
+        $this->getShardInfo($this->current_generation);
         $this->count_block = 0;
         $this->seen_docs = 0;
-        $this->current_offset = 0;
-        $this->next_offset = 0;
-        $this->getShardInfo($this->current_generation);
+        $this->current_offset = ($is_ascending) ? 0 :
+            $this->getPreviousDocOffset($this->last_offset);
+        $this->next_offset = $this->current_offset;
     }
     /**
      * Mainly used to get the last_offset in shard $generation of the
@@ -143,8 +146,7 @@
         if (isset($this->shard_lens[$generation])) {
             $this->last_offset = $this->shard_lens[$generation];
         } else {
-            $index = IndexManager::getIndex($this->index_name,
-                $this->direction);
+            $index = IndexManager::getIndex($this->index_name);
             $index->setCurrentShard($generation, true);
             $shard = $index->getCurrentShard();
             $this->last_offset = $shard->docids_len;
@@ -160,14 +162,18 @@
      */
     public function findDocsWithWord()
     {
-        if (($this->current_generation >= $this->num_generations)
+        $is_ascending = ($this->direction == self::ASCENDING);
+        if (($is_ascending &&
+            ($this->current_generation >= $this->num_generations)
             || ($this->current_generation == $this->num_generations - 1 &&
-            $this->current_offset > $this->last_offset)) {
+            $this->current_offset > $this->last_offset)) ||
+            !$is_ascending && ($this->current_generation < 0) ||
+            ($this->current_generation == 0 && $this->current_offset < 0)) {
             return -1;
         }
         $pre_results = [];
         $this->next_offset = $this->current_offset;
-        $index = IndexManager::getIndex($this->index_name, $this->direction);
+        $index = IndexManager::getIndex($this->index_name);
         $index->setCurrentShard($this->current_generation, true);
         //the next call also updates next offset
         $shard = $index->getCurrentShard();
@@ -177,16 +183,22 @@
         $pre_results = [];
         $num_docs_so_far = 0;
         do {
-            if ($this->next_offset >= $this->last_offset) {
+            if (($is_ascending && $this->next_offset >= $this->last_offset)
+                || (!$is_ascending && $this->next_offset < 0)) {
                 break;
             }
             $posting = L\packPosting($this->next_offset >> 4, [1]);
             list($doc_id, $num_keys, $item) =
-                $shard->makeItem($posting, $num_docs_or_links);
-            if ($num_keys % 2 == 0) {
-                $num_keys++;
+                $shard->makeItem($posting, $num_docs_or_links,
+                    $this->direction);
+            if ($is_ascending) {
+                if ($num_keys % 2 == 0) {
+                    $num_keys++;
+                }
+                $this->next_offset += ($num_keys + 1) * $doc_key_len;
+            } else {
+                $this->next_offset =
+                    $this->getPreviousDocOffset($this->next_offset);
             }
-            $this->next_offset += ($num_keys + 1) * $doc_key_len;
             $pre_results[$doc_id] = $item;
             $num_docs_so_far++;
         } while ($num_docs_so_far < $this->results_per_block);
@@ -212,17 +224,34 @@
         $this->pages = $results;
         return $results;
     }
+    /**
+     * Gets the offset of the doc item that comes just before the one
+     * starting at $doc_offset in the current shard (used when this
+     * iterator is run in descending direction)
+     */
+    public function getPreviousDocOffset($doc_offset)
+    {
+        $doc_item_len = 4 * IndexShard::DOC_KEY_LEN;
+        // this is not correct, only works if no additional doc keys
+        return $doc_offset - $doc_item_len;
+    }
     /**
      * Updates the seen_docs count during an advance() call
      */
     public function advanceSeenDocs()
     {
         if ($this->current_block_fresh != true) {
+            $is_ascending = ($this->direction == self::ASCENDING);
             $doc_item_len = 4 * IndexShard::DOC_KEY_LEN;
+            $pre_num_docs = ($is_ascending) ?
+                ($this->last_offset - $this->next_offset) / $doc_item_len :
+                $this->next_offset/$doc_item_len;
             $num_docs = min($this->results_per_block,
-                ($this->last_offset - $this->next_offset) / $doc_item_len);
+                $pre_num_docs);
             $this->next_offset = $this->current_offset;
-            $this->next_offset += $doc_item_len * $num_docs;
+            if ($is_ascending) {
+                $this->next_offset += $doc_item_len * $num_docs;
+            } else {
+                $this->next_offset -= $doc_item_len * $num_docs;
+            }
             if ($num_docs < 0) {
                 return;
             }
@@ -242,32 +271,40 @@
     public function advance($gen_doc_offset = null)
     {
         $this->advanceSeenDocs();
-        if ($this->current_offset < $this->next_offset) {
+        $is_ascending = ($this->direction == self::ASCENDING);
+        if (($is_ascending && $this->current_offset < $this->next_offset) ||
+            (!$is_ascending && $this->current_offset > $this->next_offset)) {
             $this->current_offset = $this->next_offset;
         } else {
             $this->advanceGeneration();
             $this->next_offset = $this->current_offset;
         }
-        if ($this->current_offset > $this->last_offset) {
+        if (($is_ascending && $this->current_offset > $this->last_offset) ||
+            (!$is_ascending && $this->current_offset < 0)) {
             $this->advanceGeneration();
             $this->next_offset = $this->current_offset;
         }
         if ($gen_doc_offset !== null) {
-            if ($this->current_generation < $gen_doc_offset[0]) {
+            if (($is_ascending &&
+                $this->current_generation < $gen_doc_offset[0]) ||
+                (!$is_ascending &&
+                $this->current_generation > $gen_doc_offset[0])) {
                 $this->advanceGeneration($gen_doc_offset[0]);
                 $this->next_offset = $this->current_offset;
             }
             if ($this->current_generation == $gen_doc_offset[0]) {
-                $this->current_offset = max($this->current_offset,
-                    $gen_doc_offset[1]);
-                if ($this->current_offset > $this->last_offset) {
+                $this->current_offset = ($is_ascending) ?
+                    max($this->current_offset, $gen_doc_offset[1]) :
+                    min($this->current_offset, $gen_doc_offset[1]);
+                if (($is_ascending &&
+                    $this->current_offset > $this->last_offset) ||
+                    (!$is_ascending &&
+                    $this->current_offset < $this->last_offset)) {
                     $this->advanceGeneration();
                     $this->next_offset = $this->current_offset;
                 }
             }
-            $this->seen_docs =
-                $this->current_offset /
-                4 * IndexShard::DOC_KEY_LEN;
+            $this->seen_docs = $this->current_offset /
+                4 * IndexShard::DOC_KEY_LEN;
         }
     }
@@ -278,12 +315,16 @@
      */
     public function advanceGeneration($generation = null)
     {
+        $is_ascending = ($this->direction == self::ASCENDING);
         if ($generation === null) {
-            $generation = $this->current_generation + 1;
+            $generation = ($is_ascending) ? $this->current_generation + 1 :
+                $this->current_generation - 1;
         }
         $this->current_generation = $generation;
-        $this->current_offset = 0;
-        if ($generation < $this->num_generations) {
+        $this->current_offset = ($is_ascending) ? 0 :
+            $this->last_offset;
+        if (($is_ascending && $generation < $this->num_generations) ||
+            (!$is_ascending && $generation >= 0) ) {
             $this->getShardInfo($generation);
         }
     }
@@ -295,8 +336,11 @@
      * and generation; -1 on fail
      */
     public function currentGenDocOffsetWithWord()
     {
-        if (($this->current_offset > $this->last_offset ||
-            $this->current_generation >= $this->num_generations)) {
+        $is_ascending = ($this->direction == self::ASCENDING);
+        if (($is_ascending && ($this->current_offset > $this->last_offset ||
+            $this->current_generation >= $this->num_generations)) ||
+            (!$is_ascending && ($this->current_offset < 0 ||
+            $this->current_generation < 0))) {
             return -1;
         }
         return [$this->current_generation, $this->current_offset];
diff --git a/src/library/index_bundle_iterators/NegationIterator.php b/src/library/index_bundle_iterators/NegationIterator.php
index a3ad9c82b..f94d0ad8f 100644
--- a/src/library/index_bundle_iterators/NegationIterator.php
+++ b/src/library/index_bundle_iterators/NegationIterator.php
@@ -142,16 +142,15 @@ class NegationIterator extends IndexBundleIterator
             $old_gen_offset_all) == 0)) {
             return -1;
         }
-        $gen_offset_term =
-            $this->index_bundle_iterators[
-            1]->currentGenDocOffsetWithWord();
+        $gen_offset_term = $this->index_bundle_iterators[
+            1]->currentGenDocOffsetWithWord();
         if ($gen_offset_term == -1 || ($changed_term &&
             $this->genDocOffsetCmp($gen_offset_term,
             $old_gen_offset_term) == 0)) {
             return -1;
         }
         $gen_doc_cmp = $this->genDocOffsetCmp($gen_offset_all,
-            $gen_offset_term);
+            $gen_offset_term, $this->getDirection());
         if ($gen_doc_cmp > 0) {
             $this->index_bundle_iterators[1]->advance($gen_offset_all);
             $old_gen_offset_term = $gen_offset_term;
diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php
index b08b57346..791592618 100644
--- a/src/library/media_jobs/FeedsUpdateJob.php
+++ b/src/library/media_jobs/FeedsUpdateJob.php
@@ -36,7 +36,7 @@
 use seekquarry\yioop\library as L;
 use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\FetchUrl;
 use seekquarry\yioop\library\IndexShard;
-use seekquarry\yioop\library\IndexArchiveBundle;
+use seekquarry\yioop\library\FeedArchiveBundle;
 use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\UrlParser;
@@ -47,14 +47,7 @@ use seekquarry\yioop\library\UrlParser;
  */
 class FeedsUpdateJob extends MediaJob
 {
-    /**
-     * how long in seconds before a feed item expires
-     */
-    const ITEM_EXPIRES_TIME = 4 * C\ONE_WEEK;
-    /**
-     * Mamimum number of feeds to download in one try
-     */
-    const MAX_FEEDS_ONE_GO = 100;
+    /**
      * Time in current epoch when feeds last updated
      * @var int
     */
@@ -66,6 +59,22 @@
      * @var object
      */
     public $db;
+    /**
+     * @var IndexArchiveBundle
+     */
+    public $index_archive;
+    /**
+     * @var array
+     */
+    public $found_items;
+    /**
+     * Maximum number of feeds to download in one try
+     */
+    const MAX_FEEDS_ONE_GO = 100;
+    /**
+     * how long in seconds before a feed item expires
+     */
+    const OLD_ITEM_TIME = 4 * C\ONE_WEEK;
     /**
      * Initializes the last update time to far in the past so, feeds will get
      * immediately updated. Sets up connect to DB to store feeds items, and
@@ -77,9 +86,14 @@
         $this->update_time = 0;
         $this->name_server_does_client_tasks = true;
         $this->name_server_does_client_tasks_only = true;
+        $dir = C\CRAWL_DIR . '/cache/' .
+            self::feed_index_data_base_name;
+        $info['DESCRIPTION'] = "feed";
+        $this->index_archive = new FeedArchiveBundle($dir, false,
+            serialize($info), C\NUM_DOCS_PER_GENERATION);
         $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS). "Manager";
         $this->db = new $db_class();
         $this->db->connect();
+        $this->db->setWorldPermissionsRecursive($dir);
         C\nsconddefine("FEEDS_UPDATE_INTERVAL", C\ONE_HOUR);
     }
     /**
@@ -121,8 +135,8 @@
     }
     /**
      * For each feed source downloads the feeds, checks which items are
-     * not in the database, adds them. Then calls the method to rebuild the
-     * inverted index shard for feeds
+     * new, and makes an array of them. Then calls the method to add these
+     * items to the IndexArchiveBundle for feeds
      *
      * @param array $tasks array of feed info (url to download, paths to
      *      extract etc)
@@ -138,18 +152,19 @@
         L\crawlLog("----This media updater is responsible for the feeds:");
         $i = 1;
         foreach ($feeds as $feed) {
-            L\crawlLog("---- $i. ".$feed["NAME"]);
+            L\crawlLog("---- $i. " . $feed["NAME"]);
             $i++;
         }
         $num_feeds = count($feeds);
         $feeds_one_go = self::MAX_FEEDS_ONE_GO;
         $limit = 0;
+        $this->found_items = [];
         while ($limit < $num_feeds) {
             $feeds_batch = array_slice($feeds, $limit, $feeds_one_go);
-            $this->updateFeedItemsOneGo($feeds_batch, self::ITEM_EXPIRES_TIME);
+            $this->updateFoundItemsOneGo($feeds_batch, self::OLD_ITEM_TIME);
             $limit += $feeds_one_go;
         }
-        $this->rebuildFeedShard(self::ITEM_EXPIRES_TIME);
+        $this->addFoundItemsInvertedIndex(self::OLD_ITEM_TIME);
     }
     /**
      * Handles the request to get the array of feed sources which hash to
@@ -216,7 +231,7 @@
      * @return mixed either true, or if $test_mode is true then the results
      * as a string of downloading the feeds and extracting the feed items
      */
-    public function updateFeedItemsOneGo($feeds, $age = C\ONE_WEEK,
+    public function updateFoundItemsOneGo($feeds, $age = C\ONE_WEEK,
         $test_mode = false)
     {
         $test_results = "";
@@ -596,25 +611,13 @@
      * shard to be active. If this method is going to take max_execution_time/2
      * it returns false, so an additional job can be schedules; otherwise
      * it returns true
-     *
      * @param int $age how many seconds old records should be deleted
+     *
      * @return bool whether job executed to complete
      */
-    public function rebuildFeedShard($age)
+    public function addFoundItemsInvertedIndex($age)
     {
-        $time = time();
-        $prune_shard_name = C\WORK_DIRECTORY . "/feeds/prune_index";
-        $dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name;
-        $info['DESCRIPTION'] = "feed";
-        $index_archive = new IndexArchiveBundle($dir, false,
-            serialize($info), C\NUM_DOCS_PER_GENERATION, self::DESCENDING);
-        $this->db->setWorldPermissionsRecursive($dir);
-        $prune_shard = new IndexShard($prune_shard_name);
-        $too_old = $time - $age;
-        $num_sites = 0;
-        if (!$prune_shard) {
-            return false;
-        }
+        $items = $this->found_items;
         $pre_feeds = $this->tasks;
         if (!$pre_feeds) {
             return false;
         }
@@ -626,90 +629,83 @@
             }
             $feeds[$pre_feed['NAME']] = $pre_feed;
         }
+        $time = time();
+        $tmp_shard_name = C\WORK_DIRECTORY . "/data/tmp_index";
+        $tmp_shard = new IndexShard($tmp_shard_name);
+        $num_sites = 0;
+        if (!$tmp_shard) {
+            return false;
+        }
         $db = $this->db;
-        // we now rebuild the inverted index with the remaining items
-        $sql = "SELECT * FROM FEED_ITEM WHERE PUBDATE >= ? " .
-            "ORDER BY PUBDATE ASC";
+        $completed = true;
+        L\crawlLog("----.. Creating inverted index of new items.");
+        $i = 0;
+        $term_counts = [];
         $seen_url_count = 0;
-        $seen_sites = [];
-        $result = $db->execute($sql, [$too_old]);
-        if ($result) {
-            $completed = true;
-            L\crawlLog("----..Making new index" .
-                " of non-pruned items.");
-            $i = 0;
-            $term_counts = [];
-            while ($item = $db->fetchArray($result)) {
-                L\crawlTimeoutLog(
-                    "----..have added %s non-pruned items to index.", $i);
-                $i++;
-                if (!isset($item['SOURCE_NAME'])) {
-                    continue;
-                }
-                $source_name = $item['SOURCE_NAME'];
-                if (isset($feeds[$source_name])) {
-                    $lang = $feeds[$source_name]['LANGUAGE'];
-                    $media_category = $feeds[$source_name]['CATEGORY'];
-                } else {
-                    $lang = "";
-                    $media_category = "news";
-                }
-                $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
-                $word_and_qa_lists = PhraseParser::extractPhrasesInLists(
-                    $phrase_string, $lang);
-                $raw_guid = L\unbase64Hash($item["GUID"]);
-                $doc_keys = L\crawlHash($item["LINK"], true) .
-                    $raw_guid . "d". substr(L\crawlHash(
-                    UrlParser::getHost($item["LINK"]) . "/", true), 1);
-                $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
-                    $source_name, $item["GUID"], $media_category);
-                $len = strlen($phrase_string);
-                $word_list = $word_and_qa_lists["WORD_LIST"];
-                if (PhraseParser::computeSafeSearchScore($word_list, $len,
-                    $item["LINK"]) < 0.012) {
-                    $meta_ids[] = "safe:true";
-                    $meta_ids[] = "safe:all";
-                } else {
-                    $meta_ids[] = "safe:false";
-                    $meta_ids[] = "safe:all";
-                }
-                $prune_shard->addDocumentWords($doc_keys,
-                    self::NEEDS_OFFSET_FLAG, $word_and_qa_lists["WORD_LIST"],
-                    $meta_ids, true, false);
-                $this->updateTrendingTermCounts($term_counts, $phrase_string,
-                    $word_and_qa_lists["WORD_LIST"], $media_category,
-                    $source_name, $lang,
-                    $item['PUBDATE']);
-                $seen_url_count += 1;
-                $page = [];
-                $page[self::TITLE] = $item['TITLE'];
-                $page[self::DESCRIPTION] = $item['DESCRIPTION'];
-                $page[self::URL] = $item['LINK'];
-                $page[self::HASH] = $item['GUID'];
-                $page[self::SOURCE_NAME] = $item['SOURCE_NAME'];
-                $page[self::IMAGE_LINK] = $item['IMAGE_LINK'];
-                $page[self::PUBDATE] = $item['PUBDATE'];
-                $seen_sites[] = $page;
+        foreach ($items as $item) {
+            L\crawlTimeoutLog(
+                "----..have added %s items to new item index.", $i);
+            $i++;
+            if (!isset($item['SOURCE_NAME'])) {
+                continue;
+            }
+            $source_name = $item['SOURCE_NAME'];
+            if (isset($feeds[$source_name])) {
+                $lang = $feeds[$source_name]['LANGUAGE'];
+                $media_category = $feeds[$source_name]['CATEGORY'];
+            } else {
+                $lang = "";
+                $media_category = "news";
+            }
+            $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
+            $word_and_qa_lists = PhraseParser::extractPhrasesInLists(
+                $phrase_string, $lang);
+            $raw_guid = L\unbase64Hash($item["GUID"]);
+            $doc_keys = L\crawlHash($item["LINK"], true) .
+                $raw_guid . "d". substr(L\crawlHash(
+                UrlParser::getHost($item["LINK"]) . "/", true), 1);
+            $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
+                $source_name, $item["GUID"], $media_category);
+            $len = strlen($phrase_string);
+            $word_list = $word_and_qa_lists["WORD_LIST"];
+            if (PhraseParser::computeSafeSearchScore($word_list, $len,
+                $item["LINK"]) < 0.012) {
+                $meta_ids[] = "safe:true";
+                $meta_ids[] = "safe:all";
+            } else {
+                $meta_ids[] = "safe:false";
+                $meta_ids[] = "safe:all";
+            }
+            $tmp_shard->addDocumentWords($doc_keys,
+                self::NEEDS_OFFSET_FLAG, $word_and_qa_lists["WORD_LIST"],
+                $meta_ids, true, false);
+            $this->updateTrendingTermCounts($term_counts, $phrase_string,
+                $word_and_qa_lists["WORD_LIST"], $media_category,
+                $source_name, $lang,
+                $item['PUBDATE']);
+            $seen_url_count += 1;
+            $page = [];
+            $page[self::TITLE] = $item['TITLE'];
+            $page[self::DESCRIPTION] = $item['DESCRIPTION'];
+            $page[self::URL] = $item['LINK'];
+            $page[self::HASH] = $item['GUID'];
+            $page[self::SOURCE_NAME] = $item['SOURCE_NAME'];
+            $page[self::IMAGE_LINK] = $item['IMAGE_LINK'];
+            $page[self::PUBDATE] = $item['PUBDATE'];
+            $seen_sites[] = $page;
         }
-        unset($term_counts['seen']);
-        $this->addTermCountsTrendingTable($db, $term_counts);
-        L\crawlLog("----..deleting old feed items");
-        $sql = " DELETE FROM FEED_ITEM ";
-        $db->execute($sql);
-        L\crawlLog("----..done deleting old items");
+        unset($term_counts['seen']);
+        $this->addTermCountsTrendingTable($db, $term_counts);
+        L\crawlLog("----..adding items to IndexArchiveBundle");
         // 1. check if index shard is full or not. if it is, new gen
-        $generation = $index_archive->initGenerationToAdd(
-            $prune_shard->num_docs);
+        $generation = $this->index_archive->initGenerationToAdd(
+            $tmp_shard->num_docs);
         $summary_offsets = [];
         if (!empty($seen_sites)) {
             // 2. add pages, get summary_offset
-            $index_archive->addPages($generation,
-                self::SUMMARY_OFFSET, $seen_sites, $seen_url_count);
-            // keeping track of duplicates
-            $sql = " INSERT INTO FEED_ITEM (GUID) VALUES (?)";
+            $this->index_archive->addPagesAndSeenKeys($generation,
+                self::SUMMARY_OFFSET, self::HASH, $seen_sites,
+                $seen_url_count);
             foreach ($seen_sites as $site) {
-                $result = $db->execute($sql, [$site[self::HASH]]);
                 $site_url = str_replace('|', "%7C", $site[self::URL]);
                 $host = UrlParser::getHost($site_url);
                 $raw_guid = L\unbase64Hash($site[self::HASH]);
@@ -720,17 +716,17 @@
             }
             unset($seen_sites);
         }
-        $prune_string = $prune_shard->save(true, true);
-        $tmp_shard = IndexShard::load("news" , $prune_string);
+        $tmp_string = $tmp_shard->save(true, true);
+        $tmp_shard = IndexShard::load("feed_data", $tmp_string);
         if (!empty($summary_offsets)) {
             $tmp_shard->changeDocumentOffsets($summary_offsets);
-            $index_archive->addIndexData($tmp_shard);
+            $this->index_archive->addIndexData($tmp_shard);
         }
-        $index_archive->stopIndexingBundle();
-        if (file_exists($prune_shard_name)) {
-            unlink($prune_shard_name);
+        $this->index_archive->forceSave();
+        if (file_exists($tmp_shard_name)) {
+            unlink($tmp_shard_name);
         }
-        unset($prune_shard);
+        unset($tmp_shard);
         set_error_handler(null);
         set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
     }
@@ -873,7 +869,7 @@
             "AND CATEGORY = ?";
         $interval_sql = "SELECT TERM, SUM(OCCURRENCES) AS OCCURRENCES ".
             "FROM TRENDING_TERM WHERE UPDATE_PERIOD = ? AND " .
-            "TIMESTAMP > ? AND LANGUAGE = ? GROUP BY TERM ".
+            "TIMESTAMP > ? AND LANGUAGE = ? AND CATEGORY = ? GROUP BY TERM ".
            "ORDER BY OCCURRENCES DESC ".
            $db->limitOffset(C\NUM_TRENDING);
         $insert_sql = "INSERT INTO TRENDING_TERM (TERM, OCCURRENCES, " .
@@ -904,7 +900,7 @@
             }
         }
         $result = $db->execute($interval_sql,
-            [$sub_interval, $interval_start, $lang]);
+            [$sub_interval, $interval_start, $lang, $category]);
         while ($interval_info = $db->fetchArray($result)) {
             $db->execute($insert_sql, [$interval_info['TERM'],
                 $interval_info['OCCURRENCES'], $interval, $time,
@@ -952,53 +948,42 @@
             strlen($item["link"]) > C\MAX_URL_LEN) {
             return false;
         }
-        $item["title"] = substr($item["title"], 0, C\TITLE_LEN);
-        $item["description"] = substr($item["description"], 0,
+        $out_item = [];
+        $out_item["TITLE"] = substr($item["title"], 0, C\TITLE_LEN);
+        $out_item["DESCRIPTION"] = substr($item["description"], 0,
             C\MAX_GROUP_POST_LEN);
+        $out_item["LINK"] = $item["link"];
         if (empty($item["guid"])) {
             $hash_string = "";
             foreach ($unique_fields as $field) {
                 $hash_string .= $item[$field];
             }
-            $item["guid"] = L\crawlHash($hash_string);
+            $out_item["GUID"] = L\crawlHash($hash_string);
         } else {
-            $item["guid"] = L\crawlHash($item["guid"]);
+            $out_item["GUID"] = L\crawlHash($item["guid"]);
         }
         if (!isset($item["image_link"]) ||
             strlen($item["image_link"]) > C\MAX_URL_LEN) {
-            $item["image_link"] = "";
+            $out_item["IMAGE_LINK"] = "";
+        } else {
+            $out_item["IMAGE_LINK"] = $item["image_link"];
         }
-        $raw_guid = L\unbase64Hash($item["guid"]);
         if (!isset($item["pubdate"]) || $item["pubdate"] == "") {
-            $item["pubdate"] = time();
+            $out_item["PUBDATE"] = time();
         } else {
-            $item["pubdate"] = strtotime($item["pubdate"]);
-            if ($item["pubdate"] < 0) {
-                $item["pubdate"] = time();
+            $out_item["PUBDATE"] = strtotime($item["pubdate"]);
+            if ($out_item["PUBDATE"] < 0) {
+                $out_item["PUBDATE"] = time();
             }
         }
-        if (time() - $item["pubdate"] > $age) {
+        if (time() - $out_item["PUBDATE"] > $age) {
             return false;
         }
-        $sql = "SELECT COUNT(*) AS NUMBER FROM FEED_ITEM WHERE GUID = ?";
-        $db = $this->db;
-        $result = $db->execute($sql, [$item["guid"]]);
-        if ($result) {
-            $row = $db->fetchArray($result);
-            if ($row["NUMBER"] > 0) {
-                return false;
-            }
-        } else {
-            return true;
-        }
-        $sql = "INSERT INTO FEED_ITEM (GUID, TITLE, LINK, IMAGE_LINK,".
-            "DESCRIPTION, PUBDATE, SOURCE_NAME) VALUES (?, ?, ?, ?, ?, ?, ?)";
-        $result = $db->execute($sql, [$item['guid'],
-            $item['title'], $item['link'], $item['image_link'],
-            $item['description'], $item['pubdate'], $source_name]);
-        if (!$result) {
+        if ($this->index_archive->contains($out_item["GUID"])) {
             return false;
         }
+        $out_item['SOURCE_NAME'] = $source_name;
+        $this->found_items[] = $out_item;
         return true;
     }
     /**