viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2024 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Used for crawlLog, crawlHash, and garbageCollect
 */
require_once __DIR__ . '/Utility.php';
/**
 * Subclass of IndexDocumentBundle with bloom filters to make it easy to check
 * if a news feed item has been added to the bundle already before adding it
 *
 * @author Chris Pollett
 */
class FeedDocumentBundle extends IndexDocumentBundle
{
    /**
     * how long in seconds before a feed item expires
     */
    const OLD_ITEM_TIME = 4 * C\ONE_WEEK;
    /**
     * Reference to a DatasourceManager to communicate with the database to
     * get a list of search sources (news feeds) associated with this
     * feed bundle
     * @var DatasourceManager
     */
    public $db;
    /**
     * Array of information about the search sources (news feeds) that
     * were used to collect news items stored in this bundle
     * @var array
     */
    public $feeds;
    /**
     * Used to store unique identifiers of feed items that have been stored
     * in this FeedDocumentBundle. This filter_a is used for checking if items
     * are already in the bundle. Once it holds more than URL_FILTER_SIZE/2
     * items, filter_b is added to as well as filter_a. Once filter_a holds
     * more than URL_FILTER_SIZE items, filter_a is deleted, filter_b is
     * renamed to filter_a and the process is repeated.
     * @var BloomFilterFile
     */
    public $filter_a;
    /**
     * Auxiliary BloomFilterFile used in checking if feed items are in this
     * bundle or not. @see $filter_a
     * @var BloomFilterFile
     */
    public $filter_b;
    /**
     * Makes or initializes a FeedDocumentBundle with the provided parameters
     *
     * @param string $dir_name folder name to store this bundle
     * @param object $db datasource manager stored in $this->db and used
     *      when writing trending term counts to the database
     * @param bool $read_only_archive whether to open archive only for reading
     *      or reading and writing
     * @param string $description a text name/serialized info about this
     *      IndexDocumentBundle
     * @param int $num_docs_per_partition the number of pages to be stored
     *      in a single shard
     */
    public function __construct($dir_name, $db, $read_only_archive = true,
        $description = null,
        $num_docs_per_partition = C\NUM_DOCS_PER_PARTITION)
    {
        parent::__construct($dir_name, $read_only_archive, $description,
            $num_docs_per_partition);
        $this->feeds = []; // set by FeedsUpdateJob
        $this->db = $db;
        $filter_a_path = $dir_name . "/filter_a.ftr";
        $filter_b_path = $dir_name . "/filter_b.ftr";
        if (file_exists($filter_a_path)) {
            $this->filter_a = BloomFilterFile::load($filter_a_path);
            /* the path stored with the saved filter may be stale, so
               point the filter at where it was actually loaded from
             */
            $this->filter_a->filename = $filter_a_path;
        } else {
            $this->filter_a = new BloomFilterFile($filter_a_path,
                C\URL_FILTER_SIZE);
            set_error_handler(null);
            @chmod($filter_a_path, 0755);
            set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        }
        if (file_exists($filter_b_path)) {
            /* the loaded auxiliary filter must go into filter_b; storing it
               in filter_a would discard the active filter and leave
               filter_b null when its filename is set on the next line
             */
            $this->filter_b = BloomFilterFile::load($filter_b_path);
            $this->filter_b->filename = $filter_b_path;
        } else {
            $this->filter_b = null;
        }
    }
    /**
     * Adds pages of feed items to document bundle and adds their unique hashes
     * (guids) to bloom filters so they are not reindexed
     *
     * @param array $pages array of feed items
     * @param int $visited_urls_count number of feed items
     * @return bool whether or not succeeded in adding pages
     */
    public function addPagesAndSeenKeys($pages, $visited_urls_count)
    {
        foreach ($pages as $page) {
            $this->addFilters($page[self::SUMMARY][self::HASH]);
        }
        return parent::addPages($pages, $visited_urls_count);
    }
    /**
     * Adds the key (often GUID) of a feed item to the bloom filter pair
     * associated with this bundle. This always adds to filter a; if
     * filter a is more than half full it also adds to filter b. If filter a
     * is full it is deleted and filter b is renamed filter a, and the process
     * continues with a new filter b created once the new filter a becomes
     * half full.
     *
     * @param string $key unique identifier of a feed item
     */
    public function addFilters($key)
    {
        $filter_a_path = $this->dir_name . "/filter_a.ftr";
        $filter_b_path = $this->dir_name . "/filter_b.ftr";
        if ($this->filter_a->count > C\URL_FILTER_SIZE / 2 &&
            !$this->filter_b) {
            if (file_exists($filter_b_path)) {
                $this->filter_b = BloomFilterFile::load($filter_b_path);
                $this->filter_b->filename = $filter_b_path;
            } else {
                $this->filter_b = new BloomFilterFile($filter_b_path,
                    C\URL_FILTER_SIZE);
                // the new filter may not have been written to disk yet
                if (file_exists($filter_b_path)) {
                    chmod($filter_b_path, 0755);
                }
            }
        }
        if ($this->filter_a->count > C\URL_FILTER_SIZE && $this->filter_b) {
            if (file_exists($filter_b_path)) {
                if (file_exists($filter_a_path)) {
                    unlink($filter_a_path);
                }
                rename($filter_b_path, $filter_a_path);
            }
            /* promote the in-memory filter too; otherwise filter_a would
               stay over-full and this rotation would rerun on every call.
               If filter b was never written to disk, the old filter a file
               is left in place until the promoted filter is next saved.
             */
            $this->filter_a = $this->filter_b;
            $this->filter_a->filename = $filter_a_path;
            $this->filter_b = null;
        }
        $this->filter_a->add($key);
        if ($this->filter_b) {
            $this->filter_b->add($key);
        }
    }
    /**
     * Builds the inverted index files (doc map, postings, positions, last
     * entries) for the feed items stored in one partition of this bundle.
     * For each item that has a source name, the title and description are
     * split into terms, meta words are computed, and the item's terms are
     * added to the posting lists and to the trending term counts. Unless
     * $just_stats is set, the trending counts are then written to the
     * database and the index files are saved in the partition's base folder.
     *
     * @param int $partition bundle partition to build inverted index for;
     *      a negative value means use the current SAVE_PARTITION
     * @param string $taking_too_long_touch name of a file which, if it
     *      exists, is touched whenever crawlTimeoutLog reports progress,
     *      so that other processes can see this job is still active
     * @param bool $just_stats whether to just compute stats on the inverted
     *      index or to actually save the results
     * @return mixed false if the partition has no document index file;
     *      if $just_stats, an array with NUM_DOCS, NUM_LINKS,
     *      and TERM_STATISTICS (the latter having term frequency info);
     *      otherwise, true once the index files have been saved
     */
    public function buildInvertedIndexPartition($partition = -1,
        $taking_too_long_touch = null, $just_stats = false)
    {
        $pre_feeds = $this->feeds ?? [];
        if ($partition < 0) {
            $partition = $this->documents->parameters["SAVE_PARTITION"];
        }
        $dockeys_filename = $this->documents->getPartitionIndex($partition);
        if (!file_exists($dockeys_filename)) {
            crawlLog("----.. No new items to index returning.");
            return false;
        }
        crawlLog("----.. Loading guids of items to index.");
        $doc_index = $this->documents->loadPartitionIndex($partition, true);
        $doc_ids = array_keys($doc_index);
        $num_ids = count($doc_ids);
        crawlLog("----.. Loaded $num_ids items in partition.");
        // index feed configurations by name for lookup per item
        $feeds = [];
        foreach ($pre_feeds as $pre_feed) {
            if (!isset($pre_feed['NAME'])) {
                continue;
            }
            $feeds[$pre_feed['NAME']] = $pre_feed;
        }
        crawlLog("----.. Creating inverted index of items.");
        $term_counts = [];
        $base_folder = $this->getPartitionBaseFolder($partition);
        if (!file_exists($base_folder)) {
            mkdir($base_folder);
        }
        $doc_map_tools = $this->doc_map_tools;
        $postings_tools = $this->postings_tools;
        $last_entries_tools = $this->last_entries_tools;
        $doc_map_filename = $base_folder . "/" . self::DOC_MAP_FILENAME;
        $postings_filename = $base_folder . "/" . self::POSTINGS_FILENAME;
        $positions_filename = $base_folder . "/" . self::POSITIONS_FILENAME;
        $last_entries_filename = $base_folder . "/" .
            self::LAST_ENTRIES_FILENAME;
        $this->doc_map = "";
        $this->doc_map_counter = 0;
        $this->postings = [];
        $this->positions = "";
        $this->last_entries = [];
        /* the loop header is the only place $i advances; a second increment
           in the body would skip every other item
         */
        for ($i = 0; $i < $num_ids; $i++) {
            $doc_id = $doc_ids[$i];
            $item = $this->getSummary($doc_id, $partition);
            if (crawlTimeoutLog(
                "----..have added %s items to new index.", $i) &&
                $taking_too_long_touch) {
                if (file_exists($taking_too_long_touch)) {
                    touch($taking_too_long_touch, time());
                }
            }
            if (!isset($item[self::SOURCE_NAME])) {
                continue;
            }
            $source_name = $item[self::SOURCE_NAME];
            if (isset($feeds[$source_name])) {
                $feed = $feeds[$source_name];
                $lang = $feed['LANGUAGE'] ?? C\DEFAULT_LOCALE;
                $media_category = $feed['CATEGORY'] ?? "news";
                $source_stop_regex = $feed['TRENDING_STOP_REGEX'] ?? "";
            } else {
                $lang = C\DEFAULT_LOCALE;
                $media_category = "news";
                $source_stop_regex = "";
            }
            /* r6t was chosen as short enough not to be changed by
               chargramming, but rare enough that can be used as a useful
               splitter
             */
            $phrase_string = $item[self::TITLE] . " r6t " .
                $item[self::DESCRIPTION];
            $word_and_qa_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $meta_ids = $this->calculateMetas($lang, $item[self::PUBDATE],
                $source_name, $item[self::HASH], $media_category);
            $word_list = $word_and_qa_lists["WORD_LIST"];
            if (PhraseParser::computeSafeSearchScore($phrase_string,
                $item[self::URL]) < PhraseParser::SAFE_PHRASE_THRESHOLD) {
                $meta_ids[] = "safe:true";
            } else {
                $meta_ids[] = "safe:false";
            }
            $meta_ids[] = "safe:all";
            /* the first position of the r6t splitter marks where the title
               ends; the splitter itself should not be indexed as a term
             */
            $title_length = 0;
            if (!empty($word_list["r6t"][0])) {
                $title_length = $word_list["r6t"][0] + 1;
                unset($word_list["r6t"]);
            }
            $num_words = 0;
            foreach ($word_list as $position_list) {
                $num_words += count($position_list);
            }
            $this->addScoresDocMap($doc_id, $num_words,
                intval($item[self::PUBDATE]), 0, $title_length,
                $title_length, [], []);
            $this->addTermPostingLists(0, $word_list, $meta_ids,
                $this->doc_map_counter);
            $this->doc_map_counter++;
            $this->updateTrendingTermCounts($term_counts, $phrase_string,
                $word_list, $media_category, $source_name, $lang,
                $item[self::PUBDATE], $source_stop_regex);
        }
        if ($just_stats) {
            $term_stats = [];
            foreach ($this->postings as $term => $postings) {
                $posting_records = $postings_tools->unpack($postings);
                $term_stats[$term] = count($posting_records);
            }
            /* doc_map is initialised above as a string, and count() on a
               string is a TypeError in PHP 8, so fall back to the number
               of documents added in the loop
             */
            $num_docs = is_countable($this->doc_map) ?
                count($this->doc_map) : $this->doc_map_counter;
            return [
                "NUM_DOCS" => $num_docs,
                "NUM_LINKS" => 0,
                "TERM_STATISTICS" => $term_stats
            ];
        }
        // 'seen' is bookkeeping of updateTrendingTermCounts, not a language
        unset($term_counts['seen']);
        $this->addTermCountsTrendingTable($term_counts);
        $doc_map_tools->save($doc_map_filename, $this->doc_map);
        $postings_tools->save($postings_filename, $this->postings);
        $last_entries_tools->save($last_entries_filename,
            $this->last_entries);
        file_put_contents($positions_filename, $this->positions);
        set_error_handler(null);
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        return true;
    }
    /**
     * Given a $site array of information about a web page/document. Use
     * CrawlConstant::URL and CrawlConstant::HASH fields to compute a
     * unique doc id for the array.
     *
     * @param array $site site to compute doc_id for
     * @return string doc_id
     */
    public static function computeDocId($site)
    {
        $raw_guid = unbase64Hash($site[self::HASH]);
        $host_hash = crawlHash(UrlParser::getHost($site[self::URL]) . "/",
            true);
        // chr(24) refers to 'feed' doc type
        return crawlHash($site[self::URL], true) . $raw_guid . chr(24) .
            substr($host_hash, 1);
    }
    /**
     * Used to calculate the meta words for RSS feed items
     *
     * @param string $lang the locale_tag of the feed item
     * @param int $pubdate UNIX timestamp publication date of item
     * @param string $source_name the name of the feed
     * @param string $guid the guid of the item
     * @param string $media_category determines what media: metas to inject.
     *      Default is news.
     *
     * @return array $meta_ids meta words found
     */
    public function calculateMetas($lang, $pubdate, $source_name, $guid,
        $media_category = "news")
    {
        $meta_ids = ["media:all", "media:$media_category",
            "media:$media_category:" . urlencode(
            mb_strtolower($source_name)), "guid:" . strtolower($guid)];
        $meta_ids[] = 'date:all';
        // one date meta per granularity, from year down to second
        $date_formats = ['Y', 'Y-m', 'Y-m-d', 'Y-m-d-H', 'Y-m-d-H-i',
            'Y-m-d-H-i-s'];
        foreach ($date_formats as $date_format) {
            $meta_ids[] = 'date:' . date($date_format, $pubdate);
        }
        $meta_ids[] = 'lang:all';
        if ($lang != "") {
            $lang_parts = explode("-", $lang);
            $meta_ids[] = 'lang:' . $lang_parts[0];
            if (isset($lang_parts[1])) {
                $meta_ids[] = 'lang:' . $lang;
            }
        }
        return $meta_ids;
    }
    /**
     * Whether the active filter for this feed bundle contains the feed item
     * of the supplied key
     *
     * @param string $key the feed item id to check if in bundle
     * @return bool true if it is in the bundle, false otherwise
     */
    public function contains($key)
    {
        return $this->filter_a->contains($key);
    }
    /**
     * Forces the current shard to be saved: builds the inverted index for
     * the current save partition and writes the bloom filters to disk
     */
    public function forceSave()
    {
        $this->buildInvertedIndexPartition();
        $this->filter_a->save();
        chmod($this->dir_name . "/filter_a.ftr", 0777);
        if ($this->filter_b) {
            $this->filter_b->save();
            chmod($this->dir_name . "/filter_b.ftr", 0777);
        }
    }
    /**
     * Updates trending term counts based on the string from the current
     * feed item. Items published more than a day ago are ignored.
     *
     * @param array &$term_counts lang => [category => [term => occurrences]]
     *      together with a 'seen' entry mapping already processed terms to
     *      the display form they are counted under
     * @param string $source_phrase original non-stemmed phrase from feed
     *      item to adjust $term_counts with. Used to remember non-stemmed
     *      terms. We assume we have already extracted position lists from it
     * @param array $word_or_phrase_list associate array of
     *      stemmed_word_or_phrase => positions in feed item of where occurs
     * @param string $media_category of feed source the item came from.
     *      Trending counts are grouped by media category
     * @param string $source_name of feed source the item came from. We
     *      exclude from counts the name of the feed source
     * @param string $lang locale_tag for this feed item
     * @param int $pubdate timestamp when string was published (used in
     *      weighting)
     * @param string $source_stop_regex a regex to remove terms which occur
     *      frequently for this particular source
     */
    public function updateTrendingTermCounts(&$term_counts, $source_phrase,
        $word_or_phrase_list, $media_category, $source_name, $lang,
        $pubdate, $source_stop_regex = "")
    {
        $time = time();
        if ($time - $pubdate > C\ONE_DAY) {
            return;
        }
        $time_weight = max(ceil(($time - $pubdate) / C\ONE_HOUR), 1);
        $tokenizer = PhraseParser::getTokenizer($lang);
        $stop_words = (empty($tokenizer::$stop_words)) ? [] :
            $tokenizer::$stop_words;
        $stop_words[] = "tztzlzngth";
        /* collect the parts of every hyphenated term. array_merge is needed
           here: the + operator on lists keeps only indices not already
           present, so it would drop most sub-terms after the first term
         */
        $composites = [];
        foreach ($word_or_phrase_list as $term => $positions) {
            if (strpos($term, "-") !== false) {
                $composites = array_merge($composites, explode("-", $term));
            }
        }
        /* makes sure a zero count exists for $slot_term in the current
           language and category
         */
        $ensure_count = function ($slot_term) use (&$term_counts, $lang,
            $media_category) {
            if (empty($term_counts[$lang][$media_category][$slot_term])) {
                if (empty($term_counts[$lang])) {
                    $term_counts[$lang] = [];
                }
                if (empty($term_counts[$lang][$media_category])) {
                    $term_counts[$lang][$media_category] = [];
                }
                $term_counts[$lang][$media_category][$slot_term] = 0;
            }
        };
        /* finds the first word of the source phrase beginning with $stem;
           the stem is quoted so regex metacharacters or a "/" in a term
           cannot change or break the pattern
         */
        $find_in_phrase = static function ($stem) use ($source_phrase) {
            $pattern = "/\b" . preg_quote($stem, "/") . '[\w|\-]*\b/ui';
            if (preg_match($pattern, $source_phrase, $match)) {
                return $match[0];
            }
            return null;
        };
        foreach ($word_or_phrase_list as $term => $positions) {
            if ((!empty($source_stop_regex) &&
                preg_match($source_stop_regex, $term)) ||
                !is_string($term) || $term == "") {
                continue;
            }
            $occurrences = count($positions);
            /* NOTE(review): a term without a hyphen gets weight 0 and so
               adds nothing to its count -- confirm this is intended
             */
            $weight = substr_count($term, "-");
            if (!empty($term_counts['seen'][$term])) {
                $term = $term_counts['seen'][$term];
                $ensure_count($term);
                $term_counts[$lang][$media_category][$term] +=
                    ($weight * $occurrences) / $time_weight;
                continue;
            }
            if (strpos($term, " ") !== false ||
                in_array($term, $composites) ||
                in_array($term[0], ["#", "_", "-"]) ||
                in_array($term, $stop_words) ||
                substr($term, 0, 4) == "http") {
                continue;
            }
            $original_term = $term;
            $found = $find_in_phrase($term);
            if ($found !== null) {
                $term = $found;
            } else {
                $term = str_replace("-", " ", $term);
                if (mb_stristr($source_name, $term) !== false) {
                    continue;
                }
                /* NOTE(review): a branch here compared the last six
                   bytes of $term with the eight byte literal
                   "\_pos\_s", which could never match, so it was removed.
                   If possessive markers need special handling it has to
                   be reintroduced deliberately.
                 */
                $found = $find_in_phrase($term);
                if ($found === null) {
                    // retry with the last character of the stem dropped
                    $found = $find_in_phrase(mb_substr($term, 0, -1));
                }
                if ($found === null) {
                    $found = $find_in_phrase(
                        mb_substr($original_term, 0, -1));
                }
                if ($found === null) {
                    continue;
                }
                $term = $found;
            }
            if (!empty($source_stop_regex) &&
                preg_match($source_stop_regex, $term)) {
                continue;
            }
            $lower_term = mb_strtolower($term);
            $lower_parts = preg_split("/\s+|\-/ui", $lower_term);
            $uncommon = false;
            foreach ($lower_parts as $lower_part) {
                if (!empty($lower_part) &&
                    !in_array($lower_part, $stop_words)) {
                    $uncommon = true;
                    break;
                }
            }
            if (!$uncommon) {
                continue;
            }
            $term = preg_replace("/\-/", " ", $lower_term);
            $term = mb_convert_case($term, MB_CASE_TITLE, 'UTF-8');
            $ensure_count($term);
            $term_counts['seen'][$original_term] = $term;
            if (!empty($term_counts['seen'][$lower_term])) {
                // reuse the display form already chosen for this term
                $term = $term_counts['seen'][$lower_term];
                $ensure_count($term);
            }
            $term_counts['seen'][$lower_term] = $term;
            $term_counts[$lang][$media_category][$term] +=
                ($weight * $occurrences) / $time_weight;
        }
    }
    /**
     * Updates TRENDING_TERM with the hourly counts just computed and then
     * recomputes each longer period (daily, weekly, monthly, yearly) from
     * the totals of the period below it. Rows of a shorter period that are
     * older than the window being recomputed are deleted.
     *
     * @param array $term_counts for the most recent update of the
     *      feed index, it should be an array
     *      [$lang => [$category => [$term => $occurrences]]]; only the top
     *      NUM_TRENDING terms per language and category are inserted
     */
    public function addTermCountsTrendingTable($term_counts)
    {
        $db = $this->db;
        $time = time();
        // period whose rows are summed => period that receives the totals
        $update_intervals = [
            C\ONE_HOUR => C\ONE_DAY,
            C\ONE_DAY => C\ONE_WEEK,
            C\ONE_WEEK => C\ONE_MONTH,
            C\ONE_MONTH => C\ONE_YEAR
        ];
        $reset_interval_sql = "DELETE FROM TRENDING_TERM WHERE " .
            "UPDATE_PERIOD = ? AND LANGUAGE = ? AND ".
            "CATEGORY = ? AND TIMESTAMP >= ? ";
        $interval_sql = "SELECT TERM, SUM(OCCURRENCES) AS OCCURRENCES ".
            "FROM TRENDING_TERM WHERE UPDATE_PERIOD = ? AND " .
            "TIMESTAMP >= ? AND LANGUAGE = ? AND CATEGORY = ? GROUP BY TERM ".
            "ORDER BY OCCURRENCES DESC ". $db->limitOffset(C\NUM_TRENDING);
        $insert_sql = "INSERT INTO TRENDING_TERM (TERM, OCCURRENCES, " .
            "UPDATE_PERIOD, TIMESTAMP, LANGUAGE, CATEGORY) VALUES ".
            "(?, ?, ?, ?, ?, ?)";
        $cull_sql = "DELETE FROM TRENDING_TERM WHERE UPDATE_PERIOD = ? " .
            " AND TIMESTAMP < ?";
        $have_culled = [];
        foreach ($term_counts as $lang => $category_term_occurrences) {
            crawlLog("Updating Language $lang trending terms");
            foreach ($category_term_occurrences as $category =>
                $term_occurrences) {
                /*
                    Sort the word occurrence list by occurrence and
                    insert into database.
                 */
                $num_inserted = 0;
                arsort($term_occurrences);
                foreach ($term_occurrences as $term => $occurrences) {
                    $db->execute($insert_sql, [$term, $occurrences,
                        C\ONE_HOUR, $time, $lang, $category]);
                    $num_inserted++;
                    if ($num_inserted >= C\NUM_TRENDING) {
                        break;
                    }
                }
                foreach ($update_intervals as $sub_interval => $interval) {
                    $interval_start = $time - $interval;
                    $recent_timestamp = floor($interval_start / $interval) *
                        $interval;
                    $db->execute($reset_interval_sql, [$interval, $lang,
                        $category, $recent_timestamp]);
                    $result = $db->execute($interval_sql, [$sub_interval,
                        $recent_timestamp, $lang, $category]);
                    while ($interval_info = $db->fetchArray($result)) {
                        $db->execute($insert_sql, [$interval_info['TERM'],
                            $interval_info['OCCURRENCES'], $interval,
                            $recent_timestamp, $lang, $category]);
                    }
                    // each shorter period only needs culling once per call
                    if (empty($have_culled[$sub_interval])) {
                        $db->execute($cull_sql, [$sub_interval,
                            $recent_timestamp]);
                        $have_culled[$sub_interval] = true;
                    }
                }
            }
        }
    }
}