viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/executables/Fetcher.php | |
src/library/IndexManager.php | |
src/library/PhraseParser.php | |
src/library/media_jobs/FeedsUpdateJob.php | |
src/models/PhraseModel.php | |
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index fca344937..9aeee3e8a 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1677,7 +1677,8 @@ class Fetcher implements CrawlConstants if (!isset($this->hosts_with_errors[$host])) { $this->hosts_with_errors[$host] = 0; } - if ($response_code >= 400 || $response_code < 100) { + if (($response_code >= 400 && $response_code != 404) || + $response_code < 100) { // < 100 will capture failures to connect which are returned // as strings $was_error = true; diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index 1d548d35c..f1ce3a3ce 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -184,11 +184,15 @@ class IndexManager implements CrawlConstants * bits of word id to discard * @param int $threshold after the number of results exceeds this amount * stop looking for more dictionary entries. - * @param int $start_generation - * @param int $num_distinct_generations - * @param bool $with_remaining_total - * @return array sequence of four tuples: - * (index_shard generation, posting_list_offset, length, exact id + * @param int $start_generation what generation in the index to start + * finding occurrence of phrase from + * @param int $num_distinct_generations from $start_generation how + * many generation to search forward to + * @param bool $with_remaining_total whether to total number of + * postings found as well or not + * @return array either [total, sequence of four tuples] + * or sequence of four tuples: + * (index_shard generation, posting_list_offset, length, exact id * that match $hash) */ public static function getWordInfo($index_name, $hash, $shift = 0, @@ -236,9 +240,11 @@ class IndexManager implements CrawlConstants * @param string $index_name index to look up term or phrase in * @param int $threshold if set and positive then once threshold many * documents are found the search for more documents to add to the - * 
total is stoppe - * @param int $start_generation - * @param int $num_distinct_generations + * total is stopped + * @param int $start_generation what generation in the index to start + * finding occurrence of phrase from + * @param int $num_distinct_generations from $start_generation how + * many generation to search forward to * @return int number of documents */ public static function numDocsTerm($term_or_phrase, $index_name, diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 66635e1de..d1b49ec1c 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -137,68 +137,78 @@ class PhraseParser self::hyphenateEntities($string, $lang); } $terms = self::stemCharGramSegment($string, $lang); - $num = count($terms); - if ($index_name == null || $num <= 1 || (class_exists($char_class) && - isset($char_class::$char_gram_len) )) { + $num_terms = count($terms); + if ($index_name == null || $num_terms <= 1 || + (class_exists($char_class) && isset($char_class::$char_gram_len))) { return $terms; } - if (count($terms) > C\MAX_QUERY_TERMS) { - $first_terms = array_slice($terms, 0, C\MAX_QUERY_TERMS); - $whole_phrase = implode(" ", $first_terms); - } else { - $whole_phrase = implode(" ", $terms); - $first_terms = $terms; - } - if ($exact_match) { - /* for exact phrase search do not use suffix tree stuff for now */ + // keep only first C\MAX_QUERY_TERMS many terms + if ($num_terms > C\MAX_QUERY_TERMS) { + $terms = array_slice($terms, 0, C\MAX_QUERY_TERMS); + } + $whole_phrase = implode(" ", $terms); + if ($exact_match || ($index_name != 'feed' && + IndexManager::getVersion($index_name) == 0)) { + /* for exact phrase search do not use suffix tree stuff for now. 
+ Also, for old style index before max phrase extraction + just return terms + */ return $terms; } $tokenizer = self::getTokenizer($lang); + // query terms are question answer triplet then do no further processing if (!empty($tokenizer::$question_token) && stristr($whole_phrase, $tokenizer::$question_token) !== false) { - $terms = [$whole_phrase, $terms[0]]; + return [$whole_phrase]; + } + $terms = self::extractTermsWholePhrase($terms, $index_name, + $threshold); + return $terms; + } + /** + * + */ + public static function extractTermsWholePhrase($terms, $index_name, + $threshold) + { + $num_terms = count($terms); + if ($num_terms <= 1) { return $terms; } + $whole_phrase = implode(" ", $terms); $count_whole_phrase = IndexManager::numDocsTerm($whole_phrase, $index_name, $threshold); - if ($count_whole_phrase >= $threshold - || $num > C\PHRASE_THRESHOLD) { - $terms = [$whole_phrase, $terms[0]]; + /* + If have more than $threshold (default 10, one page) worth of + whole phrase results then use whole phrase for results + */ + if ($count_whole_phrase >= $threshold) { + return [$whole_phrase]; + } else if ($num_terms <= 2) { return $terms; - } else if ($count_whole_phrase > 0) { - foreach ($terms as $term) { - $count_term = IndexManager::numDocsTerm($term, - $index_name, 5 * $threshold); - if ($count_term > 50 * $count_whole_phrase) { - $terms = [$whole_phrase, $terms[0]]; - return $terms; - } - } - } else if ($num > 2) { - $start_terms = $first_terms; - $last_term = array_pop($start_terms); - $start_phrase = implode(" ", $start_terms); - $count_start = IndexManager::numDocsTerm($start_phrase, + } else { + $first_term = array_shift($terms); + $extract_terms = self::extractTermsWholePhrase($terms, $index_name, $threshold); - if ($count_start >= $threshold) { - $terms = [$start_phrase, $last_term, $terms[0]]; - return $terms; + if (count($extract_terms) <= 1) { + if ($count_whole_phrase > 0) { + $count_extract_terms = IndexManager::numDocsTerm( + $whole_phrase, 
$index_name, $threshold); + if ($count_whole_phrase * $threshold > + $count_extract_terms) { + return [$whole_phrase]; + } + } + array_unshift($extract_terms, $first_term); + return $extract_terms; } - $end_terms = $first_terms; - $first_term = array_shift($end_terms); - $end_phrase = implode(" ", $end_terms); - $count_end = IndexManager::numDocsTerm($end_phrase, + $last_term = array_pop($extract_terms); + array_unshift($extract_terms, $first_term); + $rest_terms = self::extractTermsWholePhrase($extract_terms, $index_name, $threshold); - if ($count_end >= $threshold) { - $terms = [$first_term, $end_phrase]; - return $terms; - } - } - if ($index_name != 'feed' && - IndexManager::getVersion($index_name) == 0) { - return $terms; //old style index before max phrase extraction + $rest_terms[] = $last_term; + return $rest_terms; } - return $terms; } /** * Extracts all phrases (sequences of adjacent words) from $string. Does @@ -315,10 +325,13 @@ class PhraseParser } } /** + * Given a string, hyphenates words in the string which appear in + * a bloom filter for the given locale as phrases. + * * @param string& $string a string of words, etc which might involve such * terms * @param $lang a language tag to use as part of the canonicalization - * process not used right now + * process */ public static function hyphenateEntities(&$string, $lang = null) { @@ -362,7 +375,7 @@ class PhraseParser $space = " "; $current_entity = ""; $last_entity = ""; - $lower_last_entity =""; + $lower_last_entity = ""; $i = $k; $j = $k - 1; } diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php index 19ec1574f..8c994f357 100644 --- a/src/library/media_jobs/FeedsUpdateJob.php +++ b/src/library/media_jobs/FeedsUpdateJob.php @@ -665,7 +665,7 @@ class FeedsUpdateJob extends MediaJob $phrase_string, $lang); $raw_guid = L\unbase64Hash($item["GUID"]); $doc_keys = L\crawlHash($item["LINK"], true) . - $raw_guid."d". substr(L\crawlHash( + $raw_guid . "d". 
substr(L\crawlHash( UrlParser::getHost($item["LINK"])."/", true), 1); $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"], $media_category); diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index eebbc2664..3860ec496 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -579,10 +579,7 @@ class PhraseModel extends ParallelModel } else { $new_words = PhraseParser::extractPhrases($phrase_part, $locale_tag, - $index_name); - if (isset($new_words[0]) && strpos($new_words[0], " ") > 0) { - array_pop($new_words); - } + $index_name); $base_words = array_merge($base_words, $new_words); } $num_words = count($base_words);