viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/executables/Fetcher.php | |
src/library/PhraseParser.php | |
src/locale/en_US/resources/Tokenizer.php | |
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 9aeee3e8a..df11281ac 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2931,8 +2931,7 @@ class Fetcher implements CrawlConstants mb_substr($site[self::DESCRIPTION], 0, C\AD_HOC_TITLE_LENGTH), $site[self::LANG]); $word_and_qa_lists = - PhraseParser::extractPhrasesInLists($phrase_string, - $lang); + PhraseParser::extractPhrasesInLists($phrase_string, $lang); $word_lists = $word_and_qa_lists['WORD_LIST']; $len = strlen($phrase_string); if (isset($this->programming_language_extension[$lang]) || diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index d1b49ec1c..7b9963903 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -354,16 +354,16 @@ class PhraseParser array_slice($parts, $i, $j - $i))); $lower_entity = mb_strtolower($current_entity); if ($j - $i > 1) { + $contains = false; if (NWordGrams::ngramsContains( $lower_entity, $lang, "all")) { $last_entity = $current_entity; $lower_last_entity = $lower_entity; $k = $j; + $contains = true; } if (!NWordGrams::ngramsContains( $lower_entity . "*", $lang, "all")) { - $last_entity = trim($last_entity); - $lower_last_entity = trim($lower_last_entity); // extra checks as Bloom filter not 100% if (strpos(substr($last_entity, 4), " ") > 0 && !preg_match('/\-|\(|\)|\[|\]|,|\./', $last_entity) && @@ -380,11 +380,16 @@ class PhraseParser $j = $k - 1; } } else { + $contains = false; $last_entity = $current_entity; $lower_last_entity = $lower_entity; $k = $j; } } + if ($contains && strpos(substr($current_entity, 4), " ") > 0 && + !preg_match('/\-|\(|\)|\[|\]|,|\./', $current_entity)) { + $current_entity = str_replace(" ", "-", $current_entity); + } $string = $out_string . " " . 
$current_entity; } /** diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index 40b4ce741..790299cc6 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -475,10 +475,18 @@ class Tokenizer if (in_array($word, self::$no_stem_list)) { return $word; } + // for hyphenated words, only stem last word -- my change CP + if (($last_hyphen = strrpos($word, "-")) > 0) { + $before_hyphen = substr($word, 0, $last_hyphen + 1); + $after_hyphen = substr($word, -$last_hyphen); + return $before_hyphen . self::stem($after_hyphen); + } self::$buffer = $word; self::$k = strlen($word) - 1; self::$j = self::$k; - if (self::$k <= 1) { return $word; } + if (self::$k <= 1) { + return $word; + } self::step1ab(); self::step1c(); self::step2();