Fixes a bug in hyphenated entities, a=chris

Chris Pollett [2019-01-24, hour 16]

Fixes a bug in hyphenated entities, a=chris

Filename
src/executables/Fetcher.php
src/library/PhraseParser.php
src/locale/en_US/resources/Tokenizer.php

diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 9aeee3e8a..df11281ac 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2931,8 +2931,7 @@ class Fetcher implements CrawlConstants
                     mb_substr($site[self::DESCRIPTION], 0,
                     C\AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                 $word_and_qa_lists =
-                    PhraseParser::extractPhrasesInLists($phrase_string,
-                        $lang);
+                    PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                 $word_lists = $word_and_qa_lists['WORD_LIST'];
                 $len = strlen($phrase_string);
                 if (isset($this->programming_language_extension[$lang]) ||
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index d1b49ec1c..7b9963903 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -354,16 +354,16 @@ class PhraseParser
                 array_slice($parts, $i, $j - $i)));
             $lower_entity = mb_strtolower($current_entity);
             if ($j - $i > 1) {
+                $contains = false;
                 if (NWordGrams::ngramsContains(
                     $lower_entity, $lang, "all")) {
                     $last_entity = $current_entity;
                     $lower_last_entity = $lower_entity;
                     $k = $j;
+                    $contains = true;
                 }
                 if (!NWordGrams::ngramsContains(
                     $lower_entity . "*", $lang, "all")) {
-                    $last_entity = trim($last_entity);
-                    $lower_last_entity = trim($lower_last_entity);
                     // extra checks as Bloom filter not 100%
                     if (strpos(substr($last_entity, 4), " ") > 0 &&
                         !preg_match('/\-|\(|\)|\[|\]|,|\./', $last_entity) &&
@@ -380,11 +380,16 @@ class PhraseParser
                     $j = $k - 1;
                 }
             } else {
+                $contains = false;
                 $last_entity = $current_entity;
                 $lower_last_entity = $lower_entity;
                 $k = $j;
             }
         }
+        if ($contains && strpos(substr($current_entity, 4), " ") > 0 &&
+            !preg_match('/\-|\(|\)|\[|\]|,|\./', $current_entity)) {
+            $current_entity = str_replace(" ", "-", $current_entity);
+        }
         $string = $out_string . " " . $current_entity;
     }
     /**
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index 40b4ce741..790299cc6 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -475,10 +475,18 @@ class Tokenizer
         if (in_array($word, self::$no_stem_list)) {
             return $word;
         }
+        // for hyphenated words, only stem last word -- my change CP
+        if (($last_hyphen = strrpos($word, "-")) > 0) {
+            $before_hyphen = substr($word, 0, $last_hyphen + 1);
+            $after_hyphen = substr($word, -$last_hyphen);
+            return $before_hyphen . self::stem($after_hyphen);
+        }
         self::$buffer = $word;
         self::$k = strlen($word) - 1;
         self::$j = self::$k;
-        if (self::$k <= 1) { return $word; }
+        if (self::$k <= 1) {
+            return $word;
+        }
         self::step1ab();
         self::step1c();
         self::step2();

ViewGit