Fixes bug introduced in named entity predict, fixes miscellaneous notices, a=chris

Chris Pollett [2020-07-13 01h]
Fixes bug introduced in named entity predict, fixes miscellaneous notices, a=chris
Filename
src/controllers/SearchController.php
src/library/NamedEntityContextTagger.php
src/library/StochasticTermSegmenter.php
src/locale/zh_CN/resources/nect_weights.txt.gz
tests/ZhTokenizerTest.php
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 97b3f23cb..798fccecc 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -614,6 +614,7 @@ class SearchController extends Controller implements CrawlConstants
                 $data['CATEGORY_TYPE'] = $media_categories[$key]['TYPE'];
             } else {
                 $fail_category = $this->clean($_REQUEST['category'], 'string');
+                $data['SCRIPT'] ??= "";
                 $data['SCRIPT'] .=
                         "doMessage('<h1 class=\"red\" >".
                         tl('search_controller_no_trend_category',
diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php
index c402393fd..62963016d 100644
--- a/src/library/NamedEntityContextTagger.php
+++ b/src/library/NamedEntityContextTagger.php
@@ -308,15 +308,26 @@ class NamedEntityContextTagger extends ContextTagger
      */
     public function predict($sentence)
     {
-        if (!is_array($sentence)) {
-            if ($sentence == "") {
-                $terms = [];
-            } else {
-                $terms = preg_split("/[\s]+/u", $sentence);
-            }
-        } else {
+        if (empty($sentence)) {
+            return [];
+        }
+        if (is_array($sentence)) {
             $terms = $sentence;
+        } else {
+            $terms = preg_split("/[\s]+/u", $sentence);
+        }
+        if (count($terms) > 1) {
+            $results = [];
+            foreach ($terms as $term) {
+                $entities = $this->predict($term);
+                if (!empty($entities)) {
+                    $results = array_merge($results, $entities);
+                }
+            }
+            return $results;
         }
+        $terms = preg_split('//u', $terms[0], null,
+            PREG_SPLIT_NO_EMPTY);
         if (!count($terms)) {
             return [];
         }
diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index 766c4eca9..98395e9df 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -439,7 +439,8 @@ class StochasticTermSegmenter
                     }
                     $subdic = $subdic[$characters[$j]];
                     if (isset($subdic['$']) && (!isset($score[$j]) ||
-                        (isset($score[$index - 1]) &&
+                        (isset($score[$index - 1]) && is_numeric($subdic['$'])
+                        && is_numeric($subdic['$']) &&
                         $score[$index - 1] + $subdic['$'] < $score[$j]))) {
                         $score[$j] = $score[$index - 1] +
                             $this->getScore($subdic['$']);
diff --git a/src/locale/zh_CN/resources/nect_weights.txt.gz b/src/locale/zh_CN/resources/nect_weights.txt.gz
index a54e319dd..8e6fea731 100755
Binary files a/src/locale/zh_CN/resources/nect_weights.txt.gz and b/src/locale/zh_CN/resources/nect_weights.txt.gz differ
diff --git a/tests/ZhTokenizerTest.php b/tests/ZhTokenizerTest.php
index 1e6698341..aaf6e4cb6 100644
--- a/tests/ZhTokenizerTest.php
+++ b/tests/ZhTokenizerTest.php
@@ -58,8 +58,8 @@ class ZhTokenizerTest extends UnitTest
      */
     public function namedEntityTestCase()
     {
-        $source = "郑振铎 国民党 國家元首 行政權 日本";
-        $expected_tagging = "郑振铎_nr 国民党_nt 日本_ns";
+        $source = "孙向宏喜欢去洛杉矶旅游";
+        $expected_tagging = "孙向宏_nr 洛杉矶_ns";
         $ne_tagger = new L\NamedEntityContextTagger('zh-CN');
         $output_tagging = $ne_tagger->tag($source);
         $this->assertEqual($output_tagging, $expected_tagging,
ViewGit