normalize chinese characters r=chris

Forrest Sun [2020-07-21]
normalize chinese characters r=chris

Signed-off-by: Chris Pollett <chris@pollett.org>
Filename
src/configs/TokenTool.php
src/library/NamedEntityContextTagger.php
src/library/StochasticTermSegmenter.php
src/locale/zh_CN/resources/zh2HansChar.txt.gz
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index 1e8704284..e579f3da2 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -357,12 +357,12 @@ function getTrainingFileNames($command_line_args, $start_index = 4)
     }
     $file_names = [];
     for($i = $start_index + 1; $i < count($command_line_args); $i++) {
-        $file_path = glob($file_path . $command_line_args[$i]);
-        if (count($file_path) == 0) {
+        $files = glob($file_path . $command_line_args[$i]);
+        if (count($files) == 0) {
             echo "error: $file_path{$command_line_args[$i]}: File not found\n";
             exit();
         }
-        $file_names = array_merge($file_names, $file_path);
+        $file_names = array_merge($file_names, $files);
     }
     if ($command_line_args[$start_index] > 0) {
         $file_names = array_slice($file_names, 0, $command_line_args[4]);
diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php
index 6553f3115..af0dd7265 100644
--- a/src/library/NamedEntityContextTagger.php
+++ b/src/library/NamedEntityContextTagger.php
@@ -96,6 +96,10 @@ class NamedEntityContextTagger extends ContextTagger
             $text_files = [$text_files];
         }
         echo "Reading files... \n";
+        if(!$term_callback && !empty($this->tokenizer)
+            && method_exists($this->tokenizer,"normalize")) {
+            $term_callback = [$this->tokenizer,"normalize"];
+        }
         // term_tag_sentences[sentence#] = [[words...], [tags...]]
         $term_tag_sentences = self::processTexts($text_files,
             $term_tag_separator, $term_callback, $tag_callback, true);
@@ -302,11 +306,13 @@ class NamedEntityContextTagger extends ContextTagger
      * Predicts named entities that exists in a sentence.
      * @param mixed $sentence is an array of segmented words/terms
      *  or a string that will be split on white space
+     * @param bool $normalize whether to return entities in normalized form,
+     *                e.g., 乾隆->干隆
      * @return array all predicted named entities together with a tag
      *  indicating kind of named entity
      *  ex. [["郑振铎","nr"],["国民党","nt"]]
      */
-    public function predict($sentence)
+    public function predict($sentence, $normalize=true)
     {
         if (empty($sentence)) {
             return [];
@@ -320,7 +326,14 @@ class NamedEntityContextTagger extends ContextTagger
             $this->loadWeights();
         }
         $found_entities = [];
-        foreach ($sentence_vector as $term) {
+        foreach ($sentence_vector as $unnormalized) {
+            if (!empty($this->tokenizer) &&
+                method_exists($this->tokenizer,
+                "normalize")) {
+                $term=$this->tokenizer::normalize($unnormalized);
+            } else {
+                $term=$unnormalized;
+            }
             $characters = preg_split('//u', $term, null,
                 PREG_SPLIT_NO_EMPTY);
             if (empty($characters)) {
@@ -358,6 +371,10 @@ class NamedEntityContextTagger extends ContextTagger
             $pre_tag = 'o';
             $current_entity = "";
             $entities = [];
+            if (!$normalize) {
+                $characters = preg_split('//u', $unnormalized, null,
+                    PREG_SPLIT_NO_EMPTY);
+            }
             for ($i = 0; $i < count($characters); $i++) {
                 if ($pre_tag != $tags[$i] && $pre_tag != "o") {
                     if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) {
@@ -366,11 +383,7 @@ class NamedEntityContextTagger extends ContextTagger
                     $current_entity = "";
                 }
                 if ($tags[$i] != "o") {
-                    if ($current_entity) {
-                        $current_entity .= $characters[$i] ?? "";
-                    } else {
-                        $current_entity = $characters[$i] ?? "";
-                    }
+                    $current_entity .= $characters[$i] ?? "";
                 }
                 $pre_tag = $tags[$i];
             }
diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index 98395e9df..80bca2dcc 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -211,6 +211,11 @@ class StochasticTermSegmenter
                     foreach ($words as $word) {
                         if ($word != "" && !$this->isException($word) &&
                             !$this->notCurrentLang($word)) {
+                            if (!empty($this->tokenizer) &&
+                              method_exists($this->tokenizer,
+                              "normalize")) {
+                                  $word=$this->tokenizer::normalize($word);
+                            }
                             if (!empty($dictionary[$word])) {
                                 $dictionary[$word]++;
                             } else if (mb_strlen($word) <
@@ -287,9 +292,11 @@ class StochasticTermSegmenter
     /**
      * Segments text into terms separated by space
      * @param string $text to be segmented
+     * @param bool $normalize whether to return the segmented terms in
+     *                normalized form, e.g., 乾隆->干隆
      * @return string segmented terms with space
      */
-    public function segmentText($text)
+    public function segmentText($text, $normalize=false)
     {
         $segmented_text = "";
         $lines = explode("\n", $text);
@@ -307,15 +314,17 @@ class StochasticTermSegmenter
      * Segments a single sentence into an array of words.
      * Must NOT contain any new line characters.
      * @param string $sentence is a string without newline to be segmented
+     * @param bool $normalize whether to return the segmented words in
+     *                normalized form, e.g., 乾隆->干隆
      * @return array of segmented words
      */
-    public function segmentSentence($sentence)
+    public function segmentSentence($sentence, $normalize=true)
     {
         $t = preg_split("/[\s ]+/u", trim($sentence));
         if(count($t) > 1) {
             $ret = [];
             foreach($t as $s) {
-                $segments = $this->segmentSentence($s);
+                $segments = $this->segmentSentence($s, $normalize);
                 if (is_array($segments)) {
                     $ret = array_merge($ret, $segments);
                 }
@@ -338,8 +347,13 @@ class StochasticTermSegmenter
         if ($cache_size == 0) {
             $cache_size = 1;
         }
-        preg_match_all('/./u', trim($sentence), $matches);
-        $characters = $matches[0];
+        $unnormalized = trim($sentence);
+        $normalized = (!empty($this->tokenizer) &&
+                       method_exists($this->tokenizer, "normalize")) ?
+                       $this->tokenizer::normalize($unnormalized)
+                       : $unnormalized;
+        $characters = preg_split('//u', $normalized, null,
+                      PREG_SPLIT_NO_EMPTY);
         if (!count($characters)) {
             return [];
         }
@@ -475,6 +489,10 @@ class StochasticTermSegmenter
         }
         $result = [];
         $t = 0;
+        if (!$normalize) {
+            $characters = preg_split('//u', $unnormalized,
+                          null, PREG_SPLIT_NO_EMPTY);
+        }
         foreach(array_reverse($tmp) as $next_node) {
             $result_word = "";
             while($t <= $next_node) {
diff --git a/src/locale/zh_CN/resources/zh2HansChar.txt.gz b/src/locale/zh_CN/resources/zh2HansChar.txt.gz
new file mode 100644
index 000000000..7936ab4c3
Binary files /dev/null and b/src/locale/zh_CN/resources/zh2HansChar.txt.gz differ
ViewGit