Adds rest of implementation of traditional simplified normalization of Chinese strings, a=chris

Chris Pollett [2020-07-22 00:Jul:nd]
Adds rest of implementation of traditional simplified normalization of Chinese strings, a=chris
Filename
src/configs/TokenTool.php
src/css/search.css
src/library/LocaleFunctions.php
src/library/NamedEntityContextTagger.php
src/library/StochasticTermSegmenter.php
src/locale/zh_CN/resources/Tokenizer.php
src/locale/zh_CN/resources/traditional_simplified.txt.gz
src/locale/zh_CN/resources/zh2HansChar.txt.gz
tests/ZhTokenizerTest.php
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index e579f3da2..28b2704ac 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -28,7 +28,7 @@
  * A description of its usage is given in the $usage global variable
  *
  * @author Ravi Dhillon  ravi.dhillon@yahoo.com, Chris Pollett (modified for n
- *     ngrams)
+ *     ngrams, added more functionality)
  * @license https://www.gnu.org/licenses/ GPL3
  * @link https://www.seekquarry.com/
  * @copyright 2009 - 2020
diff --git a/src/css/search.css b/src/css/search.css
index 5147f4ba0..3e8126063 100755
--- a/src/css/search.css
+++ b/src/css/search.css
@@ -763,7 +763,7 @@ body.mobile
 }
 #menu-options-background {
     background-color: black;
-    display:none;
+    display: none;
     min-height: 100%;
     left: 0px;
     opacity: 0.5;
diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php
index 5450dda77..37a32c6e3 100755
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@@ -129,6 +129,11 @@ function guessLocale()
  */
 function guessLocaleFromString($phrase_string, $locale_tag = null)
 {
+    static $cache;
+    $phrase_string = trim($phrase_string);
+    if (!empty($cache[$phrase_string])) {
+        return $cache[trim($phrase_string)];
+    }
     if (!$locale_tag) {
         $locale_tag = getLocaleTag();
     }
@@ -152,6 +157,10 @@ function guessLocaleFromString($phrase_string, $locale_tag = null)
             }
         }
     }
+    $cache[$phrase_string] = $locale_tag;
+    if (count($cache) > 100) {
+        array_shift($cache);
+    }
     return $locale_tag;
 }
 /**
diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php
index af0dd7265..d234b962c 100644
--- a/src/library/NamedEntityContextTagger.php
+++ b/src/library/NamedEntityContextTagger.php
@@ -97,8 +97,8 @@ class NamedEntityContextTagger extends ContextTagger
         }
         echo "Reading files... \n";
         if(!$term_callback && !empty($this->tokenizer)
-            && method_exists($this->tokenizer,"normalize")) {
-            $term_callback = [$this->tokenizer,"normalize"];
+            && method_exists($this->tokenizer, "normalize")) {
+            $term_callback = [$this->tokenizer, "normalize"];
         }
         // term_tag_sentences[sentence#] = [[words...], [tags...]]
         $term_tag_sentences = self::processTexts($text_files,
@@ -306,13 +306,11 @@ class NamedEntityContextTagger extends ContextTagger
      * Predicts named entities that exists in a sentence.
      * @param mixed $sentence is an array of segmented words/terms
      *  or a string that will be split on white space
-     * @param string $normalize return the normalized form
-     *                乾隆->干隆
      * @return array all predicted named entities together with a tag
      *  indicating kind of named entity
      *  ex. [["郑振铎","nr"],["国民党","nt"]]
      */
-    public function predict($sentence, $normalize=true)
+    public function predict($sentence)
     {
         if (empty($sentence)) {
             return [];
@@ -328,11 +326,13 @@ class NamedEntityContextTagger extends ContextTagger
         $found_entities = [];
         foreach ($sentence_vector as $unnormalized) {
             if (!empty($this->tokenizer) &&
-                method_exists($this->tokenizer,
-                "normalize")) {
-                $term=$this->tokenizer::normalize($unnormalized);
+                method_exists($this->tokenizer, "normalize")) {
+                /* Mainly used to map Chinese traditional to
+                   simplified character
+                 */
+                $term = $this->tokenizer::normalize($unnormalized);
             } else {
-                $term=$unnormalized;
+                $term = $unnormalized;
             }
             $characters = preg_split('//u', $term, null,
                 PREG_SPLIT_NO_EMPTY);
@@ -371,10 +371,6 @@ class NamedEntityContextTagger extends ContextTagger
             $pre_tag = 'o';
             $current_entity = "";
             $entities = [];
-            if (!$normalize) {
-                $characters = preg_split('//u', $unnormalized, null,
-                    PREG_SPLIT_NO_EMPTY);
-            }
             for ($i = 0; $i < count($characters); $i++) {
                 if ($pre_tag != $tags[$i] && $pre_tag != "o") {
                     if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) {
diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index 80bca2dcc..77464086a 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -209,11 +209,10 @@ class StochasticTermSegmenter
                     }
                     $words = preg_split("/[\s ]+/u", $line);
                     foreach ($words as $word) {
-                        if ($word != "" && !$this->isException($word) &&
+                        if (!empty($word) && !$this->isException($word) &&
                             !$this->notCurrentLang($word)) {
                             if (!empty($this->tokenizer) &&
-                              method_exists($this->tokenizer,
-                              "normalize")) {
+                              method_exists($this->tokenizer, "normalize")) {
                                   $word=$this->tokenizer::normalize($word);
                             }
                             if (!empty($dictionary[$word])) {
@@ -314,17 +313,15 @@ class StochasticTermSegmenter
      * Segments a single sentence into an array of words.
      * Must NOT contain any new line characters.
      * @param string $sentence is a string without newline to be segmented
-     * @param string $normalize return the normalized form
-     *                乾隆->干隆
      * @return array of segmented words
      */
-    public function segmentSentence($sentence, $normalize=true)
+    public function segmentSentence($sentence)
     {
         $t = preg_split("/[\s ]+/u", trim($sentence));
         if(count($t) > 1) {
             $ret = [];
             foreach($t as $s) {
-                $segments = $this->segmentSentence($s, $normalize);
+                $segments = $this->segmentSentence($s);
                 if (is_array($segments)) {
                     $ret = array_merge($ret, $segments);
                 }
@@ -349,11 +346,10 @@ class StochasticTermSegmenter
         }
         $unnormalized = trim($sentence);
         $normalized = (!empty($this->tokenizer) &&
-                       method_exists($this->tokenizer, "normalize")) ?
-                       $this->tokenizer::normalize($unnormalized)
-                       : $unnormalized;
+           method_exists($this->tokenizer, "normalize")) ?
+           $this->tokenizer::normalize($unnormalized) : $unnormalized;
         $characters = preg_split('//u', $normalized, null,
-                      PREG_SPLIT_NO_EMPTY);
+            PREG_SPLIT_NO_EMPTY);
         if (!count($characters)) {
             return [];
         }
@@ -489,10 +485,6 @@ class StochasticTermSegmenter
         }
         $result = [];
         $t = 0;
-        if (!$normalize) {
-            $characters = preg_split('//u', $unnormalized,
-                          null, PREG_SPLIT_NO_EMPTY);
-        }
         foreach(array_reverse($tmp) as $next_node) {
             $result_word = "";
             while($t <= $next_node) {
diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php
index 5bf9ef0a4..0daf63e11 100755
--- a/src/locale/zh_CN/resources/Tokenizer.php
+++ b/src/locale/zh_CN/resources/Tokenizer.php
@@ -99,7 +99,6 @@ class Tokenizer
     "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
     "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
     "\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u";
-
     /**
      * Any unique identifier corresponding to the component of a triplet which
      * can be answered using a question answer list
@@ -199,20 +198,28 @@ class Tokenizer
         "AS", "ETC", "DEC", "DEG", "DEV", "MSP",
         "DER", "SP", "IJ", "FW"];
     /**
-     * Stochastic Term Segmenter instance
+     * StochasticTermSegmenter instance used for segmenting Chinese text
      * @var object
      */
     private static $stochastic_term_segmenter;
     /**
-     * Named Entity tagger instance
+     * Named Entity tagger instance used to recognize named entities in
+     * Chinese text
      * @var object
      */
     private static $named_entity_tagger;
     /**
-     * PosTagger instance
+     * PartOfSpeechContextTagger instance used in adding part of speech
+     * annotations to Chinese text
      * @var object
      */
     private static $pos_tagger;
+    /**
+     * Holds an associative array with keys which are traditional characters
+     * and values their simplified character correspondents.
+     * @var array
+     */
+    private static $traditional_simplified_map;
     /**
      * Removes the stop words from the page (used for Word Cloud generation
      * and language detection)
@@ -225,8 +232,9 @@ class Tokenizer
     {
         static $pattern = "";
         if (empty($pattern)) {
-            $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/u';
+            $pattern = '/(' . implode('|', self::$stop_words) . ')/u';
         }
+        $data = self::normalize($data);
         $data = preg_replace($pattern, '', $data);
         return $data;
     }
@@ -308,7 +316,7 @@ class Tokenizer
         return preg_match(self::$non_char_preg, $term);
     }
     /**
-     * Get the segmenter instance
+     * Get the segmenter instance, instantiating it if necessary
      * @return StochasticTermSegmenter
      */
     public static function getStochasticTermSegmenter()
@@ -380,6 +388,31 @@ class Tokenizer
         }
         return self::$pos_tagger;
     }
+    /**
+     * Converts traditional Chinese characters to simplified characters
+     * @param string $text a string of Chinese characters
+     * @return string normalized form of the text
+     */
+    public static function normalize($text)
+    {
+        if (empty(self::$traditional_simplified_map)) {
+            $path = C\LOCALE_DIR .
+                "/zh_CN/resources/traditional_simplified.txt.gz";
+            if (!file_exists($path)) {
+                return $text;
+            }
+            self::$traditional_simplified_map =
+                unserialize(gzdecode(file_get_contents($path)));
+        }
+        $chars = preg_split('//u', $text, null, PREG_SPLIT_NO_EMPTY);
+        $num_chars = count($chars);
+        for($i = 0; $i < $num_chars; $i++) {
+            if (isset(self::$traditional_simplified_map[$chars[$i]])) {
+                $chars[$i] = self::$traditional_simplified_map[$chars[$i]];
+            }
+        }
+        return implode($chars);
+    }
     /**
      * Scans a word list for phrases. For phrases found generate
      * a list of question and answer pairs at two levels of granularity:
diff --git a/src/locale/zh_CN/resources/traditional_simplified.txt.gz b/src/locale/zh_CN/resources/traditional_simplified.txt.gz
new file mode 100644
index 000000000..01686164c
Binary files /dev/null and b/src/locale/zh_CN/resources/traditional_simplified.txt.gz differ
diff --git a/src/locale/zh_CN/resources/zh2HansChar.txt.gz b/src/locale/zh_CN/resources/zh2HansChar.txt.gz
deleted file mode 100644
index 7936ab4c3..000000000
Binary files a/src/locale/zh_CN/resources/zh2HansChar.txt.gz and /dev/null differ
diff --git a/tests/ZhTokenizerTest.php b/tests/ZhTokenizerTest.php
index aaf6e4cb6..52e347ff2 100644
--- a/tests/ZhTokenizerTest.php
+++ b/tests/ZhTokenizerTest.php
@@ -32,6 +32,7 @@ namespace seekquarry\yioop\tests;

 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\library as L;
+use seekquarry\yioop\locale\zh_CN\resources\Tokenizer;
 use seekquarry\yioop\library\UnitTest;

 /**
@@ -79,4 +80,14 @@ class ZhTokenizerTest extends UnitTest
         $this->assertEqual($output_tagging, $expected_tagging,
             "Parts of Speech Correctly Tagged in Chinese Source String");
     }
+    /**
+     * Traditional to Simplified mapping test
+     */
+    public function traditionalSimplifiedTestCase()
+    {
+        $traditional = "那是一個黑暗而暴風雨的夜晚。";
+        $simplified = "那是一个黑暗而暴风雨的夜晚。";
+        $this->assertEqual(Tokenizer::normalize($traditional), $simplified,
+            "Traditional characters correctly mapped to simplified ones");
+    }
 }
ViewGit