Added more features to the Chinese segmenter; moved part of the code from the StochasticTermSegmenter class to the zh_CN\Tokenizer class for readability; changed the default segmenter for Chinese from reverseMaximalMatch to StochasticTermSegmenter; r=chris

Forrest Sun [2019-12-07 05h]

Added more features to the Chinese segmenter; moved part of the code from the StochasticTermSegmenter class to the zh_CN\Tokenizer class for readability; changed the default segmenter for Chinese from reverseMaximalMatch to StochasticTermSegmenter; r=chris

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
src/library/StochasticTermSegmenter.php
src/locale/zh_CN/resources/Tokenizer.php

diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index bd2a9d4f5..ba6f6bdbc 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -29,6 +29,7 @@
 namespace seekquarry\yioop\library;

 use seekquarry\yioop\library\Trie;
+use seekquarry\yioop\locale\zh_CN\resources as ZH;
 use seekquarry\yioop\configs as C;
 /**
  * A Stochastic Finite-State Word-Segmenter.
@@ -92,53 +93,21 @@ class StochasticTermSegmenter
         {
             case "zh_CN":
                 $this->non_char_preg = "/^[^\p{Han}]+$/u";
-                $this->num_dict =
-                   "1234567890○零一二三四五六七八九十百千万亿".
-                   "０１２３４５６７８９壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億";
-                $this->dot = "\.．";
-                $this->num_end = "％%";
-                $this->punctuation =
-                "\x{3000}-\x{303F}\x{FF00}-\x{FF0F}\x{FF1A}-\x{FF20}" .
-                "\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}\x{FFE0}-\x{FFEE}" .
-                "\x{21}-\x{2F}\x{21}-\x{2F}\x{3A}-\x{40}\x{5B}-\x{60}";
-                /**
-                 * Check if the term passed in is a Cardinal Number
-                 */
-                $this->isCardinalNumber = function($term) {
-                    return preg_match("/^[" . $this->num_dict .
-                        $this->dot . "]+[" . $this->num_end .
-                        "]?[余餘]?[百千万亿佰仟萬億]?$/u", $term);
-                };
-                /*
-                 * Check if the term passed in is a Ordinal Number
-                 */
-                $this->isOrdinalNumber = function($term) {
-                    return preg_match("/^第[" . $this->num_dict .
-                    "]*$/u", $term);
-                };
-                /*
-                 * Check if the term passed in is a date
-                 */
-                $this->isDate = function($term) {
-                    return preg_match("/^[" . $this->num_dict .
-                    "]+(年|年代|月|日|时|小时|時|小時|" .
-                    "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term);
-                };
                 /*
                  * Check if the term passed in is an exception term
                  */
                 $this->isExceptionImpl = function($term) {
-                    return $this->isCardinalNumber($term)
-                    || $this->isOrdinalNumber($term)
-                    || $this->isDate($term);
+                    return ZH\Tokenizer::isCardinalNumber($term)
+                    || ZH\Tokenizer::isOrdinalNumber($term)
+                    || ZH\Tokenizer::isDate($term);
                 };
                 /*
                  * Check if the term passed in is a punctuation
                  */
                 $this->isPunctuation = function($term)
                 {
-                    return preg_match("/^[" . $this->punctuation .
-                        "]$/u", $term);
+                    return preg_match("/^[" . ZH\Tokenizer::$punctuation .
+                        "]+$/u", $term);
                 };
                 break;
             case "ja":
@@ -331,6 +300,15 @@ class StochasticTermSegmenter
      */
     public function segmentSentence($sentence)
     {
+        $t=explode(" ",$sentence);
+        if(count($t) > 1) {
+            $ret = [];
+            foreach($t as $s) {
+                $ret=array_merge($ret,$this->segmentSentence($s));
+            }
+            return $ret;
+        }
+
         if (!$this->dictionary_file) {
             $dic_file = C\LOCALE_DIR .
                 "/{$this->lang}/resources/term_weight.txt.gz";
@@ -353,10 +331,12 @@ class StochasticTermSegmenter
         $score[-1] = 0;
         for($index = 0; $index < count($characters); $index++) {
             //if not current language
-            if ($this->notCurrentLang($characters[$index]) ) {
+            if ($this->notCurrentLang($characters[$index])
+                && !$this->isPunctuation($characters[$index])) {
                 $current_char = $characters[$index];
                 for($j = $index + 1; $j < count($characters); $j++) {
-                    if ($this->notCurrentLang($current_char.$characters[$j])) {
+                    if ($this->notCurrentLang($current_char.$characters[$j])
+                        && !$this->isPunctuation($characters[$j])) {
                         $current_char .= $characters[$j];
                     } else {
                         break;
diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php
index bb46318c7..70c409a7c 100755
--- a/src/locale/zh_CN/resources/Tokenizer.php
+++ b/src/locale/zh_CN/resources/Tokenizer.php
@@ -29,6 +29,7 @@
 namespace seekquarry\yioop\locale\zh_CN\resources;

 use seekquarry\yioop\library\PhraseParser;
+use seekquarry\yioop\library\StochasticTermSegmenter;

 /**
  * Chinese specific tokenization code. Typically, tokenizer.php
@@ -56,6 +57,32 @@ class Tokenizer
         '与', '次', '狗', '决', '金', '史', '姆', '部', '正在', '活', '刚',
         '回家', '贝', '如何', '须', '战', '不會', '夫', '喂', '父', '亚', '肯定',
         '女孩', '世界'];
+    /**
+     * Characters that can make up a number in Chinese text
+     * @var string
+     */
+    public static $num_dict =
+       "1234567890○零一二三四五六七八九十百千万亿".
+       "０１２３４５６７８９壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億";
+    /**
+     * Dot characters (ASCII and full-width) allowed inside numbers
+     * @var string
+     */
+    public static $dot = "\.．";
+    /**
+     * Characters that can appear at the end of a number
+     * @var string
+     */
+    public static $num_end = "％%";
+    /**
+     * Character ranges treated as Chinese punctuation
+     * @var string
+     */
+    public static $punctuation =
+    "\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" .
+    "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
+    "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
+    "\x{3A}-\x{40}\x{5B}-\x{60}";
     /**
      * Removes the stop words from the page (used for Word Cloud generation
      * and language detection)
@@ -79,11 +106,46 @@ class Tokenizer
      * this is a bunch of words
      *
      * @param string $pre_segment  before segmentation
+     * @param string $method  segmenter to use: "RMM" or "STS" (the default)
      * @return string with words separated by space
      */
-    public static function segment($pre_segment)
+    public static function segment($pre_segment, $method="STS")
+    {
+        switch($method) {
+            case("RMM"):
+                return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN",
+                ['/\d+/', '/[a-zA-Z]+/']);
+                break;
+            case("STS"):
+                $segmenter = new StochasticTermSegmenter("zh_CN");
+                return $segmenter->segmentText($pre_segment,true);
+                break;
+        }
+    }
+    /**
+     * Check if the term passed in is a Cardinal Number
+     */
+    public static function isCardinalNumber($term)
+    {
+        return preg_match("/^[" . self::$num_dict .
+            self::$dot . "]+[" . self::$num_end .
+            "]?[余餘]?[百千万亿佰仟萬億]?$/u", $term);
+    }
+    /**
+     * Check if the term passed in is an Ordinal Number
+     */
+    public static function isOrdinalNumber($term)
+    {
+        return preg_match("/^第[" . self::$num_dict .
+        "]*$/u", $term);
+    }
+    /**
+     * Check if the term passed in is a date or time expression
+     */
+    public static function isDate($term)
     {
-        return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN",
-            ['/\d+/', '/[a-zA-Z]+/']);
+        return preg_match("/^[" . self::$num_dict .
+        "]+(年|年代|月|日|时|小时|時|小時|" .
+        "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term);
     }
 }

ViewGit