diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php index bd2a9d4f5..ba6f6bdbc 100644 --- a/src/library/StochasticTermSegmenter.php +++ b/src/library/StochasticTermSegmenter.php @@ -29,6 +29,7 @@ namespace seekquarry\yioop\library; use seekquarry\yioop\library\Trie; +use seekquarry\yioop\locale\zh_CN\resources as ZH; use seekquarry\yioop\configs as C; /** * A Stochastic Finite-State Word-Segmenter. @@ -92,53 +93,21 @@ class StochasticTermSegmenter { case "zh_CN": $this->non_char_preg = "/^[^\p{Han}]+$/u"; - $this->num_dict = - "1234567890○零一二三四五六七八九十百千万亿". - "0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億"; - $this->dot = "\.."; - $this->num_end = "%%"; - $this->punctuation = - "\x{3000}-\x{303F}\x{FF00}-\x{FF0F}\x{FF1A}-\x{FF20}" . - "\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}\x{FFE0}-\x{FFEE}" . - "\x{21}-\x{2F}\x{21}-\x{2F}\x{3A}-\x{40}\x{5B}-\x{60}"; - /** - * Check if the term passed in is a Cardinal Number - */ - $this->isCardinalNumber = function($term) { - return preg_match("/^[" . $this->num_dict . - $this->dot . "]+[" . $this->num_end . - "]?[余餘]?[百千万亿佰仟萬億]?$/u", $term); - }; - /* - * Check if the term passed in is a Ordinal Number - */ - $this->isOrdinalNumber = function($term) { - return preg_match("/^第[" . $this->num_dict . - "]*$/u", $term); - }; - /* - * Check if the term passed in is a date - */ - $this->isDate = function($term) { - return preg_match("/^[" . $this->num_dict . - "]+(年|年代|月|日|时|小时|時|小時|" . - "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term); - }; /* * Check if the term passed in is an exception term */ $this->isExceptionImpl = function($term) { - return $this->isCardinalNumber($term) - || $this->isOrdinalNumber($term) - || $this->isDate($term); + return ZH\Tokenizer::isCardinalNumber($term) + || ZH\Tokenizer::isOrdinalNumber($term) + || ZH\Tokenizer::isDate($term); }; /* * Check if the term passed in is a punctuation */ $this->isPunctuation = function($term) { - return preg_match("/^[" . $this->punctuation . 
- "]$/u", $term); + return preg_match("/^[" . ZH\Tokenizer::$punctuation . + "]+$/u", $term); }; break; case "ja": @@ -331,6 +300,15 @@ class StochasticTermSegmenter */ public function segmentSentence($sentence) { + $t=explode(" ",$sentence); + if(count($t) > 1) { + $ret = []; + foreach($t as $s) { + $ret=array_merge($ret,$this->segmentSentence($s)); + } + return $ret; + } + if (!$this->dictionary_file) { $dic_file = C\LOCALE_DIR . "/{$this->lang}/resources/term_weight.txt.gz"; @@ -353,10 +331,12 @@ class StochasticTermSegmenter $score[-1] = 0; for($index = 0; $index < count($characters); $index++) { //if not current language - if ($this->notCurrentLang($characters[$index]) ) { + if ($this->notCurrentLang($characters[$index]) + && !$this->isPunctuation($characters[$index])) { $current_char = $characters[$index]; for($j = $index + 1; $j < count($characters); $j++) { - if ($this->notCurrentLang($current_char.$characters[$j])) { + if ($this->notCurrentLang($current_char.$characters[$j]) + && !$this->isPunctuation($characters[$j])) { $current_char .= $characters[$j]; } else { break; diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php index bb46318c7..70c409a7c 100755 --- a/src/locale/zh_CN/resources/Tokenizer.php +++ b/src/locale/zh_CN/resources/Tokenizer.php @@ -29,6 +29,7 @@ namespace seekquarry\yioop\locale\zh_CN\resources; use seekquarry\yioop\library\PhraseParser; +use seekquarry\yioop\library\StochasticTermSegmenter; /** * Chinese specific tokenization code. Typically, tokenizer.php @@ -56,6 +57,32 @@ class Tokenizer '与', '次', '狗', '决', '金', '史', '姆', '部', '正在', '活', '刚', '回家', '贝', '如何', '须', '战', '不會', '夫', '喂', '父', '亚', '肯定', '女孩', '世界']; + /** + * Characters usable in Chinese numbers, spliced into regex character classes. NOTE(review): both digit runs are ASCII here; presumably one was meant to be the full-width digits - confirm + * @var string + */ + public static $num_dict = + "1234567890○零一二三四五六七八九十百千万亿". 
+ "0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億"; + /** + * Dots used in Chinese numbers, spliced into a regex character class. NOTE(review): both dots are the same ASCII character here; presumably one was meant to be the full-width dot - confirm + * @var string + */ + public static $dot = "\.."; + /** + * Characters that can be used at the end of numbers. NOTE(review): the same percent sign appears twice; presumably one was meant to be the full-width form - confirm + * @var string + */ + public static $num_end = "%%"; + /** + * Character ranges treated as Chinese punctuation, spliced into a regex character class. NOTE(review): the range 21-2F is listed twice - confirm whether another range was intended + * @var string + */ + public static $punctuation = + "\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" . + "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" . + "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" . + "\x{3A}-\x{40}\x{5B}-\x{60}"; /** * Removes the stop words from the page (used for Word Cloud generation * and language detection) @@ -79,11 +106,46 @@ class Tokenizer * this is a bunch of words * * @param string $pre_segment before segmentation + * @param string $method segmentation method: RMM uses PhraseParser::reverseMaximalMatch, STS (the default) uses StochasticTermSegmenter; any other value matches no case and nothing is returned * @return string with words separated by space */ - public static function segment($pre_segment) + public static function segment($pre_segment, $method="STS") + { + switch($method) { + case("RMM"): + return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN", + ['/\d+/', '/[a-zA-Z]+/']); + break; + case("STS"): + $segmenter = new StochasticTermSegmenter("zh_CN"); + return $segmenter->segmentText($pre_segment,true); + break; + } + } + /** + * Check if the term passed in is a Cardinal Number (returns the preg_match result) + */ + public static function isCardinalNumber($term) + { + return preg_match("/^[" . self::$num_dict . + self::$dot . "]+[" . self::$num_end . + "]?[余餘]?[百千万亿佰仟萬億]?$/u", $term); + } + /** + * Check if the term passed in is an Ordinal Number (returns the preg_match result) + */ + public static function isOrdinalNumber($term) + { + return preg_match("/^第[" . self::$num_dict . + "]*$/u", $term); + } + /** + * Check if the term passed in is a date (returns the preg_match result) + */ + public static function isDate($term) { - return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN", - ['/\d+/', '/[a-zA-Z]+/']); + return preg_match("/^[" . self::$num_dict . + "]+(年|年代|月|日|时|小时|時|小時|" . + "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term); } }