diff --git a/src/library/ContextWeightedNamedEntityRecognizer.php b/src/library/ContextWeightedNamedEntityRecognizer.php
new file mode 100644
index 000000000..0790fdabc
--- /dev/null
+++ b/src/library/ContextWeightedNamedEntityRecognizer.php
@@ -0,0 +1,608 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * @author Xianghong Sun sxh19911230@gmail.com
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2019
+ * @filesource
+ */
+
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\locale\zh_CN\resources as ZH;
+
+/**
+ * Machine learning based NER tagger. Typically, ContextWeightedNERTagger.php
+ * can train the language with some dataset and predict
+ * the tag given a list of word.
+ *
+ * @author Xianghong Sun
+ */
+
+class ContextWeightedNamedEntityRecognizer
+{
+ /**
+ * Current Language, only tested on Simplified Chinese
+     * Might be extensible for other languages in the future
+ * @var string
+ */
+ public $lang;
+ /**
+ * The word weight feature
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $word_feature;
+ /**
+ * The tag weight feature
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $tag_feature;
+ /**
+ * The bias
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $bias;
+ /**
+     * All possible tags
+     * Generated by the training method
+ * @var associative array [tag => tag index]
+ */
+ private $tag_set;
+ /**
+     * The constructor of the named entity recognizer.
+     * To extend to other languages, some work is needed:
+     * Define $this->getKeyImpl, $this->rule_defined_key
+     * See Chinese example.
+     * @param string $lang describes the current language,
+     *     e.g., "zh_CN" for Simplified Chinese
+ */
+    public function __construct($lang)
+    {
+        switch($lang) {
+            case("zh_CN"):
+            case("zh-CN"):
+                $this->lang = "zh_CN";
+                break;
+            default:
+                $this->lang = $lang;
+        }
+    }
+
+    /**
+     * A function that processes the training data
+     * @param mixed $text_files can be a file or an array of file names
+     * @return array of separated sentences; each sentence has the format
+     * [[words...],[tags...]] (terms/tags run through the optional callbacks)
+     * Data format MSRA:
+     * 我们/o 是/o 受到/o 郑振铎/nr 先生/o 、/o 阿英/nr 先生/o 著作/o 的/o
+     * 启示/o ,/o 从/o 个人/o 条件/o 出发/o ,/o 瞄准/o 现代/o 出版/o 史/o
+     * 研究/o 的/o 空白/o ,/o 重点/o 集/o 藏/o 解放区/o 、/o 国民党/nt 毁/o
+     * 禁/o 出版物/o 。/o
+     * To adapt to another language, some modifications are needed
+     */
+    public static function processTexts($text_files, $term_tag_splier="/",
+        $term_process = null, $tag_process = null)
+    {
+        $ret = [];
+        foreach ($text_files as $text_file) {
+            if (file_exists($text_file)) {
+                $fn = fopen($text_file, "r");
+                while (!feof($fn)) {
+                    $line = fgets($fn);
+                    if (strpos($line, '<') !== false) {
+                        continue;
+                    }
+                    $word_tag_pairs = preg_split("/[\s ]+/u", $line);
+                    if (!count($word_tag_pairs)) {
+                        continue;
+                    }
+                    $ret[] = [];
+                    $ret[count($ret) - 1][0] = [];
+                    $ret[count($ret) - 1][1] = [];
+                    foreach ($word_tag_pairs as $word_tag_pair) {
+                        $t = explode("/", $word_tag_pair);
+                        // NER is character based: tag each char of the term
+                        if (count($t) == 2) {
+                            $tag = $tag_process ? $tag_process($t[1]) : $t[1];
+                            foreach (preg_split('//u', $t[0], -1,
+                                PREG_SPLIT_NO_EMPTY) as $ch) {
+                                $ret[count($ret) - 1][0][] =
+                                    $term_process ? $term_process($ch) : $ch;
+                                $ret[count($ret) - 1][1][] = $tag;
+                            }
+                        }
+                    }
+                }
+                fclose($fn);
+            }
+        }
+        return $ret;
+    }
+
+ /**
+ * Function to train a data
+ * Notice: This function might run very long time, depending on training set
+ * @param @mixed $text_files are training data
+ * can be a file or an array of file names
+ * @param @float $learning_rate
+ * @param @int $max_epoch 1200 might be a good one,
+ the weight will overfit if it's greater than this number
+     * @param function $term_process is a preprocess run on each term
+     * @param function $tag_process is a preprocess run on each tag
+ */
+    public function train($text_files,
+        $learning_rate=0.1, $max_epoch = 1200,
+        $term_process = null, $tag_process = null)
+    {
+        if (is_string($text_files)) {
+            $text_files = [$text_files];
+        }
+        echo "Reading files\n";
+        // term_tag_sentences[sentence#]=[[words...],[tags...]]
+        $term_tag_sentences = self::processTexts($text_files, "/",
+            $term_process, $tag_process);
+ $this->word_feature=[];
+ $this->tag_set=[];
+ $tag_index = 0;
+ for ($i = -4; $i <= -1; $i++) {
+ $this->word_feature[$i] = [];
+ }
+ foreach ($term_tag_sentences as $term_tag_pairs) {
+ $terms=$term_tag_pairs[0];
+ $tags=$term_tag_pairs[1];
+ $this->tag_feature["start"]=[];
+ $this->tag_feature["start-start"]=[];
+ for ($i = 0; $i < count($terms); $i++) {
+ if (!isset($this->tag_set[$tags[$i]])) {
+ $this->tag_set[$tags[$i]] = $tag_index++;
+ }
+ if ($i == 0) {}
+ else if ($i == 1) {
+ if (!isset($this->tag_feature["start-".$tags[$i-1]])) {
+ $this->tag_feature["start-".$tags[$i-1]]=[];
+ }
+ if (!isset($this->tag_feature[$tags[$i-1]])) {
+ $this->tag_feature[$tags[$i-1]]=[];
+ }
+ } else {
+ if (!isset($this->tag_feature[$tags[$i-2]."-".$tags[$i-1]])) {
+ $this->tag_feature[$tags[$i-2]."-".$tags[$i-1]]=[];
+ }
+ if (!isset($this->tag_feature[$tags[$i-1]])) {
+ $this->tag_feature[$tags[$i-1]]=[];
+ }
+ }
+
+ if (!isset($this->word_feature[$terms[$i]])) {
+ $this->word_feature[$terms[$i]] = [];
+ }
+ }
+ }
+ foreach (array_keys($this->word_feature) as $key) {
+ for ($i=-2; $i<=2;$i++) {
+ if (!isset($this->word_feature[$key][$i])) {
+ $this->word_feature[$key][$i] = [];
+ }
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->word_feature[$key][$i][$tag_index])) {
+ $this->word_feature[$key][$i][$tag_index] = 0;
+ }
+ }
+ }
+ }
+ foreach (array_keys($this->tag_feature) as $key) {
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->tag_feature[$key][$tag_index])) {
+ $this->tag_feature[$key][$tag_index] = 0;
+ }
+ }
+ }
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->bias[$tag_index])) {
+ $this->bias[$tag_index] = 0;
+ }
+ }
+ echo "Training...\n";
+ //train the weight
+ $cross_entropy_loss = 1;
+ $pre_cross_entropy_loss = 2;
+ for ($epoch = 0; ($epoch < $max_epoch) &&
+ $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++) {
+ $this->min_w=0;
+ $this->max_w=0;
+ $time = time();
+ $dy_dw = [];
+ $dy_dw_n = [];
+ $pre_cross_entropy_loss = $cross_entropy_loss;
+ $cross_entropy_loss = 0;
+ $cross_entropy_loss_n = 0;
+
+ $dy_db=[];
+ $dy_db_n=[];
+
+ $dy_dt=[];
+ $dy_dt_n=[];
+ for($i = 0; $i < count($this->tag_set); $i++) {
+ $dy_db[$i] = 0;
+ $dy_db_n[$i] = 0;
+ }
+ //for each sentence
+ foreach ($term_tag_sentences as $term_tag_pairs) {
+ $terms=$term_tag_pairs[0];
+ $tags=$term_tag_pairs[1];
+ for ($i = 0; $i < count($terms); $i++) {
+ $k=[];
+ for ($j=-2; $j<=2;$j++) {
+ $k[$j]= $this->getIndex($i+$j,$terms);
+ }
+ foreach ($this->tag_set as $possiable_tag => $tag_index) {
+ $equality = $possiable_tag == $tags[$i] ? 1 : 0;
+ $sum=0;
+ //5 words including itself
+ for ($j=-2; $j<=2;$j++) {
+ $sum += $this->word_feature[$k[$j]][$j][$tag_index];
+ }
+ //previous 2 tags
+ if ($i == 0) {
+ $tf1="start";
+ $tf2="start-start";
+ } else if ($i == 1) {
+ $tf1=$tags[$i-1];
+ $tf2="start-".$tags[$i-1];
+ } else {
+ $tf1=$tags[$i-1];
+ $tf2=$tags[$i-2]."-".$tags[$i-1];
+ }
+ $sum += $this->tag_feature[$tf1][$tag_index];
+ $sum += $this->tag_feature[$tf2][$tag_index];
+ //bias
+ $sum += $this->bias[$tag_index];
+ $sigmoid = 1 / (1 + exp(-1 * $sum));
+ for ($j=-2; $j<=2;$j++) {
+ if (!isset($dy_dw[$k[$j]])) {
+ $dy_dw[$k[$j]] = [];
+ $dy_dw_n[$k[$j]] = [];
+ }
+ if (!isset($dy_dw[$k[$j]][$j])) {
+ $dy_dw[$k[$j]][$j] = [];
+ $dy_dw_n[$k[$j]][$j] = [];
+ }
+ if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
+ $dy_dw[$k[$j]][$j][$tag_index] = 0;
+ $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
+ }
+
+ $dy_dw[$k[$j]][$j][$tag_index] +=
+ ($sigmoid - $equality);
+ $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
+
+ }
+ //dy_dt
+ if (!isset($dy_dt[$tf1])) {
+ $dy_dt[$tf1] = [];
+ $dy_dt_n[$tf1] = [];
+ }
+ if (!isset($dy_dt[$tf1][$tag_index])) {
+ $dy_dt[$tf1][$tag_index] = 0;
+ $dy_dt_n[$tf1][$tag_index] = 0;
+ }
+ if (!isset($dy_dt[$tf2])) {
+ $dy_dt[$tf2] = [];
+ $dy_dt_n[$tf2] = [];
+ }
+ if (!isset($dy_dt[$tf2][$tag_index])) {
+ $dy_dt[$tf2][$tag_index] = 0;
+ $dy_dt_n[$tf2][$tag_index] = 0;
+ }
+ $dy_dt[$tf1][$tag_index] += ($sigmoid - $equality);
+ $dy_dt_n[$tf1][$tag_index] += 1;
+ $dy_dt[$tf2][$tag_index] += ($sigmoid - $equality);
+ $dy_dt_n[$tf2][$tag_index] += 1;
+ //dy_db
+ $dy_db[$tag_index] += ($sigmoid - $equality);
+ $dy_db_n[$tag_index] += 1;
+ $cross_entropy_loss+=
+ - $equality*log($sigmoid)
+ - (1-$equality)*log(1-$sigmoid);
+ $cross_entropy_loss_n++;
+ }
+ }
+ }
+ $cross_entropy_loss /= $cross_entropy_loss_n;
+ $duration = time() - $time;
+ echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}".
+ " Takes {$duration} seconds\n";
+ foreach ($dy_dw as $i =>$v1) {
+ foreach ($v1 as $j =>$v2) {
+ foreach ($v2 as $k =>$v3) {
+ $this->word_feature[$i][$j][$k] -=
+ $dy_dw[$i][$j][$k] /
+ $dy_dw_n[$i][$j][$k] *
+ $learning_rate;
+ if ($this->word_feature[$i][$j][$k] < $this->min_w) {
+ $this->min_w = $this->word_feature[$i][$j][$k];
+ }
+ if ($this->word_feature[$i][$j][$k] > $this->max_w) {
+ $this->max_w = $this->word_feature[$i][$j][$k];
+ }
+ }
+ }
+ }
+ foreach ($dy_dt as $i => $v1) {
+ foreach ($v1 as $j => $v2) {
+ $this->tag_feature[$i][$j] -=
+ $dy_dt[$i][$j] /
+ $dy_dt_n[$i][$j] *
+ $learning_rate;
+ }
+ }
+ foreach ($dy_db as $k => $v) {
+ $this->bias[$k]-=
+ $dy_db[$k] /
+ $dy_db_n[$k] *
+ $learning_rate;
+ }
+ if ($epoch % 10 == 9 ) {
+ $this->save_weight();
+ }
+ }
+ $this->save_weight();
+ return true;
+ }
+ /**
+     * The primary function to predict the tags
+     * @param mixed $sentence is an array of segmented words/terms
+     *      or a string which needs to be split by $splitter
+ * @param function $splitter to process $sentence if $sentence
+ * is a string
+ * @return @array all predicted named entities with its tag
+ * ex. [["郑振铎","nr"],["国民党","nt"]]
+ */
+ public function predict($sentence, $delimiter="",$splitter=null)
+ {
+ if (!is_array($sentence)) {
+ if ($sentence == "") {
+ $terms=[];
+ } else {
+ $terms=preg_split("/[\s]+/",$sentence);
+ }
+ } else {
+ $terms=$sentence;
+ }
+ if (!count($terms)) {
+ return [];
+ }
+ if (!$this->word_feature) {
+ $this->load_weight();
+ }
+ $result = [];
+ for($i = 0; $i < count($terms); $i++) {
+ $term = $terms[$i];
+ $score =[];
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ $score[$possiable_tag]=0;
+ for ($j=-2; $j <=2; $j++) {
+ $k=$this->getIndex($i+$j, $terms);
+ if (isset($this->word_feature[$k])) {
+ $score[$possiable_tag] +=
+ $this->getW($k,$j,$tag_index);
+ }
+ }
+ if ($i == 0) {
+ $tf1="start";
+ $tf2="start-start";
+ } else if ($i == 1) {
+ $tf1=$result[$i-1];
+ $tf2="start-".$result[$i-1];
+ } else {
+ $tf1=$result[$i-1];
+ $tf2=$result[$i-2]."-".$result[$i-1];
+ }
+ $score[$possiable_tag] += $this->getT($tf1,$tag_index);
+ $score[$possiable_tag] += $this->getT($tf2,$tag_index);
+ $score[$possiable_tag] += $this->getB($tag_index);
+ }
+ $result[]=array_keys($score, max($score))[0];
+ }
+ $pre_tag='o';
+ $current_entity=null;
+ $ret=[];
+ for ($i = 0; $i < count($terms); $i++) {
+ if ($pre_tag != $result[$i] && $pre_tag != "o") {
+ if (mb_strlen($current_entity) < 10) {
+ $ret[]=[$current_entity,$pre_tag];
+ }
+ $current_entity=null;
+ }
+ if ($result[$i] != "o") {
+ if ($current_entity) {
+ $current_entity.=$delimiter.$terms[$i];
+ } else {
+ $current_entity=$terms[$i];
+ }
+ }
+ $pre_tag=$result[$i];
+ }
+ return $ret;
+ }
+
+ /**
+ * A list of private helper functions
+     * Given a sentence ($terms), find the key at position $index
+ */
+ private function getIndex($index, $terms)
+ {
+ if ($index < 0) $k = $index - 2;
+ else if ($index >= count($terms)) {
+ $k = $index - count($terms) - 2;
+ }
+ else {
+ $k = $terms[$index];
+ }
+ return $k;
+ }
+
+ /**
+ * save the trained weight to disk
+ */
+ private function save_weight()
+ {
+ $out_file = C\LOCALE_DIR . "/{$this->lang}/resources/ner_weight.txt.gz";
+ $out = [];
+ $out["min_w"] = $this->min_w;
+ $out["max_w"] = $this->max_w;
+ $out["w"]=[];
+ foreach(array_keys($this->word_feature) as $key) {
+ $out["w"][$key] = $this->pack_w($key);
+ }
+ foreach(array_keys($this->tag_feature) as $key) {
+ $out["t"][$key] = $this->pack_t($key);
+ }
+ $out["b"] = $this->pack_b();
+ $out["tag_set"] = $this->tag_set;
+ echo "Saving...";
+ file_put_contents($out_file,
+ gzencode(serialize($out),9));
+ echo " ok\n";
+ }
+ /**
+ * load the trained weight from disk
+ */
+ private function load_weight($trainning_load=false)
+ {
+ $dic_file
+ = C\LOCALE_DIR . "/{$this->lang}/resources/ner_weight.txt.gz";
+ if (!file_exists($dic_file)) {
+ echo "$dic_file does not exist!";
+ exit();
+ }
+ $f = unserialize(gzdecode(file_get_contents($dic_file))
+ ,['allowed_classes' => false]);
+ $this->word_feature=$f["w"];
+ $this->tag_feature=$f["t"];
+ $this->bias=$f["b"];
+ $this->min_w=$f["min_w"];
+ $this->max_w=$f["max_w"];
+ $this->tag_set=$f["tag_set"];
+ if ($trainning_load) {
+ foreach(array_keys($this->word_feature) as $key) {
+ $this->word_feature[$key] = $this->unpack_w($key);
+ }
+ foreach(array_keys($this->tag_feature) as $key) {
+ $this->tag_feature[$key] = $this->unpack_t($key);
+ }
+ $this->bias = $this->unpack_b();
+ }
+ }
+ /**
+ * Pack the bias
+ */
+ private function pack_b()
+ {
+ return pack("f*", ...$this->bias);
+ }
+ /**
+ * Unpack the bias
+ */
+ private function unpack_b()
+ {
+ return array_merge(unpack("f".strval(count($this->tag_set)),$this->bias));
+ }
+ /**
+ * Pack the tag_feature
+ */
+ private function pack_t($key)
+ {
+ return pack("f*", ...$this->tag_feature[$key]);
+ }
+ /**
+ * Unpack the tag_feature
+ */
+ private function unpack_t($key)
+ {
+ return array_merge(unpack("f".strval(count($this->tag_set)),$this->tag_feature[$key]));
+ }
+ /**
+ * Pack the word_feature
+ */
+ private function pack_w($key)
+ {
+ $bin_str = "";
+ foreach($this->word_feature[$key] as $i => $t) {
+ foreach($t as $u) {
+ $v = 65535 * ($u-$this->min_w) / ($this->max_w-$this->min_w);
+ $bin_str .= pack("S", intval($v));
+ }
+ }
+ return $bin_str;
+ }
+ /**
+ * Unpack the word_feature
+ */
+ private function unpack_w($key)
+ {
+ $tmp = [];
+ $size = count($this->tag_set);
+ for ($i = 0; $i < 5; $i++) {
+ $tmp[$i-2] = array_merge(unpack("S".strval($size),
+ $this->word_feature[$key], 2*$i*count($this->tag_set)));
+ for($j = 0; $j < $size; $j++) {
+ $tmp[$i-2][$j] = $tmp[$i-2][$j] / 65535
+ * ($this->max_w-$this->min_w) + $this->min_w;
+ }
+ }
+ return $tmp;
+ }
+ /**
+ * Get the bias value for tag
+ */
+ private function getB($tag_index)
+ {
+ return unpack("f",$this->bias,$tag_index*4)[1];
+ }
+ /**
+     * Get the tag_feature value for the given key and tag
+ */
+ private function getT($key, $tag_index)
+ {
+ return unpack("f",$this->tag_feature[$key],$tag_index*4)[1];
+ }
+ /**
+     * Get the weight value for term at position for tag
+ */
+ private function getW($term, $position, $tag_index)
+ {
+ $t = unpack("S",$this->word_feature[$term],
+ 2*($position+2)*count($this->tag_set)+$tag_index*2)[1]
+ / 65535
+ * ($this->max_w-$this->min_w) + $this->min_w;;
+ return $t;
+ }
+}
diff --git a/src/library/ContextWeightedPosTagger.php b/src/library/ContextWeightedPosTagger.php
new file mode 100644
index 000000000..140097b95
--- /dev/null
+++ b/src/library/ContextWeightedPosTagger.php
@@ -0,0 +1,599 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * @author Xianghong Sun sxh19911230@gmail.com
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2019
+ * @filesource
+ */
+
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\locale\zh_CN\resources as ZH;
+
+/**
+ * Machine learning based POS tagger. Typically, ContextWeightedPosTagger.php
+ * can train the language with some dataset and predict
+ * the tag given a list of word.
+ *
+ * Instruction to add a new language:
+ * Add a switch case in the constructor.
+ * Define the following functions:
+ * getKeyImpl
+ * See the class function 'getKey' for more information
+ *
+ * @author Xianghong Sun
+ */
+
+class ContextWeightedPosTagger {
+ /**
+ * Current Language, only tested on Simplified Chinese
+     * Might be extensible for other languages in the future
+ * @var string
+ */
+ public $lang;
+ /**
+ * The weight for predicting the pos tag
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $w;
+ /**
+ * The bias for predicting the pos tag
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $b;
+ /**
+ * range of w
+ */
+ private $min_w;
+ private $max_w;
+
+ /**
+     * All possible tags
+     * Generated by the training method
+ * @var associative array [tag => tag index]
+ */
+ private $tag_set;
+ /**
+ * Check if all the chars in the term is not current language
+ * @param $term is a string that to be checked
+ * @return true if all the chars in $term is not current language
+ * false otherwise
+ */
+ public function notCurrentLang($term)
+ {
+ return preg_match("/^[^\p{Han}]+$/u", $term);
+ }
+ /**
+     * The constructor of the pos tagger
+     * To extend to other languages, some work is needed:
+     * Define $this->getKeyImpl, $this->rule_defined_key
+     * See Chinese example.
+     * @param string $lang describes the current language
+     * @param bool $packed describes how weight and bias are stored
+ */
+    public function __construct($lang, $packed = true)
+    {
+        //$this->packed = $packed;
+        switch($lang) {
+            case("zh_CN"):
+            case("zh-CN"):
+                $this->lang = "zh_CN";
+                /*
+                 * Some tags are determined by rules instead of weights,
+                 * e.g., there are infinitely many Arabic numerals.
+                 */
+                $this->getKeyImpl = function($term) {
+                    $key = ZH\Tokenizer::POSGetKey($term);
+                    return $key ? $this->tag_set[$key] : $term;
+                };
+                // tags producible by the rules above
+                $this->rule_defined_key = ['PU','CD','OD','NT','FW'];
+                break;
+            default:
+                $this->lang = $lang;
+        }
+    }
+ /**
+ * __call for calling dynamic methods
+ * @param string $method method of this class to call
+ * @param array $args arguments to pass to method
+ * @return mixed result of method calculation
+ */
+ public function __call($method, $args)
+ {
+ return call_user_func_array($this->$method, $args);
+ }
+ /**
+ * __get for getting dynamic variables
+ * @param string $var_name variable to retrieve
+ * @return mixed result of retrieval
+ */
+ public function __get($var_name)
+ {
+ return $this->$var_name;
+ }
+ /**
+ * __set for assigning dynamic variables
+ * @param string $var_name variable to assign
+ * @param mixed $value value to assign to it
+ */
+ public function __set($var_name, $value)
+ {
+ $this->$var_name = $value;
+ }
+
+ /**
+ * check if the term can be determined by algorithm,
+     * usually by regular expression, because there are infinitely
+ * amount of them.
+ * ex. 13th is an ordinal number, 123 is a cardinal number
+ * then use the determined tag to be the weight key
+ * @param @string $term is the term to be checked
+ * @return right key in feature matrix
+ */
+ public function getKey($term)
+ {
+ if (isset($this->getKeyImpl)) {
+ return $this->getKeyImpl($term);
+ }
+ return $term;
+ }
+
+ /**
+     * A function that processes the training data
+ * @param @mixed $text_files can be a file or an array of file names
+     * @return array of separated sentences; each sentence has the format of
+ * [[words...],[tags...]]
+ * Currently, the trainning data needs to fit CTB format:
+ * term followed by a underscore and followed by the tag
+ * e.g. "新_VA 的_DEC 南斯拉夫_NR 会国_NN"
+ * To adapt to other language, some modifications are needed
+ */
+ public static function processTexts($text_files, $term_tag_splier="_",
+ $term_process = null, $tag_process = null)
+ {
+ $ret=[];
+ foreach($text_files as $text_file) {
+ if (file_exists($text_file)) {
+ $fn = fopen($text_file,"r");
+ while(! feof($fn)) {
+ $line = fgets($fn);
+ if(strpos($line, '<') !== false) {
+ continue;
+ }
+ $word_tag_pairs = preg_split("/[\s ]+/u", $line);
+ if (!count($word_tag_pairs)) {
+ continue;
+ }
+ $ret[]=[];
+ $ret[count($ret)-1][0]=[];
+ $ret[count($ret)-1][1]=[];
+ foreach ($word_tag_pairs as $word_tag_pair) {
+ $t = explode($term_tag_splier, $word_tag_pair);
+
+ if (count($t) == 2) {
+ $ret[count($ret)-1][0][] =
+ $term_process ? $term_process($t[0]) : $t[0];
+ $ret[count($ret)-1][1][] =
+ $tag_process ? $tag_process($t[1]) : $t[1];
+ }
+ }
+ }
+ fclose($fn);
+ }
+ }
+ return $ret;
+ }
+
+ /**
+ * Function to train a data
+ * Notice: This function might run very long time, depending on training set
+ * @param @mixed $text_files are training data
+ * can be a file or an array of file names
+ * @param @float $learning_rate
+ * @param @int $max_epoch 1200 might be a good one,
+ the weight will overfit if it's greater than this number
+     * @param bool $resume if true, read the weight file and continue training
+ if false, start from beginning
+ */
+ public function train($text_files, $term_tag_splier="_",
+ $learning_rate=0.1, $max_epoch = 1200,
+ $term_process = null, $tag_process = null,
+ $resume=false)
+ {
+ if (is_string($text_files)) {
+ $text_files = [$text_files];
+ }
+ echo "Reading files\n";
+ // term_tag_sentences[sentence#]=[[words...],[tags...]]
+ $term_tag_sentences = self::processTexts($text_files, $term_tag_splier,
+ $term_process, $tag_process);
+ if ($resume) {
+ echo "Loading weights... ";
+ $this->load_weight(true);
+ $tag_index = count($this->tag_set);
+ echo "ok\n";
+ } else {
+ $this->w=[];
+ $this->tag_set=[];
+ $tag_index = 0;
+ if (isset($this->rule_defined_key)) {
+ foreach($this->rule_defined_key as $k) {
+ $this->tag_set[$k] = $tag_index++;
+ }
+ }
+ for ($i = -4; $i <= -1; $i++) {
+ $this->w[$i] = [];
+ }
+ }
+ foreach ($term_tag_sentences as $term_tag_pairs) {
+ $terms=$term_tag_pairs[0];
+ $tags=$term_tag_pairs[1];
+ for ($i = 0; $i < count($terms); $i++) {
+ if (!isset($this->tag_set[$tags[$i]])) {
+ $this->tag_set[$tags[$i]] = $tag_index++;
+ }
+ $k = $this->getIndex($i,$terms);
+ if (!isset($this->w[$k])) {
+ $this->w[$k] = [];
+ }
+ }
+ }
+ foreach (array_keys($this->w) as $key) {
+ for ($i=-2; $i<=2;$i++) {
+ if (!isset($this->w[$key][$i])) {
+ $this->w[$key][$i] = [];
+ }
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->w[$key][$i][$tag_index])) {
+ $this->w[$key][$i][$tag_index] = 0;
+ }
+ }
+ }
+ }
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->b[$tag_index])) {
+ $this->b[$tag_index] = 0;
+ }
+ }
+ echo "Training\n";
+ //train the weight
+ $cross_entropy_loss = 1;
+ $pre_cross_entropy_loss = 2;
+ for ($epoch = 0; $epoch < $max_epoch &&
+ $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++) {
+ $this->min_w=0;
+ $this->max_w=0;
+ $time = time();
+ $dy_dw = [];
+ $dy_dw_n = [];
+ $pre_cross_entropy_loss = $cross_entropy_loss;
+ $cross_entropy_loss = 0;
+ $cross_entropy_loss_n = 0;
+
+ $dy_db=[];
+ $dy_db_n=[];
+ for($i = 0; $i < count($this->tag_set); $i++) {
+ $dy_db[$i] = 0;
+ $dy_db_n[$i] = 0;
+ }
+ //for each sentence
+ foreach ($term_tag_sentences as $term_tag_pairs) {
+ $terms=$term_tag_pairs[0];
+ $tags=$term_tag_pairs[1];
+ for ($i = 0; $i < count($terms); $i++) {
+ $k=[];
+ for ($j=-2; $j<=2;$j++) {
+ $k[$j]= $this->getIndex($i+$j,$terms);
+ }
+ foreach ($this->tag_set as $possiable_tag => $tag_index) {
+ $equality = $possiable_tag == $tags[$i] ? 1 : 0;
+ $sum=0;
+ for ($j=-2; $j<=2;$j++) {
+ $sum += $this->w[$k[$j]][$j][$tag_index];
+ }
+ $sum += $this->b[$tag_index];
+ $sigmoid = 1 / (1 + exp(-1 * $sum));
+ for ($j=-2; $j<=2;$j++) {
+ if (!isset($dy_dw[$k[$j]])) {
+ $dy_dw[$k[$j]] = [];
+ $dy_dw_n[$k[$j]] = [];
+ }
+ if (!isset($dy_dw[$k[$j]][$j])) {
+ $dy_dw[$k[$j]][$j] = [];
+ $dy_dw_n[$k[$j]][$j] = [];
+ }
+ if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
+ $dy_dw[$k[$j]][$j][$tag_index] = 0;
+ $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
+ }
+
+ $dy_dw[$k[$j]][$j][$tag_index] +=
+ ($sigmoid - $equality);
+ $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
+
+ }
+ //dy_db
+ $dy_db[$tag_index] += ($sigmoid - $equality);
+ $dy_db_n[$tag_index] += 1;
+ $cross_entropy_loss+=
+ - $equality*log($sigmoid)
+ - (1-$equality)*log(1-$sigmoid);
+ $cross_entropy_loss_n++;
+ }
+ }
+ }
+ $cross_entropy_loss /= $cross_entropy_loss_n;
+ $duration = time() - $time;
+ echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}".
+ " Takes {$duration} seconds\n";
+ foreach ($dy_dw as $i =>$v1) {
+ foreach ($v1 as $j =>$v2) {
+ foreach ($v2 as $k =>$v3) {
+ $this->w[$i][$j][$k] -=
+ $dy_dw[$i][$j][$k] /
+ $dy_dw_n[$i][$j][$k] *
+ $learning_rate;
+ if ($this->w[$i][$j][$k] < $this->min_w) {
+ $this->min_w = $this->w[$i][$j][$k];
+ }
+ if ($this->w[$i][$j][$k] > $this->max_w) {
+ $this->max_w = $this->w[$i][$j][$k];
+ }
+ }
+ }
+ }
+ foreach ($dy_db as $k =>$v) {
+ $this->b[$k]-=
+ $dy_db[$k] /
+ $dy_db_n[$k] *
+ $learning_rate;
+ }
+ if ($epoch % 10 == 9 ) {
+ $this->save_weight();
+ }
+ }
+ $this->save_weight();
+ return true;
+ }
+ /**
+     * The primary function to predict the tags
+ * @param mixed $sentence is an array of segmented words/terms
+ * or a string with words/terms seperated by space
+ * @return @array of tags
+ */
+ public function predict($sentence)
+ {
+ if (!is_array($sentence)) {
+ if ($sentence == "") {
+ $terms=[];
+ } else {
+ $terms=preg_split("/[\s]+/",$sentence);
+ }
+ } else {
+ $terms=$sentence;
+ }
+ if (!count($terms)) {
+ return [];
+ }
+ if (!$this->w) {
+ $this->load_weight();
+ }
+ $ret = [];
+ for($i = 0; $i < count($terms); $i++) {
+ $term = $terms[$i];
+ $score =[];
+ $key=$this->getKey($term);
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ $score[$possiable_tag]=0;
+ for ($j=-2; $j <=2; $j++) {
+ $k=$this->getIndex($i+$j, $terms);
+ if (isset($this->w[$k])) {
+ $score[$possiable_tag] +=
+ $this->getW($k,$j,$tag_index);
+ } else if ($j==0&&in_array($possiable_tag,$this->rule_defined_key)) {
+ $score[$possiable_tag] += $this->min_w;
+ }
+ }
+
+ $score[$possiable_tag] += $this->getB($tag_index);
+
+ //$score[$possiable_tag]
+ // += 1 / (1 + exp(-1 * $score[$possiable_tag]));
+ }
+ $ret[]=array_keys($score, max($score))[0];
+ }
+ return $ret;
+ }
+ /**
+ * Wrap function for predict
+ * @param $texts to be a @string of texts
+     * @param $return_string is a boolean to determine if the user
+     * wants the output written to stdout or returned as a string
+ * @return @string if $return_string is true;
+ @boolean true otherwise
+ * e.g. 中国_NR 人民_NN 将_AD 满怀信心_VV
+ 地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU
+ */
+ public function tag($texts, $return_string=false)
+ {
+ if ($return_string) {
+ $ret = "";
+ }
+ $sentences = preg_split('/\r\n|\r|\n/', $texts);
+ foreach($sentences as $sentence) {
+ $sentence=explode(" ",trim($sentence));
+ $term_pos = $this->predict($sentence);
+ for($i = 0; $i < count($term_pos); $i++) {
+ $term_pos[$i]=$sentence[$i]."_".$term_pos[$i];
+ }
+ $t = join(" ", $term_pos);
+ if ($return_string) {
+ $ret .= $t;
+ } else {
+ echo $t, "\n";
+ }
+ }
+ if ($return_string) {
+ return $ret;
+ } else {
+ return true;
+ }
+ }
+ /**
+ * A list of private helper functions
+     * Given a sentence ($terms), find the key at position $index
+ */
+ private function getIndex($index, $terms)
+ {
+ if ($index < 0) $k = $index - 2;
+ else if ($index >= count($terms)) {
+ $k = $index - count($terms) - 2;
+ }
+ else {
+ $k = $this->getKey($terms[$index]);
+ }
+ return $k;
+ }
+ /**
+ * Get the bias value for tag
+ */
+ private function getB($tag_index)
+ {
+ return unpack("f",$this->b,$tag_index*4)[1];
+ }
+ /**
+ * Set the bias value for tag
+ */
+ private function setB($tag_index, $value)
+ {
+ $this->b = substr_replace($this->b,pack("f",$value),$tag_index*4,4);
+ }
+ /**
+     * Get the weight value for term at position for tag
+ */
+ private function getW($term, $position, $tag_index)
+ {
+ $t = unpack("S",$this->w[$term],
+ 2*($position+2)*count($this->tag_set)+$tag_index*2)[1]
+ / 65535
+ * ($this->max_w-$this->min_w) + $this->min_w;;
+ return $t;
+ }
+ /**
+ * save the trained weight to disk
+ */
+ private function save_weight()
+ {
+ $out_file = C\LOCALE_DIR . "/{$this->lang}/resources/pos_weight.txt.gz";
+ $out = [];
+ $out["min_w"] = $this->min_w;
+ $out["max_w"] = $this->max_w;
+ $out["w"]=[];
+ foreach(array_keys($this->w) as $key) {
+ $out["w"][$key] = $this->pack_w($key);
+ }
+ $out["b"] = $this->pack_b();
+ $out["tag_set"] = $this->tag_set;
+ echo "Saving...";
+ file_put_contents($out_file,
+ gzencode(serialize($out),9));
+ echo " ok\n";
+ }
+ /**
+ * load the trained weight from disk
+ */
+ private function load_weight($trainning_load=false)
+ {
+ $dic_file
+ = C\LOCALE_DIR . "/{$this->lang}/resources/pos_weight.txt.gz";
+ if (!file_exists($dic_file)) {
+ echo "$dic_file does not exist!";
+ exit();
+ }
+ $f = unserialize(gzdecode(file_get_contents($dic_file))
+ ,['allowed_classes' => false]);
+ $this->w=$f["w"];
+ $this->b=$f["b"];
+ $this->min_w=$f["min_w"];
+ $this->max_w=$f["max_w"];
+ $this->tag_set=$f["tag_set"];
+ if ($trainning_load) {
+ foreach(array_keys($this->w) as $key) {
+ $this->w[$key] = $this->unpack_w($key);
+ }
+ $this->b = $this->unpack_b($this->b);
+ }
+ }
+ /**
+ * Pack the bias
+ */
+ private function pack_b()
+ {
+ return pack("f*", ...$this->b);
+ }
+ /**
+ * Unpack the bias
+ */
+ private function unpack_b()
+ {
+ return array_merge(unpack("f".strval(count($this->tag_set)),$this->b));
+ }
+ /**
+ * Pack the weight
+ */
+ private function pack_w($key)
+ {
+ $bin_str = "";
+ foreach($this->w[$key] as $i => $t) {
+ foreach($t as $u) {
+ $v = 65535 * ($u-$this->min_w) / ($this->max_w-$this->min_w);
+ $bin_str .= pack("S", intval($v));
+ }
+ }
+ return $bin_str;
+ }
+ /**
+ * Unpack the weight
+ */
+ private function unpack_w($key)
+ {
+ $tmp = [];
+ $size = count($this->tag_set);
+ for ($i = 0; $i < 5; $i++) {
+ $tmp[$i-2] = array_merge(unpack("S".strval($size),
+ $this->w[$key], 2*$i*count($this->tag_set)));
+ for($j = 0; $j < $size; $j++) {
+ $tmp[$i-2][$j] = $tmp[$i-2][$j] / 65535
+ * ($this->max_w-$this->min_w) + $this->min_w;
+ }
+ }
+ return $tmp;
+ }
+}
diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index 474be1262..4d64658d9 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -28,7 +28,6 @@
*/
namespace seekquarry\yioop\library;
-use seekquarry\yioop\library\Trie;
use seekquarry\yioop\locale\zh_CN\resources as ZH;
use seekquarry\yioop\configs as C;
/**
@@ -39,12 +38,13 @@ use seekquarry\yioop\configs as C;
* Currently only supports Chinese.
* Instruction to add a new language:
* Add a switch case in the constructor.
- * Define the following variable:
- * $non_char_preg
* Define the following function:
* isExceptionImpl
* See the class function 'isException' for more information
- *
+ * isPunctuationImpl
+ * See the class function 'isPunctuation' for more information
+ * isNotCurrentLangImpl
+ * See the class function 'notCurrentLang' for more information
* Chinese example is provided in the constructor
*
* @author Xianghong Sun
@@ -66,7 +66,7 @@ class StochasticTermSegmenter
*/
private $cache_pct;
/**
- * Used in memory saving mode
+ * Cache. Will have runtime data for the segmentation
* @var array
*/
private $cache=[];
@@ -111,9 +111,8 @@ class StochasticTermSegmenter
switch($lang)
{
case "zh_CN":
- case "zh-CN":
+ case "zh-CN":
$this->lang = "zh_CN";
- $this->non_char_preg = "/^[^\p{Han}]+$/u";
/*
* Check if the term passed in is an exception term
*/
@@ -127,23 +126,22 @@ class StochasticTermSegmenter
*/
$this->isPunctuationImpl = function($term)
{
- return preg_match(ZH\Tokenizer::$punctuation_preg, $term);
+ return ZH\Tokenizer::isPunctuation($term);
};
- break;
- case "ja":
- $this->lang = $lang;
- $this->non_char_preg =
- "/^[^\x{4E00}-\x{9FBF}\x{3040}-\x{309F}".
- "\x{30A0}-\x{30FF}]+$/u";
- break;
- case "ko":
- $this->lang = $lang;
- $this->non_char_preg = "/^[^\x{3130}-\x{318F}".
- "\x{AC00}-\x{D7AF}]+$/u";
+ /*
+ * Check if none of the characters in the term are in the current language
+ */
+ $this->isNotCurrentLangImpl = function($term)
+ {
+ return ZH\Tokenizer::isNotCurrentLang($term);
+ };
+ /*
+ * Named entity recognizer instance for this language
+ */
+ $this->NER = ZH\Tokenizer::getNER();
break;
default:
$this->lang = $lang;
- $this->non_char_preg = "/^[^.]+$/u";
}
}
/**
@@ -208,18 +206,18 @@ class StochasticTermSegmenter
*/
public function notCurrentLang($term)
{
- return preg_match($this->non_char_preg, $term);
+ if (isset($this->isNotCurrentLangImpl))
+ return $this->isNotCurrentLangImpl($term);
+ return false;
}
/**
* Generate a term dictionary file for later segmentation
* @param mixed $text_files is a string name or an array of files
* that to be trained; words in the files need to be segmented by space
* @param string $format currently only support default and CTB
- * @param int $max_term_weights maximum number of terms ot keep
* @return bool true if success
*/
- public function train($text_files, $format = "default",
- $max_num_term_weights = 40000)
+ public function train($text_files, $format = "default")
{
$ctb_fmt=false;
switch ($format) {
@@ -254,9 +252,7 @@ class StochasticTermSegmenter
&& !$this->notCurrentLang($word)) {
if (!empty($dictionary[$word])) {
$dictionary[$word]++;
- } else if (count($dictionary) <
- $max_num_term_weights
- && mb_strlen($word) < 7) {
+ } else if (mb_strlen($word) < 7) {
$dictionary[$word] = 1;
}
}
@@ -319,6 +315,8 @@ class StochasticTermSegmenter
}
}
fclose($fh);
+ } else {
+ echo "cannot open $text_file\n";
}
}
if ($return_string) {
@@ -392,6 +390,13 @@ class StochasticTermSegmenter
if (!count($characters)) {
return [];
}
+ $ner_dict=[];
+ if (isset($this->NER)) {
+ $named_entities=$this->NER->predict($characters);
+ foreach($named_entities as $e) {
+ $this->add($e[0],1,$ner_dict);
+ }
+ }
$score = [];
$path = [];
//init base
@@ -487,6 +492,22 @@ class StochasticTermSegmenter
}
}
}
+ //check NER dictionary
+ if (isset($ner_dict[$characters[$index]])) {
+ $subdic = $ner_dict;
+ for ($j = $index; $j < count($characters); $j++) {
+ if (!isset($subdic[$characters[$j]])) {
+ break;
+ }
+ $subdic = $subdic[$characters[$j]];
+ if (isset($subdic['$']) && (!isset($score[$j]) ||
+ $score[$index - 1] + $subdic['$'] < $score[$j])) {
+ $score[$j] = $score[$index - 1] +
+ $this->getScore($subdic['$']);
+ $path[$j] = $index - 1;
+ }
+ }
+ }
}
//trace path
$t = max(array_keys($path));
diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php
index 02113c793..ede2bd986 100755
--- a/src/locale/zh_CN/resources/Tokenizer.php
+++ b/src/locale/zh_CN/resources/Tokenizer.php
@@ -30,6 +30,7 @@ namespace seekquarry\yioop\locale\zh_CN\resources;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\StochasticTermSegmenter;
+use seekquarry\yioop\library\ContextWeightedNamedEntityRecognizer;
/**
* Chinese specific tokenization code. Typically, tokenizer.php
@@ -57,23 +58,39 @@ class Tokenizer
'与', '次', '狗', '决', '金', '史', '姆', '部', '正在', '活', '刚',
'回家', '贝', '如何', '须', '战', '不會', '夫', '喂', '父', '亚', '肯定',
'女孩', '世界'];
+ /**
+ * Regular expression to determine whether none of the characters in a
+ * term belongs to the current language.
+ * @var string
+ */
+ public static $non_char_preg = "/^[^\p{Han}]+$/u";
/**
* The dictionary of characters can be used as Chinese Numbers
* @string
*/
public static $num_dict =
- "1234567890○零一二三四五六七八九十百千万亿".
+ "1234567890○〇零一二两三四五六七八九十百千万亿".
"0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億";
/**
* Dots used in Chinese Numbers
* @string
*/
- public static $dot = "\..";
+ public static $dot = "\..点";
/**
* A list of characters can be used at the end of numbers
* @string
*/
public static $num_end = "%%";
+ /**
+ * Words that match the regexes used by the functions
+ * isCardinalNumber, isOrdinalNumber, and isDate but should not be
+ * treated as matches. E.g. "十分" usually means "very", yet the
+ * regex alone would classify it as "10 minutes", so such terms
+ * must be excluded.
+ * @array of string
+ */
+ public static $exception_list= ["十分","一","一点","千万",
+ "万一", "一一", "拾", "一时", "千千", "万万", "陆"];
/**
* A list of characters can be used as Chinese punctuations
* @string
@@ -82,7 +99,17 @@ class Tokenizer
"/^([\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" .
"\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
"\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
- "\x{3A}-\x{40}\x{5B}-\x{60}]|[\—]+|[\.]+)$/u";
+ "\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u";
+ /**
+ * Stochastic Term Segmenter instance
+ * @object
+ */
+ private static $stochasticTermSegmenter;
+ /**
+ * named Entity Recognizer instance
+ * @object
+ */
+ private static $namedEntityRecognizer;
/**
* Removes the stop words from the page (used for Word Cloud generation
* and language detection)
@@ -117,8 +144,8 @@ class Tokenizer
['/^\d+$/', '/^[a-zA-Z]+$/']);
break;
case("STS"):
- $segmenter = new StochasticTermSegmenter("zh_CN");
- return $segmenter->segmentText($pre_segment,true);
+ return self::getStochasticTermSegmenter()
+ ->segmentText($pre_segment,true);
break;
}
}
@@ -127,25 +154,111 @@ class Tokenizer
*/
public static function isCardinalNumber($term)
{
- return preg_match("/^[" . self::$num_dict .
- self::$dot . "]+[" . self::$num_end .
- "]?[余餘]?[百千万亿佰仟萬億]?$/u", $term);
+ return !in_array($term,self::$exception_list)
+ && preg_match("/^([" . self::$num_dict .
+ "]+([" . self::$dot . "][" .self::$num_dict .
+ "]+)?[" . self::$num_end .
+ "]?[余餘多]?[百千万亿佰仟萬億]*)".
+ "$|^([".self::$num_dict."]+分之[" . self::$num_dict .
+ "]+([" . self::$dot . "][" .self::$num_dict .
+ "]+)?)$/u", $term);
}
/*
* Check if the term passed in is a Ordinal Number
*/
public static function isOrdinalNumber($term)
{
- return preg_match("/^第[" . self::$num_dict .
- "]*$/u", $term);
+ return !in_array($term,self::$exception_list)
+ && preg_match("/^第[" . self::$num_dict .
+ "]*$/u", $term);
}
/*
* Check if the term passed in is a date
*/
public static function isDate($term)
{
- return preg_match("/^[" . self::$num_dict .
- "]+(年|年代|月|日|时|小时|時|小時|" .
- "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term);
+ return !in_array($term,self::$exception_list)
+ && preg_match("/^[" . self::$num_dict .
+ "]+(年|年代|月|日|时|小时|時|小時|" .
+ "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term);
+ }
+ /**
+ * Check if $term is a single (possibly repeated) punctuation character
+ */
+ public static function isPunctuation($term)
+ {
+ return preg_match(self::$punctuation_preg, $term);
+ }
+ /**
+ * Check if none of the characters in the term belong to the current language
+ * @param string $term string to be checked
+ * @return bool true if no character in $term is in the current language,
+ * false otherwise
+ */
+ public static function isNotCurrentLang($term)
+ {
+ return preg_match(self::$non_char_preg, $term);
+ }
+ /**
+ * Create the shared zh_CN stochastic term segmenter instance
+ */
+ public static function createStochasticTermSegmenter($cache_pct=0.06) // $cache_pct forwarded to the segmenter's cache setting
+ {
+ self::$stochasticTermSegmenter
+ = new StochasticTermSegmenter("zh_CN", $cache_pct);
+ }
+ /**
+ * Destroy (null out) the shared stochastic term segmenter instance
+ */
+ public static function destoryStochasticTermSegmenter()
+ {
+ self::$stochasticTermSegmenter = null;
+ }
+ /**
+ * Get the shared segmenter instance, lazily creating it on first use
+ */
+ public static function getStochasticTermSegmenter() {
+ if (!self::$stochasticTermSegmenter) {
+ self::createStochasticTermSegmenter();
+ }
+ return self::$stochasticTermSegmenter;
+ }
+ public static function POSGetKey($term) { // map a special term to its POS tag code (PU/CD/OD/NT/FW), or null
+ if (self::isPunctuation($term)) {
+ return 'PU';
+ } else if (self::isCardinalNumber($term)) {
+ return 'CD';
+ } else if (self::isOrdinalNumber($term)) {
+ return 'OD';
+ } else if (self::isDate($term)) {
+ return 'NT';
+ } else if (self::isNotCurrentLang($term)) {
+ return 'FW';
+ }
+ return null; // ordinary term: no special tag
+ }
+ /**
+ * Create the shared zh_CN named entity recognizer instance
+ */
+ public static function createNER()
+ {
+ self::$namedEntityRecognizer
+ = new ContextWeightedNamedEntityRecognizer("zh_CN");
+ }
+ /**
+ * Destroy (null out) the shared named entity recognizer instance
+ */
+ public static function destoryNER()
+ {
+ self::$namedEntityRecognizer = null;
+ }
+ /**
+ * Get the shared named entity recognizer, lazily creating it on first use
+ */
+ public static function getNER() {
+ if (!self::$namedEntityRecognizer) {
+ self::createNER();
+ }
+ return self::$namedEntityRecognizer;
+ }
}