diff --git a/src/library/ContextWeightedNamedEntityRecognizer.php b/src/library/ContextWeightedNamedEntityRecognizer.php
new file mode 100644
index 000000000..0790fdabc
--- /dev/null
+++ b/src/library/ContextWeightedNamedEntityRecognizer.php
@@ -0,0 +1,608 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * @author Xianghong Sun sxh19911230@gmail.com
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2019
+ * @filesource
+ */
+
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\locale\zh_CN\resources as ZH;
+
+/**
+ * Machine learning based NER tagger. Typically, ContextWeightedNERTagger.php
+ * can train the language with some dataset and predict
+ * the tag given a list of word.
+ *
+ * @author Xianghong Sun
+ */
+
+class ContextWeightedNamedEntityRecognizer
+{
+ /**
+ * Current Language, only tested on Simplified Chinese
+     * Might be extensible for other languages in the future
+ * @var string
+ */
+ public $lang;
+ /**
+ * The word weight feature
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $word_feature;
+ /**
+ * The tag weight feature
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $tag_feature;
+ /**
+ * The bias
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $bias;
+ /**
+     * All possible tags
+     * Generated by the training method
+ * @var associative array [tag => tag index]
+ */
+ private $tag_set;
+ /**
+     * The constructor of the named entity recognizer.
+     * To extend to other languages, some work is needed:
+     * Define $this->getKeyImpl, $this->rule_defined_key
+     * See Chinese example.
+     * @param string $lang describes the current language,
+     *     e.g., "zh_CN" for Simplified Chinese
+ */
+    public function __construct($lang)
+    {
+        switch($lang) {
+            case("zh_CN"):
+            case("zh-CN"):
+                $this->lang = "zh_CN";
+                break;
+            default:
+                $this->lang = $lang;
+        }
+    }
+
+    /**
+     * A function that processes the training data
+     * @param mixed $text_files can be a file or an array of file names
+     * @return array of separated sentences; each sentence has the format
+     * [[words...],[tags...]] (terms/tags run through the optional callbacks)
+     * Data format MSRA:
+     * 我们/o 是/o 受到/o 郑振铎/nr 先生/o 、/o 阿英/nr 先生/o 著作/o 的/o
+     * 启示/o ,/o 从/o 个人/o 条件/o 出发/o ,/o 瞄准/o 现代/o 出版/o 史/o
+     * 研究/o 的/o 空白/o ,/o 重点/o 集/o 藏/o 解放区/o 、/o 国民党/nt 毁/o
+     * 禁/o 出版物/o 。/o
+     * To adapt to another language, some modifications are needed
+     */
+    public static function processTexts($text_files, $term_tag_splier="/",
+        $term_process = null, $tag_process = null)
+    {
+        $ret = [];
+        foreach ($text_files as $text_file) {
+            if (file_exists($text_file)) {
+                $fn = fopen($text_file, "r");
+                while (!feof($fn)) {
+                    $line = fgets($fn);
+                    if (strpos($line, '<') !== false) {
+                        continue;
+                    }
+                    $word_tag_pairs = preg_split("/[\s ]+/u", $line);
+                    if (!count($word_tag_pairs)) {
+                        continue;
+                    }
+                    $ret[] = [];
+                    $ret[count($ret) - 1][0] = [];
+                    $ret[count($ret) - 1][1] = [];
+                    foreach ($word_tag_pairs as $word_tag_pair) {
+                        $t = explode("/", $word_tag_pair);
+                        // NER is character based: tag each char of the term
+                        if (count($t) == 2) {
+                            $tag = $tag_process ? $tag_process($t[1]) : $t[1];
+                            foreach (preg_split('//u', $t[0], -1,
+                                PREG_SPLIT_NO_EMPTY) as $ch) {
+                                $ret[count($ret) - 1][0][] =
+                                    $term_process ? $term_process($ch) : $ch;
+                                $ret[count($ret) - 1][1][] = $tag;
+                            }
+                        }
+                    }
+                }
+                fclose($fn);
+            }
+        }
+        return $ret;
+    }
+
+ /**
+ * Function to train a data
+ * Notice: This function might run very long time, depending on training set
+ * @param @mixed $text_files are training data
+ * can be a file or an array of file names
+ * @param @float $learning_rate
+ * @param @int $max_epoch 1200 might be a good one,
+ the weight will overfit if it's greater than this number
+     * @param function $term_process is a preprocess run on each term
+     * @param function $tag_process is a preprocess run on each tag
+ */
+    public function train($text_files,
+        $learning_rate=0.1, $max_epoch = 1200,
+        $term_process = null, $tag_process = null)
+    {
+        if (is_string($text_files)) {
+            $text_files = [$text_files];
+        }
+        echo "Reading files\n";
+        // term_tag_sentences[sentence#]=[[words...],[tags...]]
+        $term_tag_sentences = self::processTexts($text_files, "/",
+            $term_process, $tag_process);
+ $this->word_feature=[];
+ $this->tag_set=[];
+ $tag_index = 0;
+ for ($i = -4; $i <= -1; $i++) {
+ $this->word_feature[$i] = [];
+ }
+ foreach ($term_tag_sentences as $term_tag_pairs) {
+ $terms=$term_tag_pairs[0];
+ $tags=$term_tag_pairs[1];
+ $this->tag_feature["start"]=[];
+ $this->tag_feature["start-start"]=[];
+ for ($i = 0; $i < count($terms); $i++) {
+ if (!isset($this->tag_set[$tags[$i]])) {
+ $this->tag_set[$tags[$i]] = $tag_index++;
+ }
+ if ($i == 0) {}
+ else if ($i == 1) {
+ if (!isset($this->tag_feature["start-".$tags[$i-1]])) {
+ $this->tag_feature["start-".$tags[$i-1]]=[];
+ }
+ if (!isset($this->tag_feature[$tags[$i-1]])) {
+ $this->tag_feature[$tags[$i-1]]=[];
+ }
+ } else {
+ if (!isset($this->tag_feature[$tags[$i-2]."-".$tags[$i-1]])) {
+ $this->tag_feature[$tags[$i-2]."-".$tags[$i-1]]=[];
+ }
+ if (!isset($this->tag_feature[$tags[$i-1]])) {
+ $this->tag_feature[$tags[$i-1]]=[];
+ }
+ }
+
+ if (!isset($this->word_feature[$terms[$i]])) {
+ $this->word_feature[$terms[$i]] = [];
+ }
+ }
+ }
+ foreach (array_keys($this->word_feature) as $key) {
+ for ($i=-2; $i<=2;$i++) {
+ if (!isset($this->word_feature[$key][$i])) {
+ $this->word_feature[$key][$i] = [];
+ }
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->word_feature[$key][$i][$tag_index])) {
+ $this->word_feature[$key][$i][$tag_index] = 0;
+ }
+ }
+ }
+ }
+ foreach (array_keys($this->tag_feature) as $key) {
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->tag_feature[$key][$tag_index])) {
+ $this->tag_feature[$key][$tag_index] = 0;
+ }
+ }
+ }
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->bias[$tag_index])) {
+ $this->bias[$tag_index] = 0;
+ }
+ }
+ echo "Training...\n";
+ //train the weight
+ $cross_entropy_loss = 1;
+ $pre_cross_entropy_loss = 2;
+ for ($epoch = 0; ($epoch < $max_epoch) &&
+ $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++) {
+ $this->min_w=0;
+ $this->max_w=0;
+ $time = time();
+ $dy_dw = [];
+ $dy_dw_n = [];
+ $pre_cross_entropy_loss = $cross_entropy_loss;
+ $cross_entropy_loss = 0;
+ $cross_entropy_loss_n = 0;
+
+ $dy_db=[];
+ $dy_db_n=[];
+
+ $dy_dt=[];
+ $dy_dt_n=[];
+ for($i = 0; $i < count($this->tag_set); $i++) {
+ $dy_db[$i] = 0;
+ $dy_db_n[$i] = 0;
+ }
+ //for each sentence
+ foreach ($term_tag_sentences as $term_tag_pairs) {
+ $terms=$term_tag_pairs[0];
+ $tags=$term_tag_pairs[1];
+ for ($i = 0; $i < count($terms); $i++) {
+ $k=[];
+ for ($j=-2; $j<=2;$j++) {
+ $k[$j]= $this->getIndex($i+$j,$terms);
+ }
+ foreach ($this->tag_set as $possiable_tag => $tag_index) {
+ $equality = $possiable_tag == $tags[$i] ? 1 : 0;
+ $sum=0;
+ //5 words including itself
+ for ($j=-2; $j<=2;$j++) {
+ $sum += $this->word_feature[$k[$j]][$j][$tag_index];
+ }
+ //previous 2 tags
+ if ($i == 0) {
+ $tf1="start";
+ $tf2="start-start";
+ } else if ($i == 1) {
+ $tf1=$tags[$i-1];
+ $tf2="start-".$tags[$i-1];
+ } else {
+ $tf1=$tags[$i-1];
+ $tf2=$tags[$i-2]."-".$tags[$i-1];
+ }
+ $sum += $this->tag_feature[$tf1][$tag_index];
+ $sum += $this->tag_feature[$tf2][$tag_index];
+ //bias
+ $sum += $this->bias[$tag_index];
+ $sigmoid = 1 / (1 + exp(-1 * $sum));
+ for ($j=-2; $j<=2;$j++) {
+ if (!isset($dy_dw[$k[$j]])) {
+ $dy_dw[$k[$j]] = [];
+ $dy_dw_n[$k[$j]] = [];
+ }
+ if (!isset($dy_dw[$k[$j]][$j])) {
+ $dy_dw[$k[$j]][$j] = [];
+ $dy_dw_n[$k[$j]][$j] = [];
+ }
+ if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
+ $dy_dw[$k[$j]][$j][$tag_index] = 0;
+ $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
+ }
+
+ $dy_dw[$k[$j]][$j][$tag_index] +=
+ ($sigmoid - $equality);
+ $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
+
+ }
+ //dy_dt
+ if (!isset($dy_dt[$tf1])) {
+ $dy_dt[$tf1] = [];
+ $dy_dt_n[$tf1] = [];
+ }
+ if (!isset($dy_dt[$tf1][$tag_index])) {
+ $dy_dt[$tf1][$tag_index] = 0;
+ $dy_dt_n[$tf1][$tag_index] = 0;
+ }
+ if (!isset($dy_dt[$tf2])) {
+ $dy_dt[$tf2] = [];
+ $dy_dt_n[$tf2] = [];
+ }
+ if (!isset($dy_dt[$tf2][$tag_index])) {
+ $dy_dt[$tf2][$tag_index] = 0;
+ $dy_dt_n[$tf2][$tag_index] = 0;
+ }
+ $dy_dt[$tf1][$tag_index] += ($sigmoid - $equality);
+ $dy_dt_n[$tf1][$tag_index] += 1;
+ $dy_dt[$tf2][$tag_index] += ($sigmoid - $equality);
+ $dy_dt_n[$tf2][$tag_index] += 1;
+ //dy_db
+ $dy_db[$tag_index] += ($sigmoid - $equality);
+ $dy_db_n[$tag_index] += 1;
+ $cross_entropy_loss+=
+ - $equality*log($sigmoid)
+ - (1-$equality)*log(1-$sigmoid);
+ $cross_entropy_loss_n++;
+ }
+ }
+ }
+ $cross_entropy_loss /= $cross_entropy_loss_n;
+ $duration = time() - $time;
+ echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}".
+ " Takes {$duration} seconds\n";
+ foreach ($dy_dw as $i =>$v1) {
+ foreach ($v1 as $j =>$v2) {
+ foreach ($v2 as $k =>$v3) {
+ $this->word_feature[$i][$j][$k] -=
+ $dy_dw[$i][$j][$k] /
+ $dy_dw_n[$i][$j][$k] *
+ $learning_rate;
+ if ($this->word_feature[$i][$j][$k] < $this->min_w) {
+ $this->min_w = $this->word_feature[$i][$j][$k];
+ }
+ if ($this->word_feature[$i][$j][$k] > $this->max_w) {
+ $this->max_w = $this->word_feature[$i][$j][$k];
+ }
+ }
+ }
+ }
+ foreach ($dy_dt as $i => $v1) {
+ foreach ($v1 as $j => $v2) {
+ $this->tag_feature[$i][$j] -=
+ $dy_dt[$i][$j] /
+ $dy_dt_n[$i][$j] *
+ $learning_rate;
+ }
+ }
+ foreach ($dy_db as $k => $v) {
+ $this->bias[$k]-=
+ $dy_db[$k] /
+ $dy_db_n[$k] *
+ $learning_rate;
+ }
+ if ($epoch % 10 == 9 ) {
+ $this->save_weight();
+ }
+ }
+ $this->save_weight();
+ return true;
+ }
+ /**
+     * The primary function to predict the tags
+     * @param mixed $sentence is an array of segmented words/terms
+     *      or a string which needs to be split by $splitter
+ * @param function $splitter to process $sentence if $sentence
+ * is a string
+ * @return @array all predicted named entities with its tag
+ * ex. [["郑振铎","nr"],["国民党","nt"]]
+ */
+ public function predict($sentence, $delimiter="",$splitter=null)
+ {
+ if (!is_array($sentence)) {
+ if ($sentence == "") {
+ $terms=[];
+ } else {
+ $terms=preg_split("/[\s]+/",$sentence);
+ }
+ } else {
+ $terms=$sentence;
+ }
+ if (!count($terms)) {
+ return [];
+ }
+ if (!$this->word_feature) {
+ $this->load_weight();
+ }
+ $result = [];
+ for($i = 0; $i < count($terms); $i++) {
+ $term = $terms[$i];
+ $score =[];
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ $score[$possiable_tag]=0;
+ for ($j=-2; $j <=2; $j++) {
+ $k=$this->getIndex($i+$j, $terms);
+ if (isset($this->word_feature[$k])) {
+ $score[$possiable_tag] +=
+ $this->getW($k,$j,$tag_index);
+ }
+ }
+ if ($i == 0) {
+ $tf1="start";
+ $tf2="start-start";
+ } else if ($i == 1) {
+ $tf1=$result[$i-1];
+ $tf2="start-".$result[$i-1];
+ } else {
+ $tf1=$result[$i-1];
+ $tf2=$result[$i-2]."-".$result[$i-1];
+ }
+ $score[$possiable_tag] += $this->getT($tf1,$tag_index);
+ $score[$possiable_tag] += $this->getT($tf2,$tag_index);
+ $score[$possiable_tag] += $this->getB($tag_index);
+ }
+ $result[]=array_keys($score, max($score))[0];
+ }
+ $pre_tag='o';
+ $current_entity=null;
+ $ret=[];
+ for ($i = 0; $i < count($terms); $i++) {
+ if ($pre_tag != $result[$i] && $pre_tag != "o") {
+ if (mb_strlen($current_entity) < 10) {
+ $ret[]=[$current_entity,$pre_tag];
+ }
+ $current_entity=null;
+ }
+ if ($result[$i] != "o") {
+ if ($current_entity) {
+ $current_entity.=$delimiter.$terms[$i];
+ } else {
+ $current_entity=$terms[$i];
+ }
+ }
+ $pre_tag=$result[$i];
+ }
+ return $ret;
+ }
+
+ /**
+ * A list of private helper functions
+     * Given a sentence ($terms), find the key at position $index
+ */
+ private function getIndex($index, $terms)
+ {
+ if ($index < 0) $k = $index - 2;
+ else if ($index >= count($terms)) {
+ $k = $index - count($terms) - 2;
+ }
+ else {
+ $k = $terms[$index];
+ }
+ return $k;
+ }
+
+ /**
+ * save the trained weight to disk
+ */
+ private function save_weight()
+ {
+ $out_file = C\LOCALE_DIR . "/{$this->lang}/resources/ner_weight.txt.gz";
+ $out = [];
+ $out["min_w"] = $this->min_w;
+ $out["max_w"] = $this->max_w;
+ $out["w"]=[];
+ foreach(array_keys($this->word_feature) as $key) {
+ $out["w"][$key] = $this->pack_w($key);
+ }
+ foreach(array_keys($this->tag_feature) as $key) {
+ $out["t"][$key] = $this->pack_t($key);
+ }
+ $out["b"] = $this->pack_b();
+ $out["tag_set"] = $this->tag_set;
+ echo "Saving...";
+ file_put_contents($out_file,
+ gzencode(serialize($out),9));
+ echo " ok\n";
+ }
+ /**
+ * load the trained weight from disk
+ */
+ private function load_weight($trainning_load=false)
+ {
+ $dic_file
+ = C\LOCALE_DIR . "/{$this->lang}/resources/ner_weight.txt.gz";
+ if (!file_exists($dic_file)) {
+ echo "$dic_file does not exist!";
+ exit();
+ }
+ $f = unserialize(gzdecode(file_get_contents($dic_file))
+ ,['allowed_classes' => false]);
+ $this->word_feature=$f["w"];
+ $this->tag_feature=$f["t"];
+ $this->bias=$f["b"];
+ $this->min_w=$f["min_w"];
+ $this->max_w=$f["max_w"];
+ $this->tag_set=$f["tag_set"];
+ if ($trainning_load) {
+ foreach(array_keys($this->word_feature) as $key) {
+ $this->word_feature[$key] = $this->unpack_w($key);
+ }
+ foreach(array_keys($this->tag_feature) as $key) {
+ $this->tag_feature[$key] = $this->unpack_t($key);
+ }
+ $this->bias = $this->unpack_b();
+ }
+ }
+ /**
+ * Pack the bias
+ */
+ private function pack_b()
+ {
+ return pack("f*", ...$this->bias);
+ }
+ /**
+ * Unpack the bias
+ */
+ private function unpack_b()
+ {
+ return array_merge(unpack("f".strval(count($this->tag_set)),$this->bias));
+ }
+ /**
+ * Pack the tag_feature
+ */
+ private function pack_t($key)
+ {
+ return pack("f*", ...$this->tag_feature[$key]);
+ }
+ /**
+ * Unpack the tag_feature
+ */
+ private function unpack_t($key)
+ {
+ return array_merge(unpack("f".strval(count($this->tag_set)),$this->tag_feature[$key]));
+ }
+ /**
+ * Pack the word_feature
+ */
+ private function pack_w($key)
+ {
+ $bin_str = "";
+ foreach($this->word_feature[$key] as $i => $t) {
+ foreach($t as $u) {
+ $v = 65535 * ($u-$this->min_w) / ($this->max_w-$this->min_w);
+ $bin_str .= pack("S", intval($v));
+ }
+ }
+ return $bin_str;
+ }
+ /**
+ * Unpack the word_feature
+ */
+ private function unpack_w($key)
+ {
+ $tmp = [];
+ $size = count($this->tag_set);
+ for ($i = 0; $i < 5; $i++) {
+ $tmp[$i-2] = array_merge(unpack("S".strval($size),
+ $this->word_feature[$key], 2*$i*count($this->tag_set)));
+ for($j = 0; $j < $size; $j++) {
+ $tmp[$i-2][$j] = $tmp[$i-2][$j] / 65535
+ * ($this->max_w-$this->min_w) + $this->min_w;
+ }
+ }
+ return $tmp;
+ }
+ /**
+ * Get the bias value for tag
+ */
+ private function getB($tag_index)
+ {
+ return unpack("f",$this->bias,$tag_index*4)[1];
+ }
+ /**
+     * Get the tag_feature value for the given key and tag
+ */
+ private function getT($key, $tag_index)
+ {
+ return unpack("f",$this->tag_feature[$key],$tag_index*4)[1];
+ }
+ /**
+     * Get the weight value for term at position for tag
+ */
+ private function getW($term, $position, $tag_index)
+ {
+ $t = unpack("S",$this->word_feature[$term],
+ 2*($position+2)*count($this->tag_set)+$tag_index*2)[1]
+ / 65535
+ * ($this->max_w-$this->min_w) + $this->min_w;;
+ return $t;
+ }
+}
diff --git a/src/library/ContextWeightedPosTagger.php b/src/library/ContextWeightedPosTagger.php
new file mode 100644
index 000000000..140097b95
--- /dev/null
+++ b/src/library/ContextWeightedPosTagger.php
@@ -0,0 +1,599 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * @author Xianghong Sun sxh19911230@gmail.com
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2019
+ * @filesource
+ */
+
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\locale\zh_CN\resources as ZH;
+
+/**
+ * Machine learning based POS tagger. Typically, ContextWeightedPosTagger.php
+ * can train the language with some dataset and predict
+ * the tag given a list of word.
+ *
+ * Instruction to add a new language:
+ * Add a switch case in the constructor.
+ * Define the following functions:
+ * getKeyImpl
+ * See the class function 'getKey' for more information
+ *
+ * @author Xianghong Sun
+ */
+
+class ContextWeightedPosTagger {
+ /**
+ * Current Language, only tested on Simplified Chinese
+     * Might be extensible for other languages in the future
+ * @var string
+ */
+ public $lang;
+ /**
+ * The weight for predicting the pos tag
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $w;
+ /**
+ * The bias for predicting the pos tag
+ * y = wx + b
+     * Generated by the training method
+ * @var array
+ */
+ public $b;
+ /**
+ * range of w
+ */
+ private $min_w;
+ private $max_w;
+
+ /**
+     * All possible tags
+     * Generated by the training method
+ * @var associative array [tag => tag index]
+ */
+ private $tag_set;
+ /**
+ * Check if all the chars in the term is not current language
+ * @param $term is a string that to be checked
+ * @return true if all the chars in $term is not current language
+ * false otherwise
+ */
+ public function notCurrentLang($term)
+ {
+ return preg_match("/^[^\p{Han}]+$/u", $term);
+ }
+ /**
+     * The constructor of the pos tagger
+     * To extend to other languages, some work is needed:
+     * Define $this->getKeyImpl, $this->rule_defined_key
+     * See Chinese example.
+     * @param string $lang describes the current language
+     * @param bool $packed describes how weight and bias are stored
+ */
+    public function __construct($lang, $packed = true)
+    {
+        //$this->packed = $packed;
+        switch($lang) {
+            case("zh_CN"):
+            case("zh-CN"):
+                $this->lang = "zh_CN";
+                /*
+                 * Some tags are determined by rules instead of weights,
+                 * e.g., there are infinitely many Arabic numerals.
+                 */
+                $this->getKeyImpl = function($term) {
+                    $key = ZH\Tokenizer::POSGetKey($term);
+                    return $key ? $this->tag_set[$key] : $term;
+                };
+                // tags producible by the rules above
+                $this->rule_defined_key = ['PU','CD','OD','NT','FW'];
+                break;
+            default:
+                $this->lang = $lang;
+        }
+    }
+ /**
+ * __call for calling dynamic methods
+ * @param string $method method of this class to call
+ * @param array $args arguments to pass to method
+ * @return mixed result of method calculation
+ */
+ public function __call($method, $args)
+ {
+ return call_user_func_array($this->$method, $args);
+ }
+ /**
+ * __get for getting dynamic variables
+ * @param string $var_name variable to retrieve
+ * @return mixed result of retrieval
+ */
+ public function __get($var_name)
+ {
+ return $this->$var_name;
+ }
+ /**
+ * __set for assigning dynamic variables
+ * @param string $var_name variable to assign
+ * @param mixed $value value to assign to it
+ */
+ public function __set($var_name, $value)
+ {
+ $this->$var_name = $value;
+ }
+
+ /**
+ * check if the term can be determined by algorithm,
+     * usually by regular expression, because there are infinitely
+ * amount of them.
+ * ex. 13th is an ordinal number, 123 is a cardinal number
+ * then use the determined tag to be the weight key
+ * @param @string $term is the term to be checked
+ * @return right key in feature matrix
+ */
+ public function getKey($term)
+ {
+ if (isset($this->getKeyImpl)) {
+ return $this->getKeyImpl($term);
+ }
+ return $term;
+ }
+
+ /**
+     * A function that processes the training data
+ * @param @mixed $text_files can be a file or an array of file names
+     * @return array of separated sentences; each sentence has the format of
+ * [[words...],[tags...]]
+ * Currently, the trainning data needs to fit CTB format:
+ * term followed by a underscore and followed by the tag
+ * e.g. "新_VA 的_DEC 南斯拉夫_NR 会国_NN"
+ * To adapt to other language, some modifications are needed
+ */
+ public static function processTexts($text_files, $term_tag_splier="_",
+ $term_process = null, $tag_process = null)
+ {
+ $ret=[];
+ foreach($text_files as $text_file) {
+ if (file_exists($text_file)) {
+ $fn = fopen($text_file,"r");
+ while(! feof($fn)) {
+ $line = fgets($fn);
+ if(strpos($line, '<') !== false) {
+ continue;
+ }
+ $word_tag_pairs = preg_split("/[\s ]+/u", $line);
+ if (!count($word_tag_pairs)) {
+ continue;
+ }
+ $ret[]=[];
+ $ret[count($ret)-1][0]=[];
+ $ret[count($ret)-1][1]=[];
+ foreach ($word_tag_pairs as $word_tag_pair) {
+ $t = explode($term_tag_splier, $word_tag_pair);
+
+ if (count($t) == 2) {
+ $ret[count($ret)-1][0][] =
+ $term_process ? $term_process($t[0]) : $t[0];
+ $ret[count($ret)-1][1][] =
+ $tag_process ? $tag_process($t[1]) : $t[1];
+ }
+ }
+ }
+ fclose($fn);
+ }
+ }
+ return $ret;
+ }
+
+ /**
+ * Function to train a data
+ * Notice: This function might run very long time, depending on training set
+ * @param @mixed $text_files are training data
+ * can be a file or an array of file names
+ * @param @float $learning_rate
+ * @param @int $max_epoch 1200 might be a good one,
+ the weight will overfit if it's greater than this number
+     * @param bool $resume if true, read the weight file and continue training
+ if false, start from beginning
+ */
+ public function train($text_files, $term_tag_splier="_",
+ $learning_rate=0.1, $max_epoch = 1200,
+ $term_process = null, $tag_process = null,
+ $resume=false)
+ {
+ if (is_string($text_files)) {
+ $text_files = [$text_files];
+ }
+ echo "Reading files\n";
+ // term_tag_sentences[sentence#]=[[words...],[tags...]]
+ $term_tag_sentences = self::processTexts($text_files, $term_tag_splier,
+ $term_process, $tag_process);
+ if ($resume) {
+ echo "Loading weights... ";
+ $this->load_weight(true);
+ $tag_index = count($this->tag_set);
+ echo "ok\n";
+ } else {
+ $this->w=[];
+ $this->tag_set=[];
+ $tag_index = 0;
+ if (isset($this->rule_defined_key)) {
+ foreach($this->rule_defined_key as $k) {
+ $this->tag_set[$k] = $tag_index++;
+ }
+ }
+ for ($i = -4; $i <= -1; $i++) {
+ $this->w[$i] = [];
+ }
+ }
+ foreach ($term_tag_sentences as $term_tag_pairs) {
+ $terms=$term_tag_pairs[0];
+ $tags=$term_tag_pairs[1];
+ for ($i = 0; $i < count($terms); $i++) {
+ if (!isset($this->tag_set[$tags[$i]])) {
+ $this->tag_set[$tags[$i]] = $tag_index++;
+ }
+ $k = $this->getIndex($i,$terms);
+ if (!isset($this->w[$k])) {
+ $this->w[$k] = [];
+ }
+ }
+ }
+ foreach (array_keys($this->w) as $key) {
+ for ($i=-2; $i<=2;$i++) {
+ if (!isset($this->w[$key][$i])) {
+ $this->w[$key][$i] = [];
+ }
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->w[$key][$i][$tag_index])) {
+ $this->w[$key][$i][$tag_index] = 0;
+ }
+ }
+ }
+ }
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ if (!isset($this->b[$tag_index])) {
+ $this->b[$tag_index] = 0;
+ }
+ }
+ echo "Training\n";
+ //train the weight
+ $cross_entropy_loss = 1;
+ $pre_cross_entropy_loss = 2;
+ for ($epoch = 0; $epoch < $max_epoch &&
+ $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++) {
+ $this->min_w=0;
+ $this->max_w=0;
+ $time = time();
+ $dy_dw = [];
+ $dy_dw_n = [];
+ $pre_cross_entropy_loss = $cross_entropy_loss;
+ $cross_entropy_loss = 0;
+ $cross_entropy_loss_n = 0;
+
+ $dy_db=[];
+ $dy_db_n=[];
+ for($i = 0; $i < count($this->tag_set); $i++) {
+ $dy_db[$i] = 0;
+ $dy_db_n[$i] = 0;
+ }
+ //for each sentence
+ foreach ($term_tag_sentences as $term_tag_pairs) {
+ $terms=$term_tag_pairs[0];
+ $tags=$term_tag_pairs[1];
+ for ($i = 0; $i < count($terms); $i++) {
+ $k=[];
+ for ($j=-2; $j<=2;$j++) {
+ $k[$j]= $this->getIndex($i+$j,$terms);
+ }
+ foreach ($this->tag_set as $possiable_tag => $tag_index) {
+ $equality = $possiable_tag == $tags[$i] ? 1 : 0;
+ $sum=0;
+ for ($j=-2; $j<=2;$j++) {
+ $sum += $this->w[$k[$j]][$j][$tag_index];
+ }
+ $sum += $this->b[$tag_index];
+ $sigmoid = 1 / (1 + exp(-1 * $sum));
+ for ($j=-2; $j<=2;$j++) {
+ if (!isset($dy_dw[$k[$j]])) {
+ $dy_dw[$k[$j]] = [];
+ $dy_dw_n[$k[$j]] = [];
+ }
+ if (!isset($dy_dw[$k[$j]][$j])) {
+ $dy_dw[$k[$j]][$j] = [];
+ $dy_dw_n[$k[$j]][$j] = [];
+ }
+ if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
+ $dy_dw[$k[$j]][$j][$tag_index] = 0;
+ $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
+ }
+
+ $dy_dw[$k[$j]][$j][$tag_index] +=
+ ($sigmoid - $equality);
+ $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
+
+ }
+ //dy_db
+ $dy_db[$tag_index] += ($sigmoid - $equality);
+ $dy_db_n[$tag_index] += 1;
+ $cross_entropy_loss+=
+ - $equality*log($sigmoid)
+ - (1-$equality)*log(1-$sigmoid);
+ $cross_entropy_loss_n++;
+ }
+ }
+ }
+ $cross_entropy_loss /= $cross_entropy_loss_n;
+ $duration = time() - $time;
+ echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}".
+ " Takes {$duration} seconds\n";
+ foreach ($dy_dw as $i =>$v1) {
+ foreach ($v1 as $j =>$v2) {
+ foreach ($v2 as $k =>$v3) {
+ $this->w[$i][$j][$k] -=
+ $dy_dw[$i][$j][$k] /
+ $dy_dw_n[$i][$j][$k] *
+ $learning_rate;
+ if ($this->w[$i][$j][$k] < $this->min_w) {
+ $this->min_w = $this->w[$i][$j][$k];
+ }
+ if ($this->w[$i][$j][$k] > $this->max_w) {
+ $this->max_w = $this->w[$i][$j][$k];
+ }
+ }
+ }
+ }
+ foreach ($dy_db as $k =>$v) {
+ $this->b[$k]-=
+ $dy_db[$k] /
+ $dy_db_n[$k] *
+ $learning_rate;
+ }
+ if ($epoch % 10 == 9 ) {
+ $this->save_weight();
+ }
+ }
+ $this->save_weight();
+ return true;
+ }
+ /**
+     * The primary function to predict the tags
+ * @param mixed $sentence is an array of segmented words/terms
+ * or a string with words/terms seperated by space
+ * @return @array of tags
+ */
+ public function predict($sentence)
+ {
+ if (!is_array($sentence)) {
+ if ($sentence == "") {
+ $terms=[];
+ } else {
+ $terms=preg_split("/[\s]+/",$sentence);
+ }
+ } else {
+ $terms=$sentence;
+ }
+ if (!count($terms)) {
+ return [];
+ }
+ if (!$this->w) {
+ $this->load_weight();
+ }
+ $ret = [];
+ for($i = 0; $i < count($terms); $i++) {
+ $term = $terms[$i];
+ $score =[];
+ $key=$this->getKey($term);
+ foreach($this->tag_set as $possiable_tag => $tag_index) {
+ $score[$possiable_tag]=0;
+ for ($j=-2; $j <=2; $j++) {
+ $k=$this->getIndex($i+$j, $terms);
+ if (isset($this->w[$k])) {
+ $score[$possiable_tag] +=
+ $this->getW($k,$j,$tag_index);
+ } else if ($j==0&&in_array($possiable_tag,$this->rule_defined_key)) {
+ $score[$possiable_tag] += $this->min_w;
+ }
+ }
+
+ $score[$possiable_tag] += $this->getB($tag_index);
+
+ //$score[$possiable_tag]
+ // += 1 / (1 + exp(-1 * $score[$possiable_tag]));
+ }
+ $ret[]=array_keys($score, max($score))[0];
+ }
+ return $ret;
+ }
+ /**
+ * Wrap function for predict
+ * @param $texts to be a @string of texts
+     * @param $return_string is a boolean to determine if the user
+     * wants the output written to stdout or returned as a string
+ * @return @string if $return_string is true;
+ @boolean true otherwise
+ * e.g. 中国_NR 人民_NN 将_AD 满怀信心_VV
+ 地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU
+ */
+ public function tag($texts, $return_string=false)
+ {
+ if ($return_string) {
+ $ret = "";
+ }
+ $sentences = preg_split('/\r\n|\r|\n/', $texts);
+ foreach($sentences as $sentence) {
+ $sentence=explode(" ",trim($sentence));
+ $term_pos = $this->predict($sentence);
+ for($i = 0; $i < count($term_pos); $i++) {
+ $term_pos[$i]=$sentence[$i]."_".$term_pos[$i];
+ }
+ $t = join(" ", $term_pos);
+ if ($return_string) {
+ $ret .= $t;
+ } else {
+ echo $t, "\n";
+ }
+ }
+ if ($return_string) {
+ return $ret;
+ } else {
+ return true;
+ }
+ }
+ /**
+ * A list of private helper functions
+     * Given a sentence ($terms), find the key at position $index
+ */
+ private function getIndex($index, $terms)
+ {
+ if ($index < 0) $k = $index - 2;
+ else if ($index >= count($terms)) {
+ $k = $index - count($terms) - 2;
+ }
+ else {
+ $k = $this->getKey($terms[$index]);
+ }
+ return $k;
+ }
+ /**
+ * Get the bias value for tag
+ */
+ private function getB($tag_index)
+ {
+ return unpack("f",$this->b,$tag_index*4)[1];
+ }
+ /**
+ * Set the bias value for tag
+ */
+ private function setB($tag_index, $value)
+ {
+ $this->b = substr_replace($this->b,pack("f",$value),$tag_index*4,4);
+ }
+ /**
+     * Get the weight value for term at position for tag
+ */
+ private function getW($term, $position, $tag_index)
+ {
+ $t = unpack("S",$this->w[$term],
+ 2*($position+2)*count($this->tag_set)+$tag_index*2)[1]
+ / 65535
+ * ($this->max_w-$this->min_w) + $this->min_w;;
+ return $t;
+ }
+ /**
+ * save the trained weight to disk
+ */
+ private function save_weight()
+ {
+ $out_file = C\LOCALE_DIR . "/{$this->lang}/resources/pos_weight.txt.gz";
+ $out = [];
+ $out["min_w"] = $this->min_w;
+ $out["max_w"] = $this->max_w;
+ $out["w"]=[];
+ foreach(array_keys($this->w) as $key) {
+ $out["w"][$key] = $this->pack_w($key);
+ }
+ $out["b"] = $this->pack_b();
+ $out["tag_set"] = $this->tag_set;
+ echo "Saving...";
+ file_put_contents($out_file,
+ gzencode(serialize($out),9));
+ echo " ok\n";
+ }
+ /**
+ * load the trained weight from disk
+ */
+ private function load_weight($trainning_load=false)
+ {
+ $dic_file
+ = C\LOCALE_DIR . "/{$this->lang}/resources/pos_weight.txt.gz";
+ if (!file_exists($dic_file)) {
+ echo "$dic_file does not exist!";
+ exit();
+ }
+ $f = unserialize(gzdecode(file_get_contents($dic_file))
+ ,['allowed_classes' => false]);
+ $this->w=$f["w"];
+ $this->b=$f["b"];
+ $this->min_w=$f["min_w"];
+ $this->max_w=$f["max_w"];
+ $this->tag_set=$f["tag_set"];
+ if ($trainning_load) {
+ foreach(array_keys($this->w) as $key) {
+ $this->w[$key] = $this->unpack_w($key);
+ }
+ $this->b = $this->unpack_b($this->b);
+ }
+ }
+ /**
+ * Pack the bias
+ */
+ private function pack_b()
+ {
+ return pack("f*", ...$this->b);
+ }
+ /**
+ * Unpack the bias
+ */
+ private function unpack_b()
+ {
+ return array_merge(unpack("f".strval(count($this->tag_set)),$this->b));
+ }
+ /**
+ * Pack the weight
+ */
+ private function pack_w($key)
+ {
+ $bin_str = "";
+ foreach($this->w[$key] as $i => $t) {
+ foreach($t as $u) {
+ $v = 65535 * ($u-$this->min_w) / ($this->max_w-$this->min_w);
+ $bin_str .= pack("S", intval($v));
+ }
+ }
+ return $bin_str;
+ }
+ /**
+ * Unpack the weight
+ */
+ private function unpack_w($key)
+ {
+ $tmp = [];
+ $size = count($this->tag_set);
+ for ($i = 0; $i < 5; $i++) {
+ $tmp[$i-2] = array_merge(unpack("S".strval($size),
+ $this->w[$key], 2*$i*count($this->tag_set)));
+ for($j = 0; $j < $size; $j++) {
+ $tmp[$i-2][$j] = $tmp[$i-2][$j] / 65535
+ * ($this->max_w-$this->min_w) + $this->min_w;
+ }
+ }
+ return $tmp;
+ }
+}
diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index 474be1262..4d64658d9 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -28,7 +28,6 @@
*/
namespace seekquarry\yioop\library;
-use seekquarry\yioop\library\Trie;
use seekquarry\yioop\locale\zh_CN\resources as ZH;
use seekquarry\yioop\configs as C;
/**
@@ -39,12 +38,13 @@ use seekquarry\yioop\configs as C;
* Currently only supports Chinese.
* Instruction to add a new language:
* Add a switch case in the constructor.
- * Define the following variable:
- * $non_char_preg
* Define the following function:
* isExceptionImpl
* See the class function 'isException' for more information
- *
+ * isPunctuationImpl
+ * See the class function 'isPunctuation' for more information
+ * isNotCurrentLangImpl
+ * See the class function 'notCurrentLang' for more information
* Chinese example is provided in the constructor
*
* @author Xianghong Sun
@@ -66,7 +66,7 @@ class StochasticTermSegmenter
*/
private $cache_pct;
/**
- * Used in memory saving mode
+ * Cache. Will have runtime data for the segmentation
* @var array
*/
private $cache=[];
@@ -111,9 +111,8 @@ class StochasticTermSegmenter
switch($lang)
{
case "zh_CN":
- case "zh-CN":
+ case "zh-CN":
$this->lang = "zh_CN";
- $this->non_char_preg = "/^[^\p{Han}]+$/u";
/*
* Check if the term passed in is an exception term
*/
@@ -127,23 +126,22 @@ class StochasticTermSegmenter
*/
$this->isPunctuationImpl = function($term)
{
- return preg_match(ZH\Tokenizer::$punctuation_preg, $term);
+ return ZH\Tokenizer::isPunctuation($term);
};
- break;
- case "ja":
- $this->lang = $lang;
- $this->non_char_preg =
- "/^[^\x{4E00}-\x{9FBF}\x{3040}-\x{309F}".
- "\x{30A0}-\x{30FF}]+$/u";
- break;
- case "ko":
- $this->lang = $lang;
- $this->non_char_preg = "/^[^\x{3130}-\x{318F}".
- "\x{AC00}-\x{D7AF}]+$/u";
+ /*
+ * Check if none of the characters in the term are in the current language
+ */
+ $this->isNotCurrentLangImpl = function($term)
+ {
+ return ZH\Tokenizer::isNotCurrentLang($term);
+ };
+ /*
+ * Named entity recognizer instance for this language
+ */
+ $this->NER = ZH\Tokenizer::getNER();
break;
default:
$this->lang = $lang;
- $this->non_char_preg = "/^[^.]+$/u";
}
}
/**
@@ -208,18 +206,18 @@ class StochasticTermSegmenter
*/
public function notCurrentLang($term)
{
- return preg_match($this->non_char_preg, $term);
+ if (isset($this->isNotCurrentLangImpl))
+ return $this->isNotCurrentLangImpl($term);
+ return false;
}
/**
* Generate a term dictionary file for later segmentation
* @param mixed $text_files is a string name or an array of files
* that to be trained; words in the files need to be segmented by space
* @param string $format currently only support default and CTB
- * @param int $max_term_weights maximum number of terms ot keep
* @return bool true if success
*/
- public function train($text_files, $format = "default",
- $max_num_term_weights = 40000)
+ public function train($text_files, $format = "default")
{
$ctb_fmt=false;
switch ($format) {
@@ -254,9 +252,7 @@ class StochasticTermSegmenter
&& !$this->notCurrentLang($word)) {
if (!empty($dictionary[$word])) {
$dictionary[$word]++;
- } else if (count($dictionary) <
- $max_num_term_weights
- && mb_strlen($word) < 7) {
+ } else if (mb_strlen($word) < 7) {
$dictionary[$word] = 1;
}
}
@@ -319,6 +315,8 @@ class StochasticTermSegmenter
}
}
fclose($fh);
+ } else {
+ echo "cannot open $text_file\n";
}
}
if ($return_string) {
@@ -392,6 +390,13 @@ class StochasticTermSegmenter
if (!count($characters)) {
return [];
}
+ $ner_dict=[];
+ if (isset($this->NER)) {
+ $named_entities=$this->NER->predict($characters);
+ foreach($named_entities as $e) {
+ $this->add($e[0],1,$ner_dict);
+ }
+ }
$score = [];
$path = [];
//init base
@@ -487,6 +492,22 @@ class StochasticTermSegmenter
}
}
}
+ //check NER dictionary
+ if (isset($ner_dict[$characters[$index]])) {
+ $subdic = $ner_dict;
+ for ($j = $index; $j < count($characters); $j++) {
+ if (!isset($subdic[$characters[$j]])) {
+ break;
+ }
+ $subdic = $subdic[$characters[$j]];
+ if (isset($subdic['$']) && (!isset($score[$j]) ||
+ $score[$index - 1] + $subdic['$'] < $score[$j])) {
+ $score[$j] = $score[$index - 1] +
+ $this->getScore($subdic['$']);
+ $path[$j] = $index - 1;
+ }
+ }
+ }
}
//trace path
$t = max(array_keys($path));
diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php
index 02113c793..ede2bd986 100755
--- a/src/locale/zh_CN/resources/Tokenizer.php
+++ b/src/locale/zh_CN/resources/Tokenizer.php
@@ -30,6 +30,7 @@ namespace seekquarry\yioop\locale\zh_CN\resources;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\StochasticTermSegmenter;
+use seekquarry\yioop\library\ContextWeightedNamedEntityRecognizer;
/**
* Chinese specific tokenization code. Typically, tokenizer.php
@@ -57,23 +58,39 @@ class Tokenizer
'与', '次', '狗', '决', '金', '史', '姆', '部', '正在', '活', '刚',
'回家', '贝', '如何', '须', '战', '不會', '夫', '喂', '父', '亚', '肯定',
'女孩', '世界'];
+ /**
+ * Regular expression to determine whether none of the characters in a
+ * term belongs to the current language.
+ * @var string
+ */
+ public static $non_char_preg = "/^[^\p{Han}]+$/u";
/**
* The dictionary of characters can be used as Chinese Numbers
* @string
*/
public static $num_dict =
- "1234567890○零一二三四五六七八九十百千万亿".
+ "1234567890○〇零一二两三四五六七八九十百千万亿".
"0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億";
/**
* Dots used in Chinese Numbers
* @string
*/
- public static $dot = "\..";
+ public static $dot = "\..点";
/**
* A list of characters can be used at the end of numbers
* @string
*/
public static $num_end = "%%";
+ /**
+ * Words that match the regexes used by the functions
+ * isCardinalNumber, isOrdinalNumber, and isDate but should not be
+ * treated as matches. E.g. "十分" usually means "very", yet the
+ * regex alone would classify it as "10 minutes", so such terms
+ * must be excluded.
+ * @array of string
+ */
+ public static $exception_list= ["十分","一","一点","千万",
+ "万一", "一一", "拾", "一时", "千千", "万万", "陆"];
/**
* A list of characters can be used as Chinese punctuations
* @string
@@ -82,7 +99,17 @@ class Tokenizer
"/^([\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" .
"\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
"\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
- "\x{3A}-\x{40}\x{5B}-\x{60}]|[\—]+|[\.]+)$/u";
+ "\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u";
+ /**
+ * Stochastic Term Segmenter instance
+ * @object
+ */
+ private static $stochasticTermSegmenter;
+ /**
+ * named Entity Recognizer instance
+ * @object
+ */
+ private static $namedEntityRecognizer;
/**
* Removes the stop words from the page (used for Word Cloud generation
* and language detection)
@@ -117,8 +144,8 @@ class Tokenizer
['/^\d+$/', '/^[a-zA-Z]+$/']);
break;
case("STS"):
- $segmenter = new StochasticTermSegmenter("zh_CN");
- return $segmenter->segmentText($pre_segment,true);
+ return self::getStochasticTermSegmenter()
+ ->segmentText($pre_segment,true);
break;
}
}
@@ -127,25 +154,111 @@ class Tokenizer
*/
public static function isCardinalNumber($term)
{
- return preg_match("/^[" . self::$num_dict .
- self::$dot . "]+[" . self::$num_end .
- "]?[余餘]?[百千万亿佰仟萬億]?$/u", $term);
+ return !in_array($term,self::$exception_list)
+ && preg_match("/^([" . self::$num_dict .
+ "]+([" . self::$dot . "][" .self::$num_dict .
+ "]+)?[" . self::$num_end .
+ "]?[余餘多]?[百千万亿佰仟萬億]*)".
+ "$|^([".self::$num_dict."]+分之[" . self::$num_dict .
+ "]+([" . self::$dot . "][" .self::$num_dict .
+ "]+)?)$/u", $term);
}
/*
* Check if the term passed in is a Ordinal Number
*/
public static function isOrdinalNumber($term)
{
- return preg_match("/^第[" . self::$num_dict .
- "]*$/u", $term);
+ return !in_array($term,self::$exception_list)
+ && preg_match("/^第[" . self::$num_dict .
+ "]*$/u", $term);
}
/*
* Check if the term passed in is a date
*/
public static function isDate($term)
{
- return preg_match("/^[" . self::$num_dict .
- "]+(年|年代|月|日|时|小时|時|小時|" .
- "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term);
+ return !in_array($term,self::$exception_list)
+ && preg_match("/^[" . self::$num_dict .
+ "]+(年|年代|月|日|时|小时|時|小時|" .
+ "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term);
+ }
+ /**
+ * Check if $term is a single (possibly repeated) punctuation character
+ */
+ public static function isPunctuation($term)
+ {
+ return preg_match(self::$punctuation_preg, $term);
+ }
+ /**
+ * Check if none of the characters in the term belong to the current language
+ * @param string $term string to be checked
+ * @return bool true if no character in $term is in the current language,
+ * false otherwise
+ */
+ public static function isNotCurrentLang($term)
+ {
+ return preg_match(self::$non_char_preg, $term);
+ }
+ /**
+ * Create the shared zh_CN stochastic term segmenter instance
+ */
+ public static function createStochasticTermSegmenter($cache_pct=0.06) // $cache_pct forwarded to the segmenter's cache setting
+ {
+ self::$stochasticTermSegmenter
+ = new StochasticTermSegmenter("zh_CN", $cache_pct);
+ }
+ /**
+ * Destroy (null out) the shared stochastic term segmenter instance
+ */
+ public static function destoryStochasticTermSegmenter()
+ {
+ self::$stochasticTermSegmenter = null;
+ }
+ /**
+ * Get the shared segmenter instance, lazily creating it on first use
+ */
+ public static function getStochasticTermSegmenter() {
+ if (!self::$stochasticTermSegmenter) {
+ self::createStochasticTermSegmenter();
+ }
+ return self::$stochasticTermSegmenter;
+ }
+ public static function POSGetKey($term) { // map a special term to its POS tag code (PU/CD/OD/NT/FW), or null
+ if (self::isPunctuation($term)) {
+ return 'PU';
+ } else if (self::isCardinalNumber($term)) {
+ return 'CD';
+ } else if (self::isOrdinalNumber($term)) {
+ return 'OD';
+ } else if (self::isDate($term)) {
+ return 'NT';
+ } else if (self::isNotCurrentLang($term)) {
+ return 'FW';
+ }
+ return null; // ordinary term: no special tag
+ }
+ /**
+ * Create the shared zh_CN named entity recognizer instance
+ */
+ public static function createNER()
+ {
+ self::$namedEntityRecognizer
+ = new ContextWeightedNamedEntityRecognizer("zh_CN");
+ }
+ /**
+ * Destroy (null out) the shared named entity recognizer instance
+ */
+ public static function destoryNER()
+ {
+ self::$namedEntityRecognizer = null;
+ }
+ /**
+ * Get the shared named entity recognizer, lazily creating it on first use
+ */
+ public static function getNER() {
+ if (!self::$namedEntityRecognizer) {
+ self::createNER();
+ }
+ return self::$namedEntityRecognizer;
+ }
}