diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index e579f3da2..28b2704ac 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -28,7 +28,7 @@
 * A description of its usage is given in the $usage global variable
 *
 * @author Ravi Dhillon ravi.dhillon@yahoo.com, Chris Pollett (modified for n
- *     ngrams)
+ *     ngrams, added more functionality)
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2020
diff --git a/src/css/search.css b/src/css/search.css
index 5147f4ba0..3e8126063 100755
--- a/src/css/search.css
+++ b/src/css/search.css
@@ -763,7 +763,7 @@ body.mobile
 }
 #menu-options-background {
     background-color: black;
-    display:none;
+    display: none;
     min-height: 100%;
     left: 0px;
     opacity: 0.5;
diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php
index 5450dda77..37a32c6e3 100755
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@@ -129,6 +129,11 @@ function guessLocale()
  */
 function guessLocaleFromString($phrase_string, $locale_tag = null)
 {
+    static $cache = [];
+    $phrase_string = trim($phrase_string);
+    if (!empty($cache[$phrase_string])) {
+        return $cache[$phrase_string];
+    }
     if (!$locale_tag) {
         $locale_tag = getLocaleTag();
     }
@@ -152,6 +157,10 @@ function guessLocaleFromString($phrase_string, $locale_tag = null)
             }
         }
     }
+    $cache[$phrase_string] = $locale_tag;
+    if (count($cache) > 100) {
+        array_shift($cache);
+    }
     return $locale_tag;
 }
 /**
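Note: guessLocaleFromString() above memoizes guesses in a bounded, first-in
first-out cache. A minimal standalone sketch of the pattern (function and
value names here are illustrative, not Yioop APIs):

<?php
// Results are keyed by the trimmed input phrase; once the cache holds
// more than 100 entries, array_shift() evicts the oldest insertion so
// the static array stays bounded.
function memoizedGuess(string $phrase): string
{
    static $cache = [];
    $phrase = trim($phrase);
    if (isset($cache[$phrase])) {
        return $cache[$phrase];
    }
    $result = "en-US"; // stand-in for the real locale-detection work
    $cache[$phrase] = $result;
    if (count($cache) > 100) {
        array_shift($cache);
    }
    return $result;
}

One caveat carried over from the patch itself: the cache key ignores the
$locale_tag fallback argument, so two calls with the same phrase but
different fallback tags will share a single cache entry.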
[["郑振铎","nr"],["国民党","nt"]] */ - public function predict($sentence, $normalize=true) + public function predict($sentence) { if (empty($sentence)) { return []; @@ -328,11 +326,13 @@ class NamedEntityContextTagger extends ContextTagger $found_entities = []; foreach ($sentence_vector as $unnormalized) { if (!empty($this->tokenizer) && - method_exists($this->tokenizer, - "normalize")) { - $term=$this->tokenizer::normalize($unnormalized); + method_exists($this->tokenizer, "normalize")) { + /* Mainly used to map Chinese traditional to + simplified character + */ + $term = $this->tokenizer::normalize($unnormalized); } else { - $term=$unnormalized; + $term = $unnormalized; } $characters = preg_split('//u', $term, null, PREG_SPLIT_NO_EMPTY); @@ -371,10 +371,6 @@ class NamedEntityContextTagger extends ContextTagger $pre_tag = 'o'; $current_entity = ""; $entities = []; - if (!$normalize) { - $characters = preg_split('//u', $unnormalized, null, - PREG_SPLIT_NO_EMPTY); - } for ($i = 0; $i < count($characters); $i++) { if ($pre_tag != $tags[$i] && $pre_tag != "o") { if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) { diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php index 80bca2dcc..77464086a 100644 --- a/src/library/StochasticTermSegmenter.php +++ b/src/library/StochasticTermSegmenter.php @@ -209,11 +209,10 @@ class StochasticTermSegmenter } $words = preg_split("/[\s ]+/u", $line); foreach ($words as $word) { - if ($word != "" && !$this->isException($word) && + if (!empty($word) && !$this->isException($word) && !$this->notCurrentLang($word)) { if (!empty($this->tokenizer) && - method_exists($this->tokenizer, - "normalize")) { + method_exists($this->tokenizer, "normalize")) { $word=$this->tokenizer::normalize($word); } if (!empty($dictionary[$word])) { @@ -314,17 +313,15 @@ class StochasticTermSegmenter * Segments a single sentence into an array of words. * Must NOT contain any new line characters. * @param string $sentence is a string without newline to be segmented - * @param string $normalize return the normalized form - * 乾隆->干隆 * @return array of segmented words */ - public function segmentSentence($sentence, $normalize=true) + public function segmentSentence($sentence) { $t = preg_split("/[\s ]+/u", trim($sentence)); if(count($t) > 1) { $ret = []; foreach($t as $s) { - $segments = $this->segmentSentence($s, $normalize); + $segments = $this->segmentSentence($s); if (is_array($segments)) { $ret = array_merge($ret, $segments); } @@ -349,11 +346,10 @@ class StochasticTermSegmenter } $unnormalized = trim($sentence); $normalized = (!empty($this->tokenizer) && - method_exists($this->tokenizer, "normalize")) ? - $this->tokenizer::normalize($unnormalized) - : $unnormalized; + method_exists($this->tokenizer, "normalize")) ? 
diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php
index 5bf9ef0a4..0daf63e11 100755
--- a/src/locale/zh_CN/resources/Tokenizer.php
+++ b/src/locale/zh_CN/resources/Tokenizer.php
@@ -99,7 +99,6 @@
         "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
         "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
         "\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u";
-
     /**
      * Any unique identifier corresponding to the component of a triplet which
      * can be answered using a question answer list
@@ -199,20 +198,28 @@
         "AS", "ETC", "DEC", "DEG", "DEV", "MSP", "DER", "SP", "IJ", "FW"];
     /**
-     * Stochastic Term Segmenter instance
+     * StochasticTermSegmenter instance used for segmenting Chinese text
      * @var object
      */
     private static $stochastic_term_segmenter;
     /**
-     * Named Entity tagger instance
+     * Named Entity tagger instance used to recognize named entities in
+     * Chinese text
      * @var object
      */
     private static $named_entity_tagger;
     /**
-     * PosTagger instance
+     * PartOfSpeechContextTagger instance used in adding part of speech
+     * annotations to Chinese text
      * @var object
      */
     private static $pos_tagger;
+    /**
+     * Holds an associative array whose keys are traditional Chinese
+     * characters and whose values are the corresponding simplified characters
+     * @var array
+     */
+    private static $traditional_simplified_map;
     /**
      * Removes the stop words from the page (used for Word Cloud generation
      * and language detection)
@@ -225,8 +232,9 @@
     {
         static $pattern = "";
         if (empty($pattern)) {
-            $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/u';
+            $pattern = '/(' . implode('|', self::$stop_words) . ')/u';
         }
+        $data = self::normalize($data);
         $data = preg_replace($pattern, '', $data);
         return $data;
     }
@@ -308,7 +316,7 @@
         return preg_match(self::$non_char_preg, $term);
     }
     /**
-     * Get the segmenter instance
+     * Get the segmenter instance, instantiating it if necessary
      * @return StochasticTermSegmenter
      */
     public static function getStochasticTermSegmenter()
@@ -380,6 +388,31 @@
         }
         return self::$pos_tagger;
     }
+    /**
+     * Converts traditional Chinese characters to simplified characters
+     * @param string $text is a string of Chinese characters
+     * @return string normalized form of the text
+     */
+    public static function normalize($text)
+    {
+        if (empty(self::$traditional_simplified_map)) {
+            $path = C\LOCALE_DIR .
+                "/zh_CN/resources/traditional_simplified.txt.gz";
+            if (!file_exists($path)) {
+                return $text;
+            }
+            self::$traditional_simplified_map =
+                unserialize(gzdecode(file_get_contents($path)));
+        }
+        $chars = preg_split('//u', $text, null, PREG_SPLIT_NO_EMPTY);
+        $num_chars = count($chars);
+        for($i = 0; $i < $num_chars; $i++) {
+            if (isset(self::$traditional_simplified_map[$chars[$i]])) {
+                $chars[$i] = self::$traditional_simplified_map[$chars[$i]];
+            }
+        }
+        return implode("", $chars);
+    }
     /**
      * Scans a word list for phrases. For phrases found generate
      * a list of question and answer pairs at two levels of granularity:
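The patch ships traditional_simplified.txt.gz but not the script that
built it. From the read path in Tokenizer::normalize() above, the format
is a gzip-compressed, serialize()'d PHP array mapping each traditional
character to its simplified counterpart. A hypothetical generator for a
file in that format (two sample entries only; the shipped file covers the
full character set):

<?php
$map = ["乾" => "干", "亂" => "乱"];
file_put_contents("traditional_simplified.txt.gz",
    gzencode(serialize($map)));
// Round trip, mirroring the load in Tokenizer::normalize():
$restored = unserialize(gzdecode(
    file_get_contents("traditional_simplified.txt.gz")));
echo $restored["亂"]; // prints 乱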
diff --git a/src/locale/zh_CN/resources/traditional_simplified.txt.gz b/src/locale/zh_CN/resources/traditional_simplified.txt.gz
new file mode 100644
index 000000000..01686164c
Binary files /dev/null and b/src/locale/zh_CN/resources/traditional_simplified.txt.gz differ
diff --git a/src/locale/zh_CN/resources/zh2HansChar.txt.gz b/src/locale/zh_CN/resources/zh2HansChar.txt.gz
deleted file mode 100644
index 7936ab4c3..000000000
Binary files a/src/locale/zh_CN/resources/zh2HansChar.txt.gz and /dev/null differ
diff --git a/tests/ZhTokenizerTest.php b/tests/ZhTokenizerTest.php
index aaf6e4cb6..52e347ff2 100644
--- a/tests/ZhTokenizerTest.php
+++ b/tests/ZhTokenizerTest.php
@@ -32,6 +32,7 @@ namespace seekquarry\yioop\tests;
 
 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\library as L;
+use seekquarry\yioop\locale\zh_CN\resources\Tokenizer;
 use seekquarry\yioop\library\UnitTest;
 
 /**
@@ -79,4 +80,14 @@ class ZhTokenizerTest extends UnitTest
         $this->assertEqual($output_tagging, $expected_tagging,
             "Parts of Speech Correctly Tagged in Chinese Source String");
     }
+    /**
+     * Traditional to Simplified mapping test
+     */
+    public function traditionalSimplifiedTestCase()
+    {
+        $traditional = "那是一個黑暗而暴風雨的夜晚。";
+        $simplified = "那是一个黑暗而暴风雨的夜晚。";
+        $this->assertEqual(Tokenizer::normalize($traditional), $simplified,
+            "Traditional characters correctly mapped to simplified ones");
+    }
 }
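For reference, the new API in use, mirroring the test case above; this
assumes Yioop's autoloader (or equivalent require statements) is already
registered.

<?php
use seekquarry\yioop\locale\zh_CN\resources\Tokenizer;

echo Tokenizer::normalize("那是一個黑暗而暴風雨的夜晚。");
// Output: 那是一个黑暗而暴风雨的夜晚。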