<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2020
 * @filesource
 */
namespace seekquarry\yioop\locale\zh_CN\resources;

use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\StochasticTermSegmenter;
use seekquarry\yioop\library\ContextWeightedNamedEntityRecognizer;

/**
 * Chinese specific tokenization code. Typically, tokenizer.php
 * either contains a stemmer for the language in question or
 * it specifies how many characters in a char gram
 *
 * @author Chris Pollett
 */
class Tokenizer
{
    /**
     * A list of frequently occurring terms for this locale which should
     * be excluded from certain kinds of queries. This is also used
     * for language detection
     * @var array
     */
    public static $stop_words = ['一', '人', '里', '会', '没', '她', '吗',
        '去', '也', '有', '这', '那', '不', '什', '个', '来', '要', '就',
        '我', '你', '的', '是', '了', '他', '么', '们', '在', '说', '为',
        '好', '吧', '知道', '我的', '和', '你的', '想', '只', '很', '都',
        '对', '把', '啊', '怎', '得', '还', '过', '不是', '到', '样', '飞',
        '远', '身', '任何', '生活', '够', '号', '兰', '瑞', '达', '或',
        '愿', '蒂', '別', '军', '正', '是不是', '证', '不用', '三', '乐',
        '吉', '男人', '告訴', '路', '搞', '可是', '与', '次', '狗', '决',
        '金', '史', '姆', '部', '正在', '活', '刚', '回家', '贝', '如何',
        '须', '战', '不會', '夫', '喂', '父', '亚', '肯定', '女孩', '世界'];
    /**
     * Regular expression that matches a term none of whose characters
     * belongs to the Han script (i.e., no character of the term is in
     * the current language).
     * @var string
     */
    public static $non_char_preg = "/^[^\p{Han}]+$/u";
    /**
     * The characters that can be used to write Chinese numbers. The string
     * is spliced into regex character classes, so it must not contain
     * characters that are special inside [...].
     * @var string
     */
    public static $num_dict =
        "1234567890○〇零一二两三四五六七八九十百千万亿".
        "0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億";
    /**
     * Dots (decimal separators) used in Chinese numbers; spliced into a
     * regex character class.
     * @var string
     */
    public static $dot = "\..点";
    /**
     * Characters that can be used at the end of numbers; spliced into a
     * regex character class.
     * @var string
     */
    public static $num_end = "%%";
    /**
     * Exception words of the regex found by functions:
     * isCardinalNumber, isOrdinalNumber, isDate
     * ex. "十分" in most of time means "very", but it will
     * be determined to be "10 minutes" by the function so we
     * need to remove it
     * @var string[]
     */
    public static $exception_list= ["十分","一","一点","千万", "万一", "一一",
        "拾", "一时", "千千", "万万", "陆"];
    /**
     * Regular expression matching a term that consists of a single Chinese
     * or ASCII punctuation character, optionally repeated (the trailing
     * \1* only allows repetitions of the same character).
     * NOTE(review): the range \x{21}-\x{2F} is listed twice; this is
     * harmless inside a character class and is kept as is.
     * @var string
     */
    public static $punctuation_preg =
        "/^([\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" .
        "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
        "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
        "\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u";
    /**
     * Stochastic Term Segmenter instance (created lazily by
     * getStochasticTermSegmenter)
     * @var StochasticTermSegmenter|null
     */
    private static $stochasticTermSegmenter;
    /**
     * Named Entity Recognizer instance (created lazily by getNER)
     * @var ContextWeightedNamedEntityRecognizer|null
     */
    private static $namedEntityRecognizer;
    /**
     * Removes the stop words from the page (used for Word Cloud generation
     * and language detection)
     *
     * @param mixed $data either a string or an array of string to remove
     *      stop words from
     * @return mixed $data with no stop words
     */
    public static function stopwordsRemover($data)
    {
        // the alternation over all stop words is built once per request
        static $pattern = null;
        if ($pattern === null) {
            $pattern = '/\b(' . implode('|', self::$stop_words) . ')\b/u';
        }
        return preg_replace($pattern, '', $data);
    }
    /**
     * A word segmenter.
     * Such a segmenter on input thisisabunchofwords would output
     * this is a bunch of words
     *
     * @param string $pre_segment before segmentation
     * @param string $method indicates which method to use: "RMM" delegates
     *      to PhraseParser::reverseMaximalMatch, "STS" (the default) to the
     *      stochastic term segmenter
     * @return string|null with words separated by space; null if $method
     *      is not one of the supported values
     */
    public static function segment($pre_segment, $method="STS")
    {
        switch ($method) {
            case "RMM":
                // NOTE(review): the third argument looks like patterns for
                // runs (digits, latin letters) to keep together -- confirm
                // against PhraseParser::reverseMaximalMatch
                return PhraseParser::reverseMaximalMatch($pre_segment,
                    "zh-CN", ['/^\d+$/', '/^[a-zA-Z]+$/']);
            case "STS":
                return self::getStochasticTermSegmenter()
                    ->segmentText($pre_segment, true);
            default:
                // unknown methods have always produced null; keep that
                // contract but make it explicit
                return null;
        }
    }
    /**
     * Check if the term passed in is a Cardinal Number. Two shapes are
     * accepted: a plain number (digits, optional decimal part, optional
     * number-end character, optional 余/餘/多, optional magnitude
     * characters) or a fraction written as "<number>分之<number>".
     *
     * @param string $term term to check
     * @return bool true if $term looks like a cardinal number and is not
     *      in the exception list
     */
    public static function isCardinalNumber($term)
    {
        if (self::isExceptionTerm($term)) {
            return false;
        }
        $digits = '[' . self::$num_dict . ']+';
        $decimal_part = '([' . self::$dot . '][' . self::$num_dict . ']+)?';
        $plain_number = '^(' . $digits . $decimal_part .
            '[' . self::$num_end . ']?[余餘多]?[百千万亿佰仟萬億]*)$';
        $fraction = '^(' . $digits . '分之' . $digits . $decimal_part . ')$';
        return (bool) preg_match('/' . $plain_number . '|' . $fraction . '/u',
            $term);
    }
    /**
     * Check if the term passed in is a Ordinal Number, i.e., 第 followed
     * by zero or more number characters.
     *
     * @param string $term term to check
     * @return bool true if $term looks like an ordinal number and is not
     *      in the exception list
     */
    public static function isOrdinalNumber($term)
    {
        if (self::isExceptionTerm($term)) {
            return false;
        }
        return (bool) preg_match('/^第[' . self::$num_dict . ']*$/u', $term);
    }
    /**
     * Check if the term passed in is a date or time expression, i.e.,
     * number characters followed by a date/time unit such as 年, 月, 日.
     *
     * @param string $term term to check
     * @return bool true if $term looks like a date and is not in the
     *      exception list
     */
    public static function isDate($term)
    {
        if (self::isExceptionTerm($term)) {
            return false;
        }
        return (bool) preg_match('/^[' . self::$num_dict .
            ']+(年|年代|月|日|时|小时|時|小時|' .
            '点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u', $term);
    }
    /**
     * Check if the term is a punctuation
     *
     * @param string $term term to check
     * @return int|false result of preg_match against $punctuation_preg:
     *      1 if $term is punctuation, 0 if not, false on a regex error
     */
    public static function isPunctuation($term)
    {
        return preg_match(self::$punctuation_preg, $term);
    }
    /**
     * Check if all the chars in the term is NOT current language
     *
     * @param string $term is a string that to be checked
     * @return int|false result of preg_match against $non_char_preg:
     *      1 if no char in $term is in the current language, 0 otherwise,
     *      false on a regex error
     */
    public static function isNotCurrentLang($term)
    {
        return preg_match(self::$non_char_preg, $term);
    }
    /**
     * Create stochastic term segmenter and store it for later calls to
     * getStochasticTermSegmenter
     *
     * @param float $cache_pct forwarded as the second constructor argument
     *      of StochasticTermSegmenter (presumably the fraction of memory
     *      used for caching -- confirm against that class)
     */
    public static function createStochasticTermSegmenter($cache_pct=0.06)
    {
        self::$stochasticTermSegmenter
            = new StochasticTermSegmenter("zh_CN", $cache_pct);
    }
    /**
     * Destroy stochastic term segmenter (the misspelled method name is
     * part of the public interface and is kept for compatibility)
     */
    public static function destoryStochasticTermSegmenter()
    {
        self::$stochasticTermSegmenter = null;
    }
    /**
     * Get the segmenter instance, creating it with default settings on
     * first use
     *
     * @return StochasticTermSegmenter the shared segmenter
     */
    public static function getStochasticTermSegmenter()
    {
        if (!self::$stochasticTermSegmenter) {
            self::createStochasticTermSegmenter();
        }
        return self::$stochasticTermSegmenter;
    }
    /**
     * Determines a part-of-speech key for terms that can be classified by
     * their surface form alone. The checks are tried in a fixed order and
     * the first one that succeeds wins.
     *
     * @param string $term term to classify
     * @return string|null 'PU' for punctuation, 'CD' for a cardinal number,
     *      'OD' for an ordinal number, 'NT' for a date, 'FW' for a term
     *      with no character in the current language, or null if none of
     *      the checks applies
     */
    public static function POSGetKey($term)
    {
        if (self::isPunctuation($term)) {
            return 'PU';
        }
        if (self::isCardinalNumber($term)) {
            return 'CD';
        }
        if (self::isOrdinalNumber($term)) {
            return 'OD';
        }
        if (self::isDate($term)) {
            return 'NT';
        }
        if (self::isNotCurrentLang($term)) {
            return 'FW';
        }
        return null;
    }
    /**
     * Create named entity recognizer instance and store it for later
     * calls to getNER
     */
    public static function createNER()
    {
        self::$namedEntityRecognizer
            = new ContextWeightedNamedEntityRecognizer("zh_CN");
    }
    /**
     * Destroy named entity recognizer instance (the misspelled method name
     * is part of the public interface and is kept for compatibility)
     */
    public static function destoryNER()
    {
        self::$namedEntityRecognizer = null;
    }
    /**
     * Get the named entity recognizer instance, creating it on first use
     *
     * @return ContextWeightedNamedEntityRecognizer the shared recognizer
     */
    public static function getNER()
    {
        if (!self::$namedEntityRecognizer) {
            self::createNER();
        }
        return self::$namedEntityRecognizer;
    }
    /**
     * Tells whether a term is one of the known false positives of the
     * number/date regexes. The comparison is strict so that non-string
     * input (e.g., integer 0, which loosely equals any non-numeric string
     * on PHP 7) is never mistaken for an exception word.
     *
     * @param mixed $term term to look up
     * @return bool true if $term is in $exception_list
     */
    private static function isExceptionTerm($term)
    {
        return in_array($term, self::$exception_list, true);
    }
}