Last commit for src/locale/zh_CN/resources/Tokenizer.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2020
 * @filesource
 */
namespace seekquarry\yioop\locale\zh_CN\resources;

use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\StochasticTermSegmenter;
use seekquarry\yioop\library\ContextWeightedNamedEntityRecognizer;

/**
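 * Chinese specific tokenization code. Typically, a locale's Tokenizer.php
 * either contains a stemmer for the language in question or
 * specifies how many characters are in a char gram; this one instead
 * supplies stop words, word segmentation, and the tests used to tag
 * numbers, dates, and punctuation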
 *
 * @author Chris Pollett
 */

class Tokenizer
{
    /**
     * A list of frequently occurring terms for this locale which should
     * be excluded from certain kinds of queries. This is also used
     * for language detection
     * @var array
     */
    public static $stop_words = ['一', '人', '里', '会', '没', '她', '吗', '去',
        '也', '有', '这', '那', '不', '什', '个', '来', '要', '就', '我', '你',
        '的', '是', '了', '他', '么', '们', '在', '说', '为', '好', '吧', '知道',
        '我的', '和', '你的', '想', '只', '很', '都', '对', '把', '啊', '怎', '得',
        '还', '过', '不是', '到', '样', '飞', '远', '身', '任何', '生活', '够',
        '号', '兰', '瑞', '达', '或', '愿', '蒂', '別', '军', '正', '是不是',
        '证', '不用', '三', '乐', '吉', '男人', '告訴', '路', '搞', '可是',
        '与', '次', '狗', '决', '金', '史', '姆', '部', '正在', '活', '刚',
        '回家', '贝', '如何', '须', '战', '不會', '夫', '喂', '父', '亚', '肯定',
        '女孩', '世界'];
    /**
     * Regular expression that matches a term only when none of its
     * characters is a Han character, i.e., when the term contains nothing
     * written in the current language.
     * @var string
     */
    public static $non_char_preg = "/^[^\p{Han}]+$/u";
    /**
     * The characters that can be used to write Chinese numbers. The string
     * is spliced into regex character classes by the number and date tests.
     * NOTE(review): the ASCII digits appear in both halves of this string;
     * possibly the second run was meant to be the fullwidth digits --
     * confirm against the upstream file.
     * @var string
     */
    public static $num_dict =
       "1234567890○〇零一二两三四五六七八九十百千万亿".
       "0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億";
    /**
     * Decimal separators used in Chinese numbers; spliced into a regex
     * character class, hence the escaped dot.
     * NOTE(review): the dot is listed twice; possibly the second one was
     * meant to be a fullwidth full stop -- confirm against upstream.
     * @var string
     */
    public static $dot = "\..点";
    /**
     * Characters that may appear at the end of a number; spliced into a
     * regex character class.
     * NOTE(review): the percent sign is listed twice; possibly one was meant
     * to be the fullwidth percent sign -- confirm against upstream.
     * @var string
     */
    public static $num_end = "%%";
    /**
     * Terms that the regexes in isCardinalNumber, isOrdinalNumber and
     * isDate would accept but which should not be treated as numbers or
     * dates. For example, "十分" most of the time means "very", yet the
     * date regex would read it as "10 minutes", so it is excluded.
     * @var array of strings
     */
    public static $exception_list = ["十分", "一", "一点", "千万",
        "万一", "一一", "拾", "一时", "千千", "万万", "陆"];
    /**
     * Regular expression matching a term that consists of a single
     * punctuation character, optionally repeated (the same character
     * throughout, because of the back-reference).
     * NOTE(review): the range \x{21}-\x{2F} is listed twice; harmless but
     * redundant.
     * @var string
     */
    public static $punctuation_preg =
    "/^([\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" .
    "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
    "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
    "\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u";
    /**
     * Lazily created Stochastic Term Segmenter instance (null until
     * first use or after it has been destroyed)
     * @var object
     */
    private static $stochasticTermSegmenter;
    /**
     * Lazily created Named Entity Recognizer instance (null until
     * first use or after it has been destroyed)
     * @var object
     */
    private static $namedEntityRecognizer;
    /**
     * Removes the stop words from the page (used for Word Cloud generation
     * and language detection). A stop word is only removed where it is
     * delimited by word boundaries, so the input is presumably expected
     * to be segmented already -- confirm against callers.
     *
     * @param mixed $data either a string or an array of string to remove
     *      stop words from
     * @return mixed $data with no stop words
     */
    public static function stopwordsRemover($data)
    {
        // the pattern is built once per request and then reused
        static $pattern = "";
        if (empty($pattern)) {
            $quoted_words = [];
            foreach (self::$stop_words as $stop_word) {
                // quote so a stop word can never be read as regex syntax
                $quoted_words[] = preg_quote($stop_word, '/');
            }
            $pattern = '/\b(' . implode('|', $quoted_words) . ')\b/u';
        }
        return preg_replace($pattern, '', $data);
    }
    /**
     * A word segmenter.
     * Such a segmenter on input thisisabunchofwords would output
     * this is a bunch of words
     *
     * @param string $pre_segment  before segmentation
     * @param string $method  indicates which method to use: "RMM" delegates
     *      to PhraseParser::reverseMaximalMatch, "STS" (the default)
     *      delegates to the stochastic term segmenter
     * @return mixed whatever the chosen segmenter returns (expected to be
     *      a string with words separated by space); null if $method is
     *      not one of the supported values
     */
    public static function segment($pre_segment, $method = "STS")
    {
        switch ($method) {
            case "RMM":
                // note the hyphenated locale tag here, as opposed to the
                // underscore form passed to the segmenter constructor
                return PhraseParser::reverseMaximalMatch($pre_segment,
                    "zh-CN", ['/^\d+$/', '/^[a-zA-Z]+$/']);
            case "STS":
                return self::getStochasticTermSegmenter()
                    ->segmentText($pre_segment, true);
        }
        // unknown method: nothing is segmented
        return null;
    }
    /**
     * Check if the term passed in is a Cardinal Number. Accepted forms are
     * a run of number characters with an optional decimal part, an optional
     * end character, an optional 余/餘/多 and optional trailing magnitude
     * characters; or a fraction written as <number>分之<number>.
     *
     * @param string $term the term to be checked
     * @return bool true if $term looks like a cardinal number and is not
     *      in the exception list, false otherwise
     */
    public static function isCardinalNumber($term)
    {
        if (in_array($term, self::$exception_list, true)) {
            return false;
        }
        $number = "[" . self::$num_dict . "]+";
        $decimal_part = "([" . self::$dot . "]" . $number . ")?";
        $plain = $number . $decimal_part . "[" . self::$num_end .
            "]?[余餘多]?[百千万亿佰仟萬億]*";
        $fraction = $number . "分之" . $number . $decimal_part;
        $pattern = '/^(' . $plain . ')$|^(' . $fraction . ')$/u';
        return preg_match($pattern, $term) === 1;
    }
    /**
     * Check if the term passed in is an Ordinal Number, that is, 第
     * followed by zero or more number characters.
     *
     * @param string $term the term to be checked
     * @return bool true if $term looks like an ordinal number and is not
     *      in the exception list, false otherwise
     */
    public static function isOrdinalNumber($term)
    {
        if (in_array($term, self::$exception_list, true)) {
            return false;
        }
        $pattern = "/^第[" . self::$num_dict . "]*$/u";
        return preg_match($pattern, $term) === 1;
    }
    /**
     * Check if the term passed in is a date or time expression, that is,
     * a run of number characters followed by one time unit (year, month,
     * day, hour, minute, second, ...).
     *
     * @param string $term the term to be checked
     * @return bool true if $term looks like a date or time and is not
     *      in the exception list, false otherwise
     */
    public static function isDate($term)
    {
        if (in_array($term, self::$exception_list, true)) {
            return false;
        }
        $pattern = "/^[" . self::$num_dict .
            "]+(年|年代|月|日|时|小时|時|小時|" .
            "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u";
        return preg_match($pattern, $term) === 1;
    }
    /**
     * Check if the term is punctuation: one punctuation character,
     * possibly repeated.
     *
     * @param string $term the term to be checked
     * @return bool true if $term matches $punctuation_preg, false otherwise
     */
    public static function isPunctuation($term)
    {
        return preg_match(self::$punctuation_preg, $term) === 1;
    }
    /**
     * Check if all the chars in the term are NOT in the current language
     *
     * @param string $term the term to be checked
     * @return bool true if no char in $term is a Han character (and $term
     *      is not empty), false otherwise
     */
    public static function isNotCurrentLang($term)
    {
        return preg_match(self::$non_char_preg, $term) === 1;
    }
    /**
     * Create the stochastic term segmenter, replacing any existing instance
     *
     * @param float $cache_pct passed through to the StochasticTermSegmenter
     *      constructor; presumably the fraction of its data to keep
     *      cached -- confirm against that class
     */
    public static function createStochasticTermSegmenter($cache_pct = 0.06)
    {
        self::$stochasticTermSegmenter
            = new StochasticTermSegmenter("zh_CN", $cache_pct);
    }
    /**
     * Destroy the stochastic term segmenter by dropping the reference to
     * it (the method name's spelling is kept for backward compatibility)
     */
    public static function destoryStochasticTermSegmenter()
    {
        self::$stochasticTermSegmenter = null;
    }
    /**
     * Get the segmenter instance, creating it with default settings on
     * first use
     *
     * @return object the StochasticTermSegmenter instance
     */
    public static function getStochasticTermSegmenter()
    {
        if (!self::$stochasticTermSegmenter) {
            self::createStochasticTermSegmenter();
        }
        return self::$stochasticTermSegmenter;
    }
    /**
     * Determines a part-of-speech key for a term using only the regex
     * based tests of this class. The tests are tried in a fixed order and
     * the first one that succeeds decides the key.
     *
     * @param string $term the term to classify
     * @return mixed 'PU' if the term is punctuation, 'CD' if a cardinal
     *      number, 'OD' if an ordinal number, 'NT' if a date or time,
     *      'FW' if it has no character of the current language; null if
     *      none of the tests applies
     */
    public static function POSGetKey($term)
    {
        if (self::isPunctuation($term)) {
            return 'PU';
        }
        if (self::isCardinalNumber($term)) {
            return 'CD';
        }
        if (self::isOrdinalNumber($term)) {
            return 'OD';
        }
        if (self::isDate($term)) {
            return 'NT';
        }
        if (self::isNotCurrentLang($term)) {
            return 'FW';
        }
        return null;
    }
    /**
     * Create the named entity recognizer instance, replacing any existing
     * instance
     */
    public static function createNER()
    {
        self::$namedEntityRecognizer
            = new ContextWeightedNamedEntityRecognizer("zh_CN");
    }
    /**
     * Destroy the named entity recognizer instance by dropping the
     * reference to it (the method name's spelling is kept for backward
     * compatibility)
     */
    public static function destoryNER()
    {
        self::$namedEntityRecognizer = null;
    }
    /**
     * Get the named entity recognizer instance, creating it on first use
     *
     * @return object the ContextWeightedNamedEntityRecognizer instance
     */
    public static function getNER()
    {
        if (!self::$namedEntityRecognizer) {
            self::createNER();
        }
        return self::$namedEntityRecognizer;
    }
}
ViewGit