diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php index 2bf749e7d..f0262d046 100644 --- a/src/configs/TokenTool.php +++ b/src/configs/TokenTool.php @@ -39,15 +39,9 @@ namespace seekquarry\yioop\configs; use seekquarry\yioop\configs as C; use seekquarry\yioop\library as L; +use seekquarry\yioop\models as M; use seekquarry\yioop\controllers\JobsController; -use seekquarry\yioop\library\FetchUrl; -use seekquarry\yioop\library\NWordGrams; -use seekquarry\yioop\library\Trie; -use seekquarry\yioop\library\StochasticTermSegmenter; -use seekquarry\yioop\library\UrlParser; -use seekquarry\yioop\models\CrawlModel; use seekquarry\yioop\models\LocaleModel; -use seekquarry\yioop\models\SearchverticalsModel; if (php_sapi_name() != 'cli' || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) { @@ -102,13 +96,16 @@ Usage ===== TokenTool is used to create suggest word dictionaries, 'n' word gram (for word entities) filter files, knowledge wiki and seed -site entries, and segment filters files for the Yioop! search engine for a -locale. It can also be used to localize Yioop text strings for a new language. -To create dictionaries or filter files, the user -puts a source file in Yioop's WORK_DIRECTORY/prepare folder. Suggest word -dictionaries are used to supply the content of the dropdown of search terms -that appears as a user is entering a query in Yioop. To make a suggest -dictionary one can use a command like: +site entries, segment filters, and named entity tag and part of speech tag +files for the Yioop! search engine for a locale. It can also be used to +localize Yioop text strings for a new language. +To create dictionaries, filter, or tag files, the user +puts a source file in Yioop's WORK_DIRECTORY/prepare folder. TokenTool will +typically output the resulting file in the folder +LOCALE_DIR/locale_tag/resources where locale_tag is the locale the +file is being created for. Suggest word dictionaries are used to supply the +content of the dropdown of search terms that appears as a user is entering a +query in Yioop. To make a suggest dictionary one can use a command like: php TokenTool.php dictionary filename locale endmarker @@ -161,7 +158,7 @@ meta pages, num_entries says to get use the num_entries wiki pages according to what the page count file says were most frequently accessed and only make knowledge wiki entries for these, num_seeds says only use the infoboxes on the top num_seeds many pages as sources of seed site urls. Seed site urls will be -written to thte current crawl.ini file used to set the crawl parameters of the +written to the current crawl.ini file used to set the crawl parameters of the next crawl. In some Asian languages, such as Chinese, Japanese, there are no spaces @@ -175,24 +172,55 @@ which is a less accurate way to do segmentation. To create such files, TokenTool.php is run from the command line as: -php TokenTool.php stochastic-segmenter locale files_format dataset_files... +php TokenTool.php stochastic-segmenter locale files_format max_files dataset_files... or php TokenTool.php segment-filter locale dictionary_file -respectively. - -Here locale is the IANA language tag of the locale to store the results for, -files_format is the format of the files, +respectively. Here locale is the IANA language tag of the locale to store +the results for, files_format is the format of the files, currently supported format can be "default" or "CTB" (Chinese Tree Bank). 
The default format has all word segmented by space, CTB information
can be found at:
-http://www.cs.brandeis.edu/~clp/ctb/
-dataset_files... should be a list of text files with the format described above
-and dictionary_file is a text file or glob pattern to such text files with one
-word/line. An example is:
-php TokenTool.php stochastic-segmenter zh-CN CTB segmented/*
+https://www.cs.brandeis.edu/~clp/ctb/
+max_files is the maximum number of found dataset files to process during
+training. If negative, train on all matched files; if positive, train on at
+most that many files (useful if running out of memory). dataset_files... should
+be a list of text files or glob patterns for such text files with the format
+described above, and dictionary_file is a text file or glob pattern for such
+text files with one word/line. An example is:
+
+php TokenTool.php stochastic-segmenter zh-CN CTB -1 segmented/*
+
+segment-filter outputs a file LOCALE_DIR/locale_tag/resources/segment.ftr
+and stochastic-segmenter outputs a file
+LOCALE_DIR/locale_tag/resources/term_weights.txt.gz
+
+TokenTool can be used to create named entity tag files. These can be
+used to find named entities in text passages for a language, and are
+used as part of the stochastic-segmenter process. The command to create such
+a file is:
+
+php TokenTool.php entity-tagger locale tag_separator max_files dataset_files...
+
+The arguments are the same as for stochastic-segmenter except tag_separator.
+The input training files contain tagged white space separated terms.
+If the tag_separator was '-', then non-named entity examples should look like:
+term-o, and named entity examples might look like term-nr, term-ns, or term-nt
+where nr = proper noun, ns = place name, and nt = temporal noun. For Chinese,
+one can use a little script-fu to convert the postagged data of the Chinese
+treebank into this format. The file output by running the entity-tagger command
+is LOCALE_DIR/locale_tag/resources/nect_weights.txt.gz
+
+TokenTool can also be used to create part of speech tag files. These can be
+used for Yioop's question answering subsystem for a language. The command to
+create such a file is:
+
+php TokenTool.php pos-tagger locale tag_separator max_files dataset_files...
+
+The command line arguments are the same as the entity-tagger command. The file
+output by this command is: LOCALE_DIR/locale_tag/resources/pos_weights.txt.gz
 
 Localizing Yioop's web app strings to a new language can be done manually
 via the Manage Locale activity within Yioop. Alternatively, TokenTool can
@@ -257,6 +285,12 @@ switch ($argv[1]) {
         }
         makeSuggestTrie($argv[2], $argv[3], $argv[4]);
         break;
+    case "entity-tagger":
+        $file_names = getTrainingFileNames($argv);
+        $ne_tagger = new L\NamedEntityContextTagger($argv[2]);
+        $ne_tagger->train($file_names, $argv[3]);
+        echo "Training Complete!";
+        break;
     case "filter":
         array_shift($argv);
         array_shift($argv);
@@ -270,25 +304,10 @@ switch ($argv[1]) {
             $argv[5], $argv[6]);
         break;
     case "pos-tagger":
-        $file_path = PREP_DIR . "/";
-        if (!isset($argv[3])) {
-            echo $usage;
-        }
-        $texts = [];
-        for($i = 4; $i < count($argv); $i++) {
-            $files = glob($file_path . 
$argv[$i]);
-            if (count($files) == 0) {
-                echo "error: {$file_path}{$argv[i]}: File not found\n";
-                exit();
-            }
-            $texts = array_merge($texts, $files);
-        }
-        $pos_tagger = new ContextWeightedPosTagger($argv[2]);
-        if ($pos_tagger->train($texts, "/")) {
-            echo "Success\n";
-        } else {
-            echo "Failed\n";
-        }
+        $file_names = getTrainingFileNames($argv);
+        $pos_tagger = new L\PartOfSpeechContextTagger($argv[2]);
+        $pos_tagger->train($file_names, $argv[3]);
+        echo "Training Complete!";
         break;
     case "segment-filter":
         $file_path = PREP_DIR . "/";
@@ -296,28 +315,13 @@ switch ($argv[1]) {
             echo $argv[3] . " does not exist in " . $file_path;
             exit();
         }
-        NWordGrams::makeSegmentFilterFile($file_path . $argv[3], $argv[2]);
+        L\NWordGrams::makeSegmentFilterFile($file_path . $argv[3], $argv[2]);
         break;
     case "stochastic-segmenter":
-        $file_path = PREP_DIR . "/";
-        if (!isset($argv[3])) {
-            echo $usage;
-        }
-        $texts = [];
-        for($i = 4; $i < count($argv); $i++) {
-            $files = glob($file_path . $argv[$i]);
-            if (count($files) == 0) {
-                echo "error: {$file_path}{$argv[$i]}: File not found\n";
-                exit();
-            }
-            $texts = array_merge($texts, $files);
-        }
-        $segmenter = new StochasticTermSegmenter($argv[2]);
-        if ($segmenter->train($texts, $argv[3])) {
-            echo "Success\n";
-        } else {
-            echo "Failed\n";
-        }
+        $file_names = getTrainingFileNames($argv);
+        $segmenter = new L\StochasticTermSegmenter($argv[2]);
+        $segmenter->train($file_names, $argv[3]);
+        echo "Training Complete!";
         break;
     case "translate-locale":
         if (!isset($argv[2])) {
@@ -334,6 +338,40 @@ if (!PROFILE) {
         "by visiting its web interface on localhost.\n";
     exit();
 }
+/**
+ * Returns an array of filenames to be used for training the current
+ * task in TokenTool
+ * @param array $command_line_args supplied to TokenTool.php. Assume
+ *      array of the format:
+ *      [ ... max_file_names_to_consider, file_glob1, file_glob2, ...]
+ * @param int $start_index index in $command_line_args of
+ *      max_file_names_to_consider
+ * @return array $file_names of files with training data
+ */
+function getTrainingFileNames($command_line_args, $start_index = 4)
+{
+    global $usage;
+    $file_path = PREP_DIR . "/";
+    if (!isset($command_line_args[$start_index + 1])) {
+        echo $usage;
+        exit();
+    }
+    $file_names = [];
+    for($i = $start_index + 1; $i < count($command_line_args); $i++) {
+        $files = glob($file_path . $command_line_args[$i]);
+        if (count($files) == 0) {
+            echo "error: {$file_path}{$command_line_args[$i]}: File not found\n";
+            exit();
+        }
+        $file_names = array_merge($file_names, $files);
+    }
+    if ($command_line_args[$start_index] > 0) {
+        $file_names = array_slice($file_names, 0,
+            $command_line_args[$start_index]);
+    }
+    return $file_names;
+}
+
 /**
 * Generates knowledge wiki callouts for search results pages based
 * on the first paragraph of a Wikipedia Page that matches a given query.
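
To make the max_files handling of getTrainingFileNames() concrete, the
following is a small standalone sketch, not part of the patch itself; the
prepare directory and the ctb_*.txt names are hypothetical stand-ins for
PREP_DIR and real training data:

<?php
// Sketch of how TokenTool resolves training file arguments. Mirrors
// getTrainingFileNames() above with hypothetical paths.
function resolveTrainingFiles($args, $start_index = 4)
{
    $prep_dir = "/tmp/prepare/"; // stand-in for PREP_DIR . "/"
    $file_names = [];
    for ($i = $start_index + 1; $i < count($args); $i++) {
        $matches = glob($prep_dir . $args[$i]);
        if (empty($matches)) {
            exit("error: {$prep_dir}{$args[$i]}: File not found\n");
        }
        $file_names = array_merge($file_names, $matches);
    }
    // $args[$start_index] holds max_files; a negative value means train
    // on every file the globs matched
    if ($args[$start_index] > 0) {
        $file_names = array_slice($file_names, 0, $args[$start_index]);
    }
    return $file_names;
}
// Mimics: php TokenTool.php pos-tagger zh-CN _ 1 'ctb_*.txt'
// All ctb_*.txt files under the prepare folder are globbed, then only the
// first match is kept, since max_files is 1.
print_r(resolveTrainingFiles(["TokenTool.php", "pos-tagger", "zh-CN",
    "_", "1", "ctb_*.txt"]));

Note that a positive max_files caps the combined list after all globs have
been expanded, not each glob individually.
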
@@ -392,8 +428,8 @@ function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file,
         $rank_titles[$title] = $i;
         $i++;
     }
-    $verticals_model = new SearchverticalsModel();
-    $crawl_model = new CrawlModel();
+    $verticals_model = new M\SearchverticalsModel();
+    $crawl_model = new M\CrawlModel();
     list($fr, $read, $close) = smartOpen($wiki_dump_file);
     $input_buffer = "";
     $time = time();
@@ -495,7 +531,7 @@ function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file,
             }
             if (preg_match("/。|\.|\!|\?/", $first_paragraph)) {
                 if (!empty($website)) {
-                    $simplified_website = UrlParser::simplifyUrl($website,
+                    $simplified_website = L\UrlParser::simplifyUrl($website,
                         100);
                     $website = "[[$website|$simplified_website]]";
                 }
@@ -810,7 +846,7 @@ function translateLocale($locale_tag)
         $translate_text = str_replace("%s", "101",
             $translate_from_data[$string_id]);
         $url = $pre_url . urlencode($translate_text);
-        $response = json_decode(FetchUrl::getPage($url), true);
+        $response = json_decode(L\FetchUrl::getPage($url), true);
         if (!empty($response["text"][0]) && !empty($response["code"]) &&
             $response["code"] == 200) {
             $translated_text = str_replace("101", "%s",
@@ -903,7 +939,7 @@ function makeSuggestTrie($dict_file, $locale, $end_marker)
     // Read and load dictionary and stop word files
     $words = fileWithTrim($dict_file);
     sort($words);
-    $trie = new Trie($end_marker);
+    $trie = new L\Trie($end_marker);
     /** Ignore the words in the following cases. If the word
      * - contains punctuation
diff --git a/src/library/ContextTagger.php b/src/library/ContextTagger.php
index ed4482074..549760cae 100644
--- a/src/library/ContextTagger.php
+++ b/src/library/ContextTagger.php
@@ -160,7 +160,7 @@ abstract class ContextTagger
             if (count($t) == 2) {
                 $ret[count($ret) - 1][0][] =
                     $term_callback ? $term_callback($t[0]) : $t[0];
-                $ret[count($ret)-1][1][] =
+                $ret[count($ret) - 1][1][] =
                     $tag_callback ? $tag_callback($t[1]) : $t[1];
             }
         }
@@ -380,6 +380,42 @@ abstract class ContextTagger
             ($this->max_w - $this->min_w) + $this->min_w;;
         return $t;
     }
+    /**
+     * Tags a sequence of strings according to this tagger's predict method
+     * returning the tagged result as a string.
+     * This function is mainly used to facilitate unit testing of taggers.
+     * @param string $text to be tagged
+     * @param string $tag_separator terms in the output string will
+     *      be the terms from the input texts followed by $tag_separator
+     *      followed by their tag. So if $tag_separator == "_", then a term
+     *      中国 in the input texts might be 中国_NR in the output string
+     * @return string single string where terms in the input texts
+     *      have been tagged. For example, output might look like:
+     *      中国_NR 人民_NN 将_AD 满怀信心_VV
+     *      地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU
+     */
+    public function tag($text, $tag_separator = "_")
+    {
+        $tagged_text = "";
+        $lines = preg_split('/\r\n|\r|\n/u', $text);
+        foreach($lines as $line) {
+            $line_vector = explode(" ", trim($line));
+            $tag_vector = $this->predict($line_vector);
+            $tagged_term_vector = [];
+            for($i = 0; $i < count($tag_vector); $i++) {
+                if (is_array($tag_vector[$i])) {
+                    list($term_element, $tag_element) = $tag_vector[$i];
+                } else {
+                    $term_element = $line_vector[$i];
+                    $tag_element = $tag_vector[$i];
+                }
+                $tagged_term_vector[$i] = $term_element . $tag_separator .
+                    $tag_element;
+            }
+            $tagged_text .= join(" ", $tagged_term_vector) . "\n";
+        }
+        return rtrim($tagged_text, "\n");
+    }
     /**
      * Uses text files to train a tagger for terms or chars in a document
      * @param mixed $text_files with training data.
These can be a file or
diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php
index 94a2cba73..53de40890 100644
--- a/src/library/NamedEntityContextTagger.php
+++ b/src/library/NamedEntityContextTagger.php
@@ -41,6 +41,14 @@ use seekquarry\yioop\configs as C;
  */
 class NamedEntityContextTagger extends ContextTagger
 {
+    /**
+     * Maximum character length of a named entity
+     */
+    const MAX_ENTITY_LENGTH = 10;
+    /**
+     * Minimum decrease in cross entropy between epochs or we stop training
+     */
+    const MIN_ENTROPY_CHANGE = 0.000001;
     /**
      * Constructor for the NamedEntityContextTagger.
      * Sets the language this tagger tags for and sets up the path for
@@ -57,6 +65,12 @@ class NamedEntityContextTagger extends ContextTagger
      * so that from a two chars before a term, two chars after a char context,
      * together with a two tags before a term context and a term,
      * the odds that a named entity as been found can be calculated
+     * Format of the training files should be tagged white space separated
+     * terms. If the separator was '-', then non-named entity examples should
+     * look like term-o, and named entity examples might look like term-nr,
+     * term-ns, or term-nt where nr = proper noun, ns = place name, and
+     * nt = temporal noun. The use of a $tag_callback might help in mapping
+     * more general datasets into this format.
      *
      * @param mixed $text_files with training data. These can be a file or
      * an array of file names.
      * @param float $learning_rate learning rate when cycling over data trying
      * to minimize the cross-entropy loss in the prediction of the tag of the
      * middle term.
-     * @param int $num_epoch number of times to cycle through the
+     * @param int $num_epochs number of times to cycle through the
      * complete data set. Default value of 1200 seems to avoid overfitting
      * @param function $term_callback callback function applied to a term
      * before adding term to sentence term array as part of processing and
      * training with a sentence.
      * @param function $tag_callback callback function applied to a tag
      * before adding tag to sentence tag array as part of
      * processing and training with a sentence.
      */
     public function train($text_files, $term_tag_separator = "-",
-        $learning_rate = 0.1, $num_epoch = 1200, $term_callback = null,
+        $learning_rate = 0.1, $num_epochs = 1200, $term_callback = null,
         $tag_callback = null, $resume = false)
     {
         if (is_string($text_files)) {
             $text_files = [$text_files];
         }
-        echo "Reading files\n";
-        // term_tag_sentences[sentence#]=[[words...],[tags...]]
+        echo "Reading files... \n";
+        // term_tag_sentences[sentence#] = [[words...], [tags...]]
         $term_tag_sentences = self::processTexts($text_files,
             $term_tag_separator, $term_callback, $tag_callback);
         $this->word_feature = [];
@@ -100,10 +114,10 @@ class NamedEntityContextTagger extends ContextTagger
                 if (!isset($this->tag_set[$tags[$i]])) {
                     $this->tag_set[$tags[$i]] = $tag_index++;
                 }
-                if ($i == 0) {}
-                else if ($i == 1) {
+                if ($i == 0) {
+                } else if ($i == 1) {
                     if (!isset($this->tag_feature["start-" . $tags[$i-1]])) {
-                        $this->tag_feature["start-".$tags[$i - 1]] = [];
+                        $this->tag_feature["start-" . 
$tags[$i - 1]] = []; } if (!isset($this->tag_feature[$tags[$i - 1]])) { $this->tag_feature[$tags[$i - 1]] = []; @@ -124,7 +138,7 @@ class NamedEntityContextTagger extends ContextTagger } } foreach (array_keys($this->word_feature) as $key) { - for ($i = -2; $i <= 2 ;$i++) { + for ($i = -2; $i <= 2; $i++) { if (!isset($this->word_feature[$key][$i])) { $this->word_feature[$key][$i] = []; } @@ -151,9 +165,9 @@ class NamedEntityContextTagger extends ContextTagger //train the weight $cross_entropy_loss = 1; $pre_cross_entropy_loss = 2; - for ($epoch = 0; ($epoch < $num_epoch) && - $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; - $epoch++) { + for ($epoch = 0; $epoch < $num_epochs && + $pre_cross_entropy_loss - $cross_entropy_loss > + self::MIN_ENTROPY_CHANGE; $epoch++) { $this->min_w = 0; $this->max_w = 0; $time = time(); @@ -172,19 +186,20 @@ class NamedEntityContextTagger extends ContextTagger } //for each sentence foreach ($term_tag_sentences as $term_tag_pairs) { - $terms=$term_tag_pairs[0]; - $tags=$term_tag_pairs[1]; + $terms = $term_tag_pairs[0]; + $tags = $term_tag_pairs[1]; for ($i = 0; $i < count($terms); $i++) { - $k=[]; - for ($j=-2; $j <= 2;$j++) { - $k[$j]= $this->getIndex($i + $j,$terms); + $k = []; + for ($j = -2; $j <= 2; $j++) { + $k[$j] = $this->getIndex($i + $j, $terms); } foreach ($this->tag_set as $possible_tag => $tag_index) { - $equality = $possible_tag == $tags[$i] ? 1 : 0; - $sum=0; - //5 words including itself - for ($j=-2; $j <= 2; $j++) { - $sum += $this->word_feature[$k[$j]][$j][$tag_index]; + $equality = ($possible_tag == $tags[$i]) ? 1 : 0; + $sum = 0; + //5 terms including term itself + for ($j = -2; $j <= 2; $j++) { + $sum += $this->word_feature[$k[$j]][$j][$tag_index] + ?? 0; } //previous 2 tags if ($i == 0) { @@ -202,7 +217,7 @@ class NamedEntityContextTagger extends ContextTagger //bias $sum += $this->bias[$tag_index]; $sigmoid = 1 / (1 + exp(-1 * $sum)); - for ($j=-2; $j<=2;$j++) { + for ($j = -2; $j <= 2; $j++) { if (!isset($dy_dw[$k[$j]])) { $dy_dw[$k[$j]] = []; $dy_dw_n[$k[$j]] = []; @@ -251,11 +266,12 @@ class NamedEntityContextTagger extends ContextTagger } $cross_entropy_loss /= $cross_entropy_loss_n; $duration = time() - $time; - echo "Epoch {$epoch} cross_entropy {$cross_entropy_loss}". - " took {$duration} seconds\n"; + echo "Epoch {$epoch} of {$num_epochs} took {$duration} seconds." . + " Current cross_entropy is {$cross_entropy_loss}\n"; foreach ($dy_dw as $i => $v1) { foreach ($v1 as $j => $v2) { foreach ($v2 as $k => $v3) { + $this->word_feature[$i][$j][$k] ??= 0; $this->word_feature[$i][$j][$k] -= $dy_dw[$i][$j][$k] / $dy_dw_n[$i][$j][$k] * $learning_rate; if ($this->word_feature[$i][$j][$k] < $this->min_w) { @@ -307,7 +323,7 @@ class NamedEntityContextTagger extends ContextTagger if (!$this->word_feature) { $this->loadWeights(); } - $result = []; + $results = []; for($i = 0; $i < count($terms); $i++) { $term = $terms[$i]; $score = []; @@ -324,36 +340,41 @@ class NamedEntityContextTagger extends ContextTagger $tf1 = "start"; $tf2 = "start-start"; } else if ($i == 1) { - $tf1 = $result[$i - 1]; - $tf2 = "start-" . $result[$i - 1]; + $tf1 = $results[$i - 1]; + $tf2 = "start-" . $results[$i - 1]; } else { - $tf1 = $result[$i - 1]; - $tf2 = $result[$i - 2] . "-" . $result[$i-1]; + $tf1 = $results[$i - 1]; + $tf2 = $results[$i - 2] . "-" . 
$results[$i - 1];
                 }
                 $score[$possible_tag] += $this->getT($tf1, $tag_index);
                 $score[$possible_tag] += $this->getT($tf2, $tag_index);
                 $score[$possible_tag] += $this->getB($tag_index);
             }
-            $result[] = array_keys($score, max($score))[0];
+            $results[] = array_keys($score, max($score))[0];
         }
         $pre_tag = 'o';
         $current_entity = "";
         $ret = [];
         for ($i = 0; $i < count($terms); $i++) {
-            if ($pre_tag != $result[$i] && $pre_tag != "o") {
-                if (mb_strlen($current_entity) < 10) {
+            if ($pre_tag != $results[$i] && $pre_tag != "o") {
+                if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) {
                     $ret[] = [$current_entity, $pre_tag];
                 }
                 $current_entity = "";
             }
-            if ($result[$i] != "o") {
+            if ($results[$i] != "o") {
                 if ($current_entity) {
                     $current_entity .= $terms[$i];
                 } else {
                     $current_entity = $terms[$i];
                 }
             }
-            $pre_tag = $result[$i];
+            $pre_tag = $results[$i];
+        }
+        if ($pre_tag != "o") {
+            if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) {
+                $ret[] = [$current_entity, $pre_tag];
+            }
         }
         return $ret;
     }
diff --git a/src/library/PartOfSpeechContextTagger.php b/src/library/PartOfSpeechContextTagger.php
index 10fa5eec9..c8523f7c7 100644
--- a/src/library/PartOfSpeechContextTagger.php
+++ b/src/library/PartOfSpeechContextTagger.php
@@ -41,6 +41,10 @@ use seekquarry\yioop\configs as C;
  */
 class PartOfSpeechContextTagger extends ContextTagger
 {
+    /**
+     * Minimum decrease in cross entropy between epochs or we stop training
+     */
+    const MIN_ENTROPY_CHANGE = 0.000001;
     /**
      * Constructor for the part of speech tagger.
      * Sets the language this tagger tags for and sets up the path for
@@ -56,7 +60,7 @@ class PartOfSpeechContextTagger extends ContextTagger
      * Uses text files containing sentences to create a matrix
      * so that from a two term before a term, two term after a term context
      * and a term, the odds of each of its possible parts of speech can be
-     * calculated
+     * calculated.
      *
      * @param mixed $text_files with training data. These can be a file or
      * an array of file names. For now these files are assumed to be in
@@ -78,13 +82,13 @@ class PartOfSpeechContextTagger extends ContextTagger
      * if false, start from beginning
      */
     public function train($text_files, $term_tag_separator = "-",
-        $learning_rate = 0.1, $num_epoch = 1200, $term_callback = null,
+        $learning_rate = 0.1, $num_epochs = 1200, $term_callback = null,
         $tag_callback = null, $resume = false)
     {
         if (is_string($text_files)) {
             $text_files = [$text_files];
         }
-        echo "Reading files\n";
+        echo "Reading files... 
\n"; // term_tag_sentences[sentence#] = [[words...], [tags...]] $term_tag_sentences = self::processTexts($text_files, $term_tag_separator, $term_callback, $tag_callback); @@ -138,12 +142,13 @@ class PartOfSpeechContextTagger extends ContextTagger $this->bias[$tag_index] = 0; } } - echo "Training\n"; + echo "Training...\n"; //train the weight $cross_entropy_loss = 1; $pre_cross_entropy_loss = 2; - for ($epoch = 0; $epoch < $num_epoch && - $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++){ + for ($epoch = 0; $epoch < $num_epochs && + $pre_cross_entropy_loss - $cross_entropy_loss > + self::MIN_ENTROPY_CHANGE; $epoch++) { $this->min_w = 0; $this->max_w = 0; $time = time(); @@ -163,7 +168,7 @@ class PartOfSpeechContextTagger extends ContextTagger $terms = $term_tag_pairs[0]; $tags = $term_tag_pairs[1]; for ($i = 0; $i < count($terms); $i++) { - $k=[]; + $k = []; for ($j = -2; $j <= 2; $j++) { $k[$j] = $this->getIndex($i + $j, $terms); } @@ -171,7 +176,8 @@ class PartOfSpeechContextTagger extends ContextTagger $equality = ($possible_tag == $tags[$i]) ? 1 : 0; $sum = 0; for ($j = -2; $j <= 2; $j++) { - $sum += $this->word_feature[$k[$j]][$j][$tag_index]; + $sum += $this->word_feature[$k[$j]][$j][$tag_index] + ?? 0; } $sum += $this->bias[$tag_index]; $sigmoid = 1 / (1 + exp(-1 * $sum)); @@ -203,15 +209,14 @@ class PartOfSpeechContextTagger extends ContextTagger } $cross_entropy_loss /= $cross_entropy_loss_n; $duration = time() - $time; - echo "Epoch {$epoch} cross_entropy {$cross_entropy_loss}" . - " took {$duration} seconds\n"; + echo "Epoch {$epoch} of {$num_epochs} took {$duration} seconds." . + " Current cross_entropy is {$cross_entropy_loss}\n"; foreach ($dy_dw as $i => $v1) { foreach ($v1 as $j => $v2) { foreach ($v2 as $k => $v3) { - $this->word_feature[$i][$j][$k] -= - $dy_dw[$i][$j][$k] / - $dy_dw_n[$i][$j][$k] * - $learning_rate; + $this->word_feature[$i][$j][$k] ??= 0; + $this->word_feature[$i][$j][$k] -= $dy_dw[$i][$j][$k] / + $dy_dw_n[$i][$j][$k] * $learning_rate; if ($this->word_feature[$i][$j][$k] < $this->min_w) { $this->min_w = $this->word_feature[$i][$j][$k]; } @@ -224,7 +229,7 @@ class PartOfSpeechContextTagger extends ContextTagger foreach ($dy_db as $k => $v) { $this->bias[$k] -= $dy_db[$k] / $dy_db_n[$k] * $learning_rate; } - if ($epoch % 10 == 9 ) { + if ($epoch % 10 == 9) { $this->saveWeights(); } } diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php index b6f096d90..766c4eca9 100644 --- a/src/library/StochasticTermSegmenter.php +++ b/src/library/StochasticTermSegmenter.php @@ -91,6 +91,10 @@ class StochasticTermSegmenter * @var string */ public $dictionary_path; + /** + * Maximum character length of a term + */ + const MAX_TERM_LENGTH = 7; /** * Constructs an instance of this class used for segmenting string with * respect to words in a locale using a probabilistic approach to evaluate @@ -175,7 +179,6 @@ class StochasticTermSegmenter * @param mixed $text_files is a string name or an array of files * that to be trained; words in the files need to be segmented by space * @param string $format currently only support default and CTB - * @return bool true if success */ public function train($text_files, $format = "default") { @@ -206,11 +209,12 @@ class StochasticTermSegmenter } $words = preg_split("/[\s ]+/u", $line); foreach ($words as $word) { - if ($word != "" && !$this->isException($word) - && !$this->notCurrentLang($word)) { + if ($word != "" && !$this->isException($word) && + !$this->notCurrentLang($word)) { 
if (!empty($dictionary[$word])) {
                            $dictionary[$word]++;
-                        } else if (mb_strlen($word) < 7) {
+                        } else if (mb_strlen($word) <
+                            self::MAX_TERM_LENGTH) {
                            $dictionary[$word] = 1;
                        }
                    }
@@ -228,7 +232,7 @@ class StochasticTermSegmenter
         foreach ($dictionary as $key => $value) {
             if (mb_substr($key, 0, 1) != $start_char) {
                 $this->dictionary["dic"][$start_char] =
-                    json_encode($tmp_array[$start_char]);
+                    json_encode($tmp_array[$start_char] ?? []);
                 $tmp_array = [];
                 $start_char = mb_substr($key, 0, 1);
             }
@@ -238,9 +242,7 @@ class StochasticTermSegmenter
         $this->unknown_term_score = $this->getScore(1);
         file_put_contents($this->dictionary_path,
             gzencode(json_encode($this->dictionary), 9));
-        return true;
     }
-
     /**
      * Segments the text in a list of files
      * @param mixed $text_files can be a file name or a list of file names
@@ -283,34 +285,23 @@ class StochasticTermSegmenter
         return true;
     }
     /**
-     * Segments text into words separated by space
+     * Segments text into terms separated by space
      * @param string $text to be segmented
-     * @param bool $return_string return segmented string if true,
-     *      print otherwise
-     * @return mixed segmented words with space or true/false;
+     * @return string segmented terms separated by spaces
      */
-    public function segmentText($text, $return_string = false)
+    public function segmentText($text)
     {
-        if ($return_string) {
-            $result = "";
-        }
-        $sentences = explode("\n", $text);
-        foreach ($sentences as $line) {
-            if (mb_strlen($line)) {
-                $t = $this->segmentSentence($line);
-                if (!empty($t)) {
-                    if ($return_string) {
-                        $result .= join( " ", $t) . "\n";
-                    } else {
-                        echo join( " ", $t) . "\n";
-                    }
+        $segmented_text = "";
+        $lines = explode("\n", $text);
+        foreach ($lines as $line) {
+            if (mb_strlen($line) > 0) {
+                $segmented_line = $this->segmentSentence($line);
+                if (!empty($segmented_line)) {
+                    $segmented_text .= join(" ", $segmented_line) . "\n";
                 }
             }
         }
-        if ($return_string) {
-            return mb_substr($result, 0, -1);
-        }
-        return true;
+        return mb_substr($segmented_text, 0, -1);
    }
    /**
     * Segments a single sentence into an array of words.
@@ -366,12 +357,12 @@ class StochasticTermSegmenter
         $score[-1] = 0;
         for($index = 0; $index < count($characters); $index++) {
             //If not current language
-            if ($this->notCurrentLang($characters[$index])
-                && !$this->isPunctuation($characters[$index])) {
+            if ($this->notCurrentLang($characters[$index]) &&
+                !$this->isPunctuation($characters[$index])) {
                 $current_char = $characters[$index];
                 for($j = $index + 1; $j < count($characters); $j++) {
-                    if ($this->notCurrentLang($current_char . $characters[$j])
-                        && !$this->isPunctuation($characters[$j])) {
+                    if ($this->notCurrentLang($current_char . $characters[$j])&&
+                        !$this->isPunctuation($characters[$j])) {
                         $current_char .= $characters[$j];
                     } else {
                         break;
@@ -385,9 +376,9 @@ class StochasticTermSegmenter
             }
         }
         //If date or number
-        if ($this->isException($characters[$index]) ) {
+        if ($this->isException($characters[$index])) {
            $current_char = $characters[$index];
-            for($j = $index+1; $j < count($characters); $j++) {
+            for($j = $index + 1; $j < count($characters); $j++) {
                if (!$this->isException(
                    $current_char . $characters[$j])) {
                    break;
                }
@@ -405,7 +396,7 @@ class StochasticTermSegmenter
             //If is punctuation, give slightly better score than unknown words
             if ($this->isPunctuation($characters[$index])) {
                 $current_char = $characters[$index];
-                for($j = $index+1; $j<count($characters); $j++) {
+                for($j = $index + 1; $j < count($characters); $j++) {
                     if (!$this->isPunctuation(
                         $current_char . 
$characters[$j])) { break; @@ -513,10 +504,10 @@ class StochasticTermSegmenter * * @param string $term the term to be inserted * @param string $frequency the frequency to be inserted - * @param array & $trie array based trie we want to insert the key value + * @param array &$trie array based trie we want to insert the key value * pair into */ - public function add($term, $frequency, & $trie) + public function add($term, $frequency, &$trie) { $sub_trie = & $trie; for ($i = 0; $i < mb_strlen($term, "utf-8"); $i++) { @@ -527,7 +518,7 @@ class StochasticTermSegmenter if (!isset($sub_trie[$enc_char])) { $sub_trie[$enc_char] = []; } - $sub_trie = & $sub_trie[$enc_char]; + $sub_trie = &$sub_trie[$enc_char]; } // Set end of term marker $sub_trie['$'] = $frequency; diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php index 3548d8eed..2270f7344 100755 --- a/src/locale/zh_CN/resources/Tokenizer.php +++ b/src/locale/zh_CN/resources/Tokenizer.php @@ -241,15 +241,14 @@ class Tokenizer */ public static function segment($pre_segment, $method = "STS") { - switch($method) { + switch($method) + { case "RMM": return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN", ['/^\d+$/', '/^[a-zA-Z]+$/']); - break; case "STS": return self::getStochasticTermSegmenter() - ->segmentText($pre_segment, true); - break; + ->segmentText($pre_segment); } } /** diff --git a/src/locale/zh_CN/resources/nect_weights.txt.gz b/src/locale/zh_CN/resources/nect_weights.txt.gz index 46cdc9d3d..a54e319dd 100755 Binary files a/src/locale/zh_CN/resources/nect_weights.txt.gz and b/src/locale/zh_CN/resources/nect_weights.txt.gz differ diff --git a/tests/ZhTokenizerTest.php b/tests/ZhTokenizerTest.php new file mode 100644 index 000000000..1e6698341 --- /dev/null +++ b/tests/ZhTokenizerTest.php @@ -0,0 +1,82 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2020 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @license https://www.gnu.org/licenses/ GPL3 + * @link https://www.seekquarry.com/ + * @copyright 2009 - 2020 + * @filesource + */ +namespace seekquarry\yioop\tests; + +use seekquarry\yioop\configs as C; +use seekquarry\yioop\library as L; +use seekquarry\yioop\library\UnitTest; + +/** + * Used to test Named Entity Tagging and Part of Speech Tagging for the + * Chinese Language. 
Word segmentation is already tested in
+ * @see seekquarry\yioop\tests\PhraseParserTest
+ */
+class ZhTokenizerTest extends UnitTest
+{
+    /**
+     * Nothing needs to be done to set up these tests
+     */
+    public function setUp()
+    {
+    }
+    /**
+     * Nothing done for unit test tear down
+     */
+    public function tearDown()
+    {
+    }
+    /**
+     * Tests whether Yioop correctly identifies Chinese Named Entities
+     */
+    public function namedEntityTestCase()
+    {
+        $source = "郑振铎 国民党 國家元首 行政權 日本";
+        $expected_tagging = "郑振铎_nr 国民党_nt 日本_ns";
+        $ne_tagger = new L\NamedEntityContextTagger('zh-CN');
+        $output_tagging = $ne_tagger->tag($source);
+        $this->assertEqual($output_tagging, $expected_tagging,
+            "Named Entities Correctly Found in Chinese Source String");
+    }
+    /**
+     * Tests whether Yioop can correctly tag a Chinese sentence
+     */
+    public function partOfSpeechTestCase()
+    {
+        $source = "印度 总统 是 印度 国家元首 和 " .
+            "武装部队 总司令 有 该国 第一 公民 之 称";
+        $expected_tagging = "印度_NR 总统_NN 是_VC 印度_NR 国家元首_NN " .
+            "和_CC 武装部队_NN 总司令_NN 有_VE 该国_NN 第一_VV 公民_NN 之_DEG 称_NN";
+        $pos_tagger = new L\PartOfSpeechContextTagger('zh-CN');
+        $output_tagging = $pos_tagger->tag($source);
+        $this->assertEqual($output_tagging, $expected_tagging,
+            "Parts of Speech Correctly Tagged in Chinese Source String");
+    }
+}
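
Once trained weight files exist for a locale, the taggers and segmenter
exercised by this test can be driven directly. Below is a minimal usage
sketch; the require path is a hypothetical stand-in, and it assumes the
zh_CN resources contain the nect_weights.txt.gz, pos_weights.txt.gz, and
term_weights.txt.gz files produced by the TokenTool commands described
earlier:

<?php
use seekquarry\yioop\library as L;

// Hypothetical bootstrap; any mechanism that autoloads the
// seekquarry\yioop classes will do.
require_once "/path/to/yioop/autoload.php";

// tag() splits its input on whitespace, predicts a tag for each term or
// entity, and joins term and tag with the separator, so output resembles
// 郑振铎_nr 国民党_nt 日本_ns
$ne_tagger = new L\NamedEntityContextTagger("zh-CN");
echo $ne_tagger->tag("郑振铎 国民党 日本") . "\n";

// Part of speech tagging works the same way, e.g., 印度_NR 总统_NN ...
$pos_tagger = new L\PartOfSpeechContextTagger("zh-CN");
echo $pos_tagger->tag("印度 总统 是 印度 国家元首") . "\n";

// segmentText() now returns the segmented string rather than echoing it
$segmenter = new L\StochasticTermSegmenter("zh-CN");
echo $segmenter->segmentText("印度总统是印度国家元首") . "\n";

The sample outputs in the comments mirror the expectations of the test
cases above; actual output depends on the weight files shipped with the
locale.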