diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index 1e8704284..e579f3da2 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -357,12 +357,12 @@ function getTrainingFileNames($command_line_args, $start_index = 4)
}
$file_names = [];
for($i = $start_index + 1; $i < count($command_line_args); $i++) {
- $file_path = glob($file_path . $command_line_args[$i]);
- if (count($file_path) == 0) {
+ $files = glob($file_path . $command_line_args[$i]);
+ if (count($files) == 0) {
echo "error: $file_path{$command_line_args[$i]}: File not found\n";
exit();
}
- $file_names = array_merge($file_names, $file_path);
+ $file_names = array_merge($file_names, $files);
}
if ($command_line_args[$start_index] > 0) {
$file_names = array_slice($file_names, 0, $command_line_args[4]);
diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php
index 6553f3115..af0dd7265 100644
--- a/src/library/NamedEntityContextTagger.php
+++ b/src/library/NamedEntityContextTagger.php
@@ -96,6 +96,10 @@ class NamedEntityContextTagger extends ContextTagger
$text_files = [$text_files];
}
echo "Reading files... \n";
+ if(!$term_callback && !empty($this->tokenizer)
+ && method_exists($this->tokenizer,"normalize")) {
+ $term_callback = [$this->tokenizer,"normalize"];
+ }
// term_tag_sentences[sentence#] = [[words...], [tags...]]
$term_tag_sentences = self::processTexts($text_files,
$term_tag_separator, $term_callback, $tag_callback, true);
@@ -302,11 +306,13 @@ class NamedEntityContextTagger extends ContextTagger
* Predicts named entities that exists in a sentence.
* @param mixed $sentence is an array of segmented words/terms
* or a string that will be split on white space
+ * @param bool $normalize whether to return entities in normalized
+ *      (simplified) form, e.g. 乾隆 -> 干隆
* @return array all predicted named entities together with a tag
* indicating kind of named entity
* ex. [["郑振铎","nr"],["国民党","nt"]]
*/
- public function predict($sentence)
+ public function predict($sentence, $normalize=true)
{
if (empty($sentence)) {
return [];
@@ -320,7 +326,14 @@ class NamedEntityContextTagger extends ContextTagger
$this->loadWeights();
}
$found_entities = [];
- foreach ($sentence_vector as $term) {
+ foreach ($sentence_vector as $unnormalized) {
+ if (!empty($this->tokenizer) &&
+ method_exists($this->tokenizer,
+ "normalize")) {
+ $term=$this->tokenizer::normalize($unnormalized);
+ } else {
+ $term=$unnormalized;
+ }
$characters = preg_split('//u', $term, null,
PREG_SPLIT_NO_EMPTY);
if (empty($characters)) {
@@ -358,6 +371,10 @@ class NamedEntityContextTagger extends ContextTagger
$pre_tag = 'o';
$current_entity = "";
$entities = [];
+ if (!$normalize) {
+ $characters = preg_split('//u', $unnormalized, null,
+ PREG_SPLIT_NO_EMPTY);
+ }
for ($i = 0; $i < count($characters); $i++) {
if ($pre_tag != $tags[$i] && $pre_tag != "o") {
if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) {
@@ -366,11 +383,7 @@ class NamedEntityContextTagger extends ContextTagger
$current_entity = "";
}
if ($tags[$i] != "o") {
- if ($current_entity) {
- $current_entity .= $characters[$i] ?? "";
- } else {
- $current_entity = $characters[$i] ?? "";
- }
+ $current_entity .= $characters[$i] ?? "";
}
$pre_tag = $tags[$i];
}
diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index 98395e9df..80bca2dcc 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -211,6 +211,11 @@ class StochasticTermSegmenter
foreach ($words as $word) {
if ($word != "" && !$this->isException($word) &&
!$this->notCurrentLang($word)) {
+ if (!empty($this->tokenizer) &&
+ method_exists($this->tokenizer,
+ "normalize")) {
+ $word=$this->tokenizer::normalize($word);
+ }
if (!empty($dictionary[$word])) {
$dictionary[$word]++;
} else if (mb_strlen($word) <
@@ -287,9 +292,11 @@ class StochasticTermSegmenter
/**
* Segments text into terms separated by space
* @param string $text to be segmented
+ * @param bool $normalize whether to return the segmented text in
+ *      normalized (simplified) form, e.g. 乾隆 -> 干隆
* @return string segmented terms with space
*/
- public function segmentText($text)
+ public function segmentText($text, $normalize=false)
{
$segmented_text = "";
$lines = explode("\n", $text);
@@ -307,15 +314,17 @@ class StochasticTermSegmenter
* Segments a single sentence into an array of words.
* Must NOT contain any new line characters.
* @param string $sentence is a string without newline to be segmented
+ * @param bool $normalize whether to return the segmented words in
+ *      normalized (simplified) form, e.g. 乾隆 -> 干隆
* @return array of segmented words
*/
- public function segmentSentence($sentence)
+ public function segmentSentence($sentence, $normalize=true)
{
$t = preg_split("/[\s ]+/u", trim($sentence));
if(count($t) > 1) {
$ret = [];
foreach($t as $s) {
- $segments = $this->segmentSentence($s);
+ $segments = $this->segmentSentence($s, $normalize);
if (is_array($segments)) {
$ret = array_merge($ret, $segments);
}
@@ -338,8 +347,13 @@ class StochasticTermSegmenter
if ($cache_size == 0) {
$cache_size = 1;
}
- preg_match_all('/./u', trim($sentence), $matches);
- $characters = $matches[0];
+ $unnormalized = trim($sentence);
+ $normalized = (!empty($this->tokenizer) &&
+ method_exists($this->tokenizer, "normalize")) ?
+ $this->tokenizer::normalize($unnormalized)
+ : $unnormalized;
+ $characters = preg_split('//u', $normalized, null,
+ PREG_SPLIT_NO_EMPTY);
if (!count($characters)) {
return [];
}
@@ -475,6 +489,10 @@ class StochasticTermSegmenter
}
$result = [];
$t = 0;
+ if (!$normalize) {
+ $characters = preg_split('//u', $unnormalized,
+ null, PREG_SPLIT_NO_EMPTY);
+ }
foreach(array_reverse($tmp) as $next_node) {
$result_word = "";
while($t <= $next_node) {
diff --git a/src/locale/zh_CN/resources/zh2HansChar.txt.gz b/src/locale/zh_CN/resources/zh2HansChar.txt.gz
new file mode 100644
index 000000000..7936ab4c3
Binary files /dev/null and b/src/locale/zh_CN/resources/zh2HansChar.txt.gz differ