viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php index 62963016d..3a9768d28 100644 --- a/src/library/NamedEntityContextTagger.php +++ b/src/library/NamedEntityContextTagger.php @@ -312,81 +312,75 @@ class NamedEntityContextTagger extends ContextTagger return []; } if (is_array($sentence)) { - $terms = $sentence; + $sentence_vector = $sentence; } else { - $terms = preg_split("/[\s]+/u", $sentence); - } - if (count($terms) > 1) { - $results = []; - foreach ($terms as $term) { - $entities = $this->predict($term); - if (!empty($entities)) { - $results = array_merge($results, $entities); - } - } - return $results; - } - $terms = preg_split('//u', $terms[0], null, - PREG_SPLIT_NO_EMPTY); - if (!count($terms)) { - return []; + $sentence_vector = preg_split("/[\s]+/u", $sentence); } if (!$this->word_feature) { $this->loadWeights(); } - $results = []; - for($i = 0; $i < count($terms); $i++) { - $term = $terms[$i]; - $score = []; - foreach($this->tag_set as $possible_tag => $tag_index) { - $score[$possible_tag] = 0; - for ($j = -2; $j <= 2; $j++) { - $k = $this->getIndex($i + $j, $terms); - if (isset($this->word_feature[$k])) { - $score[$possible_tag] += - $this->getW($k, $j, $tag_index); + $found_entities = []; + foreach ($sentence_vector as $term) { + $characters = preg_split('//u', $term, null, + PREG_SPLIT_NO_EMPTY); + if (empty($characters)) { + continue; + } + $tags = []; + for($i = 0; $i < count($characters); $i++) { + $character = $characters[$i]; + $score = []; + foreach($this->tag_set as $possible_tag => $tag_index) { + $score[$possible_tag] = 0; + for ($j = -2; $j <= 2; $j++) { + $k = $this->getIndex($i + $j, $characters); + if (isset($this->word_feature[$k])) { + $score[$possible_tag] += + $this->getW($k, $j, $tag_index); + } } + if ($i == 0) { + $tf1 = "start"; + $tf2 = "start-start"; + } else if ($i == 1) { + $tf1 = $tags[$i - 1]; + $tf2 = "start-" . $tags[$i - 1]; + } else { + $tf1 = $tags[$i - 1]; + $tf2 = $tags[$i - 2] . "-" . $tags[$i - 1]; + } + $score[$possible_tag] += $this->getT($tf1, $tag_index); + $score[$possible_tag] += $this->getT($tf2, $tag_index); + $score[$possible_tag] += $this->getB($tag_index); } - if ($i == 0) { - $tf1 = "start"; - $tf2 = "start-start"; - } else if ($i == 1) { - $tf1 = $results[$i - 1]; - $tf2 = "start-" . $results[$i - 1]; - } else { - $tf1 = $results[$i - 1]; - $tf2 = $results[$i - 2] . "-" . $results[$i - 1]; - } - $score[$possible_tag] += $this->getT($tf1, $tag_index); - $score[$possible_tag] += $this->getT($tf2, $tag_index); - $score[$possible_tag] += $this->getB($tag_index); + $tags[] = array_keys($score, max($score))[0]; } - $results[] = array_keys($score, max($score))[0]; - } - $pre_tag = 'o'; - $current_entity = ""; - $ret = []; - for ($i = 0; $i < count($terms); $i++) { - if ($pre_tag != $results[$i] && $pre_tag != "o") { - if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) { - $ret[] = [$current_entity, $pre_tag]; + $pre_tag = 'o'; + $current_entity = ""; + $entities = []; + for ($i = 0; $i < count($characters); $i++) { + if ($pre_tag != $tags[$i] && $pre_tag != "o") { + if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) { + $entities[] = [$current_entity, $pre_tag]; + } + $current_entity = ""; } - $current_entity = ""; - } - if ($results[$i] != "o") { - if ($current_entity) { - $current_entity .= $terms[$i]; - } else { - $current_entity = $terms[$i]; + if ($tags[$i] != "o") { + if ($current_entity) { + $current_entity .= $characters[$i]; + } else { + $current_entity = $characters[$i]; + } } + $pre_tag = $tags[$i]; } - $pre_tag = $results[$i]; - } - if ($pre_tag != "o") { - if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) { - $ret[] = [$current_entity, $pre_tag]; + if ($pre_tag != "o") { + if (mb_strlen($current_entity) < self::MAX_ENTITY_LENGTH) { + $entities[] = [$current_entity, $pre_tag]; + } } + $found_entities = array_merge($found_entities, $entities); } - return $ret; + return $found_entities; } } diff --git a/src/locale/zh_CN/resources/nect_weights.txt.gz b/src/locale/zh_CN/resources/nect_weights.txt.gz index 8e6fea731..46cdc9d3d 100755 Binary files a/src/locale/zh_CN/resources/nect_weights.txt.gz and b/src/locale/zh_CN/resources/nect_weights.txt.gz differ