viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
/**
 * Takes a phrase as input and returns a string in which each term is
 * tagged with a part of speech.
 *
 * @param string $phrase the input string to be tagged
 * @param bool $with_tokens whether each tag in the output is prefixed by
 *      its term (term~pos) or only the tags are output
 * @return string $tagged_phrase space-separated entries of the form
 *      term~pos (or just pos when $with_tokens is false)
 */
public static function tagPartsOfSpeechPhrase($phrase, $with_tokens = true)
{
    // Must name the tagging method exactly as it is declared below.
    $tagged_tokens = self::tagTokenizePartsofSpeech($phrase);
    return self::taggedPartOfSpeechTokensToString($tagged_tokens,
        $with_tokens);
}
/**
 * Uses the lexicon to assign a tag to each token and then uses a rule
 * based approach to assign the most likely tag to each token.
 *
 * The lexicon is read once per process (it is cached in a static
 * variable); each lexicon line is expected to be comma separated, with
 * the term first followed by its candidate tags.
 *
 * @param string $text input phrase which is to be tagged
 * @return array $result list, in input order, of arrays of the form
 *      ['token' => term, 'tag' => part of speech]
 */
public static function tagTokenizePartsofSpeech($text)
{
    static $dictionary = [];
    if (empty($dictionary)) {
        $fh = gzopen(C\LOCALE_DIR . "/hi/resources/lexicon.txt.gz", 'r');
        // If the lexicon cannot be opened we carry on with an empty
        // dictionary: every token then starts out as UNKNOWN.
        if ($fh !== false) {
            // Read exactly one line per iteration so no entry is skipped.
            while (($line = gzgets($fh)) !== false) {
                // Trim every field so the last tag does not keep the
                // line's trailing newline (the rules below compare tags
                // against exact names).
                $tags = array_map('trim', explode(',', $line));
                $term = array_shift($tags);
                if ($term !== '') {
                    $dictionary[$term] = $tags;
                }
            }
            gzclose($fh);
        }
    }
    // Split on runs of whitespace and drop empty pieces, so repeated
    // spaces or an empty phrase do not produce empty tokens.
    $tokens = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        // preg_split fails when $text is not valid UTF-8; fall back to a
        // plain split on spaces.
        $tokens = array_values(array_filter(explode(' ', $text),
            function ($piece) {
                return $piece !== '';
            }));
    }
    $verbs = ['VBZ', 'VBD', 'VBN'];
    $result = [];
    $previous = null;
    foreach ($tokens as $i => $token) {
        /*
         * Tag the token as found in the lexicon (first listed tag),
         * else use the default tag UNKNOWN
         */
        $current = ['token' => $token, 'tag' => 'UNKNOWN'];
        if (!empty($dictionary[$token])) {
            $current['tag'] = $dictionary[$token][0];
        }
        // Every contextual rule looks at the preceding token, so none of
        // them applies to the first token of the phrase.
        if ($previous !== null) {
            /*
             * NOUN IDENTIFICATION
             * RULE 1: If the previous word tagged is an Adjective /
             * Pronoun / Postposition then the current word is likely to
             * be a noun
             */
            if (in_array($previous['tag'], ['JJ', 'PRP', 'POST_POS'],
                true)) {
                $current['tag'] = 'NN';
            }
            /*
             * NOUN IDENTIFICATION
             * RULE 2: If the current word is a verb then the previous
             * word is likely to be a noun
             */
            if (in_array($current['tag'], $verbs, true)) {
                $previous['tag'] = 'NN';
            }
            /*
             * NOUN IDENTIFICATION
             * RULE 3: If the current tag is a noun then next / previous
             * is likely to be a noun (not implemented)
             */
            /*
             * DEMONSTRATIVE IDENTIFICATION
             * RULE 1: If the current and previous words are tagged as
             * pronouns then the previous word is likely to be a
             * demonstrative
             */
            if ($current['tag'] === 'PRP' && $previous['tag'] === 'PRP') {
                $previous['tag'] = 'DEM';
            }
            /*
             * DEMONSTRATIVE IDENTIFICATION
             * RULE 2: If current word is a noun and previous word is a
             * pronoun then the current word is likely to be a
             * demonstrative
             * NOTE(review): together with noun rule 1, every token that
             * follows a PRP ends up tagged DEM -- confirm this is
             * intended.
             */
            if ($current['tag'] === 'NN' && $previous['tag'] === 'PRP') {
                $current['tag'] = 'DEM';
            }
            /*
             * PRONOUN IDENTIFICATION
             * RULE 1: If the previous word is unknown and current word
             * is a noun then the previous word is most likely to be a
             * pronoun
             */
            if ($previous['tag'] === 'UNKNOWN' &&
                $current['tag'] === 'NN') {
                $previous['tag'] = 'PRP';
            }
            /*
             * NAME IDENTIFICATION
             * RULE: If we get two words which are untagged then most
             * probably they form a name and will be tagged as nouns
             */
            if ($previous['tag'] === 'UNKNOWN' &&
                $current['tag'] === 'UNKNOWN') {
                $current['tag'] = 'NN';
                $previous['tag'] = 'NN';
            }
            /*
             * ADJECTIVE IDENTIFICATION
             * RULE: If the word ends with <tar>, <tam>, <thik> then we
             * tag it as an Adjective (not implemented)
             */
            /*
             * VERB IDENTIFICATION
             * RULE: If the current word is tagged as Auxiliary verb and
             * previous word is tagged as Unknown then most likely the
             * previous word is a verb
             */
            if ($current['tag'] === 'VAUX' &&
                $previous['tag'] === 'UNKNOWN') {
                $previous['tag'] = 'VB';
            }
            // Store whatever the rules decided about the preceding token.
            $result[$i - 1] = $previous;
        }
        $result[$i] = $current;
        $previous = $current;
    }
    return $result;
}
/**
 * This method is used to simplify the different part of speech tags to a
 * common form and to serialize the tagged tokens as a string.
 *
 * @param array $tagged_tokens list of arrays of the form
 *      ['token' => term, 'tag' => part of speech]
 * @param bool $with_tokens whether each tag in the output is prefixed by
 *      its term and a ~
 * @return string $tagged_phrase entries of the form token~pos (or just
 *      pos when $with_tokens is false), each followed by a single space
 */
public static function taggedPartOfSpeechTokensToString($tagged_tokens,
    $with_tokens = true)
{
    $simplified_parts_of_speech = [
        "NN" => "NN", "NNS" => "NN", "NNP" => "NN", "NNPS" => "NN",
        "PRP" => "NN", 'PRP$' => "NN", "WP" => "NN",
        "VB" => "VB", "VBD" => "VB", "VBN" => "VB", "VBP" => "VB",
        "VBZ" => "VB",
        "JJ" => "AJ", "JJR" => "AJ", "JJS" => "AJ",
        "RB" => "AV", "RBR" => "AV", "RBS" => "AV", "WRB" => "AV"
    ];
    // Accumulate into a string: the result is built with .= below.
    $tagged_phrase = "";
    foreach ($tagged_tokens as $t) {
        $tag = trim($t['tag']);
        // Tags without a simplified form are passed through unchanged.
        $tag = (isset($simplified_parts_of_speech[$tag])) ?
            $simplified_parts_of_speech[$tag] : $tag;
        $token = ($with_tokens) ? $t['token'] . "~" : "";
        $tagged_phrase .= $token . $tag . " ";
    }
    return $tagged_phrase;
}