Adding a Rule Based Hindi Part of Speech Tagger and a Hindi Lexicon

Salil Shenoy [2016-12-11 23:Dec:th]

Adding a Rule Based Hindi Part of Speech Tagger and a Hindi Lexicon

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
src/locale/hi/resources/Tokenizer.php
src/locale/hi/resources/lexicon.txt.gz

diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php
index 737b56eb0..d6ae4b3e3 100755
--- a/src/locale/hi/resources/Tokenizer.php
+++ b/src/locale/hi/resources/Tokenizer.php
@@ -142,5 +142,182 @@ class Tokenizer
         }
         return $word;
     }
+    /**
+     * Returns the given phrase with each term tagged by its part of speech.
+     *
+     * @param string $phrase the input string to be tagged
+     * @param bool $with_tokens if true each output item has the form
+     *      term~pos, otherwise only the simplified tags are output
+     * @return string $tagged_phrase the tagged items separated by spaces
+     */
+    public static function tagPartsOfSpeechPhrase($phrase, $with_tokens = true)
+    {
+        $tagged_tokens = self::tagTokenizePartsofSpeech($phrase);
+        return self::taggedPartOfSpeechTokensToString($tagged_tokens,
+            $with_tokens);
+    }
+    /**
+     * Uses the lexicon to assign an initial tag to each token and then uses
+     * a rule based approach to pick the most likely tag for each token
+     *
+     * @param string $text input phrase which is to be tagged
+     * @return array $result list of ['token' => term, 'tag' => tag] pairs,
+     *      one for each whitespace separated term of $text, in order
+     */
+    public static function tagTokenizePartsofSpeech($text)
+    {
+        static $dictionary = [];
+        if (empty($dictionary)) {
+            $lexicon = C\LOCALE_DIR . "/hi/resources/lexicon.txt.gz";
+            $fh = is_readable($lexicon) ? gzopen($lexicon, 'r') : false;
+            // read every line exactly once and trim each field so that the
+            // line's trailing newline never ends up inside a tag
+            while ($fh !== false && ($line = gzgets($fh)) !== false) {
+                $tags = array_map('trim', explode(',', $line));
+                $word = array_shift($tags);
+                if ($word !== '' && isset($tags[0]) && $tags[0] !== '') {
+                    $dictionary[$word] = $tags;
+                }
+            }
+            if ($fh !== false) {
+                gzclose($fh);
+            }
+        }
+        $noun_cues = ['JJ', 'PRP', 'POST_POS'];
+        $verbs = ['VBZ', 'VBD', 'VBN'];
+        // split on whitespace runs so repeated spaces give no empty tokens
+        $tokens = preg_split('/[ \t\r\n]+/', (string) $text, -1,
+            PREG_SPLIT_NO_EMPTY);

-}
+        $result = [];
+        // first pass: tag each token from the lexicon, defaulting to UNKNOWN
+        foreach ($tokens as $token) {
+            $tag = empty($dictionary[$token]) ? 'UNKNOWN' :
+                $dictionary[$token][0];
+            $result[] = ['token' => $token, 'tag' => $tag];
+        }
+        /**
+         * Second pass: apply the contextual rules to each adjacent pair of
+         * tokens. The first token has no predecessor so we start at index 1.
+         */
+        $num_tokens = count($result);
+        for ($i = 1; $i < $num_tokens; $i++) {
+            $previous = $result[$i - 1]['tag'];
+            $current = $result[$i]['tag'];
+
+            /**
+             * NOUN IDENTIFICATION
+             * RULE 1: If the previous word tagged is an Adjective / Pronoun /
+             * Postposition then the current word is likely to be a noun
+             */
+            if (in_array($previous, $noun_cues, true)) {
+                $current = 'NN';
+            }
+
+            /**
+             * NOUN IDENTIFICATION
+             * RULE 2: If the current word is a verb then the previous word is
+             * likely to be a noun
+             */
+            if (in_array($current, $verbs, true)) {
+                $previous = 'NN';
+            }
+
+            /**
+             * NOUN IDENTIFICATION
+             * RULE 3: If the current tag is a noun then next / previous is
+             * likely to be a noun (not implemented)
+             */
+
+            /**
+             * DEMONSTRATIVE IDENTIFICATION
+             * RULE 1: If the current and previous words are tagged as pronouns
+             * then the previous word is likely to be a demonstrative
+             */
+            if ($current === 'PRP' && $previous === 'PRP') {
+                $previous = 'DEM';
+            }
+
+            /**
+             * DEMONSTRATIVE IDENTIFICATION
+             * RULE 2: If current word is a noun and previous word is a pronoun
+             * then the current word is likely to be demonstrative
+             */
+            if ($current === 'NN' && $previous === 'PRP') {
+                $current = 'DEM';
+            }
+
+            /**
+             * PRONOUN IDENTIFICATION
+             * RULE 1: If the previous word is unknown and current word is a
+             * noun then the previous word is most likely to be a pronoun
+             */
+            if ($previous === 'UNKNOWN' && $current === 'NN') {
+                $previous = 'PRP';
+            }
+
+            /**
+             * NAME IDENTIFICATION
+             * RULE: If we get two words which are untagged then most probably
+             * they form a name and will be tagged as noun
+             */
+            if ($previous === 'UNKNOWN' && $current === 'UNKNOWN') {
+                $current = 'NN';
+                $previous = 'NN';
+            }
+
+            /**
+             * ADJECTIVE IDENTIFICATION
+             * RULE: If the word ends with <tar>, <tam>, <thik> then we tag it
+             * as an Adjective (not implemented)
+             */
+
+            /**
+             * VERB IDENTIFICATION
+             * RULE: If the current word is tagged as Auxiliary verb and
+             * previous word is tagged as Unknown then most likely that the
+             * previous word is a verb
+             */
+            if ($current === 'VAUX' && $previous === 'UNKNOWN') {
+                $previous = 'VB';
+            }
+
+            $result[$i - 1]['tag'] = $previous;
+            $result[$i]['tag'] = $current;
+        }
+
+        return $result;
+    }
+    /**
+     * Converts tagged tokens to a string, simplifying the different part of
+     * speech tags to a common form (for example NNS, NNP and PRP become NN)
+     *
+     * @param array $tagged_tokens list of ['token' => term, 'tag' => tag]
+     * @param bool $with_tokens whether to output token~pos or just pos
+     * @return string $tagged_phrase each item followed by a single space
+     */
+    public static function taggedPartOfSpeechTokensToString($tagged_tokens,
+                                                           $with_tokens = true)
+    {
+        // must start as a string: appending to an array yields "Array..."
+        $tagged_phrase = "";
+        $simplified_parts_of_speech = [
+          "NN" => "NN", "NNS" => "NN", "NNP" => "NN", "NNPS" => "NN",
+          "PRP" => "NN", 'PRP$' => "NN", "WP" => "NN",
+          "VB" => "VB", "VBD" => "VB", "VBN" => "VB", "VBP" => "VB",
+          "VBZ" => "VB",
+          "JJ" => "AJ", "JJR" => "AJ", "JJS" => "AJ",
+          "RB" => "AV", "RBR" => "AV", "RBS" => "AV", "WRB" => "AV"
+        ];
+
+        foreach ($tagged_tokens as $t) {
+            $tag = trim($t['tag']);
+            $tag = (isset($simplified_parts_of_speech[$tag])) ?
+                   $simplified_parts_of_speech[$tag] : $tag;
+            $token = ($with_tokens) ? $t['token'] . "~" : "";
+            $tagged_phrase .= $token . $tag .  " ";
+        }
+
+        return $tagged_phrase;
+    }
+}
\ No newline at end of file
diff --git a/src/locale/hi/resources/lexicon.txt.gz b/src/locale/hi/resources/lexicon.txt.gz
new file mode 100644
index 000000000..5d7c3ba79
Binary files /dev/null and b/src/locale/hi/resources/lexicon.txt.gz differ

ViewGit