TripletExtraction Improved and Lexicon Table Structure modified

SalilShenoy [2017-11-22 04:Nov:nd]

TripletExtraction Improved and Lexicon Table Structure modified

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
src/configs/Config.php
src/configs/Createdb.php
src/library/VersionFunctions.php
src/locale/hi/resources/Tokenizer.php
src/models/ProfileModel.php

diff --git a/src/configs/Config.php b/src/configs/Config.php
index 0a010845a..d42a5e7bd 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -976,3 +976,5 @@ nsconddefine('AD_LOGO','resources/adv-logo.png');
 nsconddefine('SENTENCE_COMPRESSION_ENABLED', false);
 /** Define cipher to be used in AES */
 nsconddefine('AES_256_CBC', 'aes-256-cbc');
+/** The number of rows to be used in bulk insert from Lexicon */
+nsconddefine('NUM_LEX_BULK_INSERTS',100000);
diff --git a/src/configs/Createdb.php b/src/configs/Createdb.php
index efd9bf31f..a2086903e 100755
--- a/src/configs/Createdb.php
+++ b/src/configs/Createdb.php
@@ -264,26 +264,22 @@ foreach ($locales as $locale) {
         foreach ($lines as $line) {
             $line = trim($line, " ");
             $line = explode(" ", $line);
-            if (empty($line[0]) || empty($line[1]))
-                continue;
             $insert_values .= '(\'' . trim($line[0]) . '\',\'' . $locale[0] .
                 '\',\'' . trim($line[1]) . '\'),';
             $count++;
-            if ($count >= 10000) {
+            if ($count >= C\NUM_LEX_BULK_INSERTS) {
                 $insert_values = rtrim($insert_values, ',');
-                $query = 'INSERT INTO LEXICON (WORD, LOCALE, POS) VALUES
-                    {$insert_values}';
+                $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH)
+                 VALUES {$insert_values}";
                 $db->exec($query);
                 $insert_values = "";
                 $count = 0;
-                if ($db->affectedRows() == 0) {
-                    continue;
-                }
             }
         }
+
         if ($count > 0) {
             $insert_values = rtrim($insert_values, ',');
-            $query = "INSERT INTO LEXICON (WORD, LOCALE, POS) VALUES
+            $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH) VALUES
                 {$insert_values}";
             $db->exec($query);
         }
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index 6131a9454..afa228859 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -1564,6 +1564,55 @@ function upgradeDatabaseVersion54(&$db)
  */
 function upgradeDatabaseVersion55(&$db)
 {
-    $db->execute("CREATE TABLE LEXICON(WORD VARCHAR, LOCALE VARCHAR,
-                POS VARCHAR, PRIMARY KEY(WORD, LOCALE))");
+    $db->execute("CREATE TABLE LEXICON(
+                TERM VARCHAR(". C\LONG_NAME_LEN ."),
+                LOCALE VARCHAR(" . C\NAME_LEN . "),
+                PART_OF_SPEECH VARCHAR(16), PRIMARY KEY(TERM, LOCALE))");
+
+    // Retrieve the locales added to the Locale table
+    $sql = "SELECT LOCALE_TAG from LOCALE";
+    $result = $db->execute($sql);
+    if ($result) {
+        $locales = $db->fetchArray($result);
+    }
+    /*
+     * Go through the locales, check if there is a lexicon,
+     * if present then add it to the Lexicon database.
+     * as (term, part_of_speech, locale)
+     */
+    foreach ($locales as $locale) {
+        $folder_name = $locale;
+        if (strstr($locale, "-")) {
+            $locale_name = explode("-", $locale);
+            $folder_name = $locale_name . "_" . $locale_name;
+        }
+        $lexicon_file = C\LOCALE_DIR . "/" . $folder_name .
+            "/resources/lexicon.txt.gz";
+        if (file_exists($lexicon_file)) {
+            $lines = gzfile($lexicon_file);
+            $insert_values = "";
+            $count = 0;
+            foreach ($lines as $line) {
+                $line = trim($line, " ");
+                $line = explode(" ", $line);
+                $insert_values .= '(' . trim($line[0]) . ',' . $locale[0] .
+                    ',' . trim($line[1]) . '),';
+                $count++;
+                if ($count >= C\NUM_LEX_BULK_INSERTS) {
+                    $insert_values = rtrim($insert_values, ',');
+                    $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH)
+                     VALUES {$insert_values}";
+                    $db->exec($query);
+                    $insert_values = "";
+                    $count = 0;
+                }
+            }
+            if ($count > 0) {
+                $insert_values = rtrim($insert_values, ',');
+                $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH)
+                 VALUES {$insert_values}";
+                $db->exec($query);
+            }
+        }
+    }
 }
diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php
index f0df85681..08da7bfce 100755
--- a/src/locale/hi/resources/Tokenizer.php
+++ b/src/locale/hi/resources/Tokenizer.php
@@ -47,12 +47,13 @@ class Tokenizer
      * List of verb-like parts of speech that might appear in lexicon
      * @array
      */
-    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"];
+    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
+        "RB"];
     /**
      * List of noun-like parts of speech that might appear in lexicon
      * @array
      */
-    public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"];
+    public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "DT"];
     /**
      * List of adjective-like parts of speech that might appear in lexicon
      * @array
@@ -62,8 +63,13 @@ class Tokenizer
      * List of postpositional-like parts of speech that might appear in lexicon
      * @array
      */
-    public static $postpositional_phrases = ["inj", "PREP", "proNN", "CONJ",
-        "INT", "particle", "case", "PSP"];
+    public static $postpositional_phrases = ["IN", "inj", "PREP", "proNN",
+        "CONJ", "INT", "particle", "case", "PSP", "direct_DT", "PRP"];
+    /*
+     * List of questions in Hindi
+     */
+    public static $questions = ["क्या", "कब", "कहा", "क्यों", "कौन", "जिसे",
+            "जिसका", "कहाँ", "कहां"];
     /**
      * Any unique identifier corresponding to the component of a triplet which
      * can be answered using a question answer list
@@ -88,40 +94,6 @@ class Tokenizer
     {
         return $pre_segment;
     }
-    /**
-     * Removes the stop words from the page (used for Word Cloud generation)
-     *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
-     */
-    public static function stopwordsRemover($page)
-    {
-        $stop_words = [
-            "पर ", "इन ", "वह ", "यिह ", "वुह ", "जिन्हें", "जिन्हों",
-            "तिन्हें", "तिन्हों", "किन्हों", "किन्हें", "इत्यादि", "द्वारा",
-            "इन्हें", "इन्हों", "उन्हों", "बिलकुल", "निहायत", "ऱ्वासा",
-            "इन्हीं", "उन्हीं", "उन्हें", "इसमें", "जितना", "दुसरा",
-            "कितना", "दबारा", "साबुत", "वग़ैरह", "दूसरे", "कौनसा", "लेकिन",
-            "होता", "करने", "किया", "लिये", "अपने", "नहीं", "दिया", "इसका",
-            "करना", "वाले", "सकते", "इसके", "सबसे", "होने", "करते", "बहुत",
-            "वर्ग", "करें", "होती", "अपनी", "उनके", "कहते", "होते", "करता",
-            "उनकी", "इसकी", "सकता", "रखें", "अपना", "उसके", "जिसे",
-            "तिसे", "किसे", "किसी", "काफ़ी", "पहले", "नीचे", "बाला", "यहाँ",
-            "जैसा", "जैसे", "मानो", "अंदर", "भीतर", "पूरा", "सारा", "होना",
-            "उनको", "वहाँ", "वहीं", "जहाँ", "जीधर","उनका", "इनका", "के",
-            "हैं", "गया", "बनी", "एवं", "हुआ", "साथ", "बाद", "लिए", "कुछ",
-            "कहा", "यदि", "हुई", "इसे", "हुए", "अभी", "सभी", "कुल", "रहा",
-            "रहे", "इसी", "उसे", "जिस", "जिन", "तिस", "तिन", "कौन", "किस",
-            "कोई", "ऐसे", "तरह", "किर", "साभ", "संग", "यही", "बही", "उसी",
-            "फिर", "मगर", "का", "एक", "यह", "से", "को", "इस", "कि", "जो",
-            "कर", "मे", "ने", "तो", "ही", "या", "हो", "था", "तक", "आप", "ये",
-            "थे", "दो", "वे", "थी", "जा", "ना", "उस", "एस", "पे", "उन", "सो",
-            "भी", "और", "घर", "तब", "जब", "अत", "व", "न"
-        ];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '',
-            $page);
-        return $page;
-    }
     /**
      * Computes the stem of an Hindi word
      *
@@ -130,10 +102,6 @@ class Tokenizer
      */
     public static function stem($word)
     {
-        if (in_array($word, self::$no_stem_list)) {
-            return $word;
-        }
-        $word = self::removeSuffix($word);
         return $word;
     }
     /**
@@ -144,30 +112,6 @@ class Tokenizer
      */
     private static function removeSuffix($word)
     {
-        $length = mb_strlen($word);
-        if ($length > 5) {
-            $last_three = mb_substr($word, -3);
-            if (in_array($last_three, ["िया", "ियो"])) {
-                $word = mb_substr($word, 0, -3);
-                return $word;
-            }
-        }
-        if ($length > 4) {
-            $last_two = mb_substr($word, -2);
-            if (in_array($last_two, ["ाए", " ाओ", " ुआ", " ुओ",
-                "ये", " ेन", " ेण", " ीय", "टी", "ार", "ाई"])) {
-                $word = mb_substr($word, 0, -2);
-                return $word;
-            }
-        }
-        if ($length > 3) {
-            $last_one = mb_substr($word, -1);
-             if (in_array($last_one, [" ा", " े", " ी", " ो", "ि ",
-                "अ"])) {
-                $word = mb_substr($word, 0, -1);
-                return $word;
-            }
-        }
         return $word;
     }
     /**
@@ -195,7 +139,7 @@ class Tokenizer
      */
     public static function tagTokenizePartofSpeech($text)
     {
-        $tokens = preg_split("/[\s]+/", $text);
+        $tokens = preg_split("/\s+/u", $text);
         $result = [];
         $tag_list = [];
         $i = 0;
@@ -204,25 +148,25 @@ class Tokenizer
         {
             //Tag the tokens as found in the Lexicon
             $token = trim($token);
-            $current = ['token' => $token, 'tag' => 'UNKNOWN'];
-            $word = $current['token'];
-            $sql = "SELECT * FROM LEXICON WHERE WORD = '{$word}'
-                AND LOCALE = 'hi'";
+            $current = ["token" => $token, "tag" => "UNKNOWN"];
+            $term = $current["token"];
+            $sql = "SELECT PART_OF_SPEECH FROM LEXICON WHERE TERM = '{$term}'
+             AND LOCALE = 'hi'";
             $queryResult = @$model->db->execute($sql);
             if ($queryResult !== false) {
                 $row = $model->db->fetchArray($queryResult);
-                $current['tag'] = $row['POS'];
+                $current["tag"] = $row["PART_OF_SPEECH"];
             }

             if (is_numeric($token)) {
-                $current['tag'] = "NN";
+                $current["tag"] = "NN";
             } else if (strcmp($token,"है") == 0 ||
                         strcmp($token, "हैं") == 0) {
-                $current['tag'] = "VB";
+                $current["tag"] = "VB";
             }

-            if (!isset($current['tag'])) {
-                $current['tag'] = "UNKNOWN";
+            if (!isset($current["tag"])) {
+                $current["tag"] = "UNKNOWN";
             }

             $result[$i] = $current;
@@ -236,65 +180,75 @@ class Tokenizer
     public static function tagUnknownWords($partiallyTaggedText)
     {
         $result = $partiallyTaggedText;
-        $verbs = ['VBZ','VBD','VBN'];
+        $verbs = ["VBZ","VBD","VBN"];
         $length = count($result);
         $previous = $result[0];
         for ($i = 1; $i < $length; $i++)
         {
             $current = $result[$i];
-            $current['token'] = trim($current['token']);
-            $current['tag'] = trim($current['tag']);
-            if ($current['tag'] == "UNKNOWN" || $previous['tag'] == "UNKNOWN")
-            {
-                //RULE 1: If the previous word tagged is a Adjective / Pronoun
-                // Postposition then the current word is likely to be a noun
-                if ($previous['tag'] == 'JJ'     ||
-                    $previous['tag'] == 'PRO_NN' ||
-                    $previous['tag'] == 'POST_POS') {
-                    $current['tag'] = 'NN';
+            $current["token"] = trim($current["token"]);
+            $current["tag"] = trim($current["tag"]);
+            if ($current["tag"] == "UNKNOWN" || $previous["tag"] == "UNKNOWN") {
+                /**
+                 * RULE 1: If the previous word is tagged as an Adjective,
+                 * Pronoun or Postposition, the current word is likely a noun
+                 */
+                if ($previous["tag"] == "JJ"     ||
+                    $previous["tag"] == "PRO_NN" ||
+                    $previous["tag"] == "POST_POS") {
+                    $current["tag"] = "NN";
                     $result[$i] = $current;
                 }
-                //RULE 2: If the current word is a verb then the previous word is
-                //likely to be a noun
-                if (in_array($current['tag'], $verbs)) {
-                    $previous['tag'] = 'NN';
-                    $result[$i] = $previous;
+                /**
+                 * RULE 2: If the current word is a verb then the previous
+                 * word is likely to be a noun
+                 */
+                if (in_array($current["tag"], $verbs)) {
+                    $previous["tag"] = "NN";
+                    $result[$i-1] = $previous;
                 }
-                //PRONOUN IDENTIFICATION
-                //RULE 3: If the previous word is unknown and cuurent word is a
-                //noun then the previous word is most likely to be a pronoun
-                if ($previous['tag'] == 'UNKNOWN' &&
-                    $current['tag'] == 'NN') {
-                    $previous['tag'] = 'PRP';
+                /**
+                 * PRONOUN IDENTIFICATION
+                 * RULE 3: If the previous word is unknown and current word
+                 * is a noun then the previous word is most likely to be a
+                 * pronoun
+                 */
+                if ($previous["tag"] == "UNKNOWN" &&
+                    $current["tag"] == "NN") {
+                    $previous["tag"] = "PRP";
                     $result[$i-1] = $previous;
                 }
-                //VERB IDENTIFICATION
-                //RULE 4: If the current word is tagged as Auxilary verb  and
-                //previous word is tagged as Unknown then most likely that the
-                //previous word is a verb
-                if ($current['tag'] == 'VAUX' &&
-                    $previous['tag'] == 'UNKNOWN') {
-                    $previous['tag'] = 'VB';
+                /**
+                 * VERB IDENTIFICATION
+                 * RULE 4: If the current word is tagged as Auxiliary verb and
+                 * previous word is tagged as Unknown then most likely that
+                 * the previous word is a verb
+                 */
+                if ($current["tag"] == "VAUX" &&
+                    $previous["tag"] == "UNKNOWN") {
+                    $previous["tag"] = "VB";
                     $result[$i-1] = $previous;
-                }
-                //ADJECTIVE IDENTIFIATION
-                //RULE 5: if the currennt word ends with 'तम' or 'इक' or 'िक'
-                //or 'तर' then the word is an adjective
-                if(mb_substr($current['token'], -2, 2) == "इक" ||
-                       mb_substr($current['token'], -2, 2) == "िक" ||
-                       mb_substr($current['token'], -2, 2) == "तर"  ||
-                       mb_substr($current['token'], -2, 2) == "तम") {
-                    $current['tag'] = 'AJ';
+                }
+                /**
+                 * ADJECTIVE IDENTIFICATION
+                 * RULE 5: if the current word ends with "तम" or "इक" or "िक"
+                 * or "तर" then the word is an adjective
+                 */
+                if(mb_substr($current["token"], -2, 2) == "इक" ||
+                    mb_substr($current["token"], -2, 2) == "िक" ||
+                    mb_substr($current["token"], -2, 2) == "तर"  ||
+                    mb_substr($current["token"], -2, 2) == "तम") {
+                    $current["tag"] = "JJ";
                     $result[$i] = $current;
                 }

-                if ($current['tag'] == "UNKNOWN") {
-                    $current['tag'] = 'NN';
+                if ($current["tag"] == "UNKNOWN") {
+                    $current["tag"] = "NN";
                     $result[$i] = $current;
                 }

-                if ($previous['tag'] == "UNKNOWN"){
-                    $previous['tag'] = 'NN';
+                if ($previous["tag"] == "UNKNOWN"){
+                    $previous["tag"] = "NN";
                     $result[$i-1] = $previous;
                 }
             }
@@ -327,10 +281,10 @@ class Tokenizer
           "direct_DT" => "DT",
        ];
         foreach ($tagged_tokens as $t) {
-            $tag = trim($t['tag']);
+            $tag = trim($t["tag"]);
             $tag = (isset($simplified_parts_of_speech[$tag])) ?
                    $simplified_parts_of_speech[$tag] : $tag;
-            $token = ($with_tokens) ? $t['token'] . "~" : "";
+            $token = ($with_tokens) ? $t["token"] . "~" : "";
             $tagged_phrase .= $token . $tag .  " ";
         }
         return $tagged_phrase;
@@ -353,21 +307,21 @@ class Tokenizer
     {
         //Combining multiple noun into one
         $noun_string = "";
-        $cur_node = $tree['cur_node'];
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            (in_array(trim($tagged_phrase[$cur_node]['tag']),
+        $cur_node = $tree["cur_node"];
+        while (isset($tagged_phrase[$cur_node]["tag"]) &&
+            (in_array(trim($tagged_phrase[$cur_node]["tag"]),
             self::$noun_phrases))) {
-            $noun_string .= " " . $tagged_phrase[$cur_node]['token'];
+            $noun_string .= " " . $tagged_phrase[$cur_node]["token"];
             $cur_node++;
         }
         if (!empty($noun_string)) {
             $tree["NN"] = $noun_string;
         }
-        $tree['cur_node'] = $cur_node;
+        $tree["cur_node"] = $cur_node;
         return $tree;
     }
     /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * Takes a part-of-speech tagged phrase and parse-tree with a
      * parse-from position and builds a parse tree for a sequence of
      * postpositional phrases if possible
      *
@@ -379,56 +333,57 @@ class Tokenizer
      * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      */
-    public static function extractPostposition($tagged_phrase, $tree,
+    public static function extractPostpositionPhrase($tagged_phrase, $tree,
         $index = 1)
     {
-        $cur_node = $tree['cur_node'];
-        if (isset($tagged_phrase[$cur_node]['tag']) &&
-            in_array(trim($tagged_phrase[$cur_node]['tag']),
+        $cur_node = $tree["cur_node"];
+        $tree_pp["cur_node"] = $tree["cur_node"];
+        if (isset ($tagged_phrase[$cur_node]["tag"]) &&
+            in_array($tagged_phrase[$cur_node]["tag"],
+            self::$postpositional_phrases)) {
+            $pp_string ="";
+            while (isset($tagged_phrase[$cur_node]["tag"]) &&
+                in_array($tagged_phrase[$cur_node]["tag"],
                 self::$postpositional_phrases)) {
-            $preposition_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                in_array(trim($tagged_phrase[$cur_node]['tag']),
-                self::$postpositional_phrases)) {
-                $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
+                $pp_string .= " " . $tagged_phrase[$cur_node]["token"];
                 $cur_node++;
             }
-            if (!empty($preposition_string)) {
-                $tree["IN_$index"] = $preposition_string;
-            }
-            if (isset($tagged_phrase[$cur_node]['tag']) &&
-                trim($tagged_phrase[$cur_node]['tag']) == "DT") {
-                $tree['DT_$index'] = $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
+            if (!empty($pp_string)) {
+                $tree_pp["IN_$index"] = $pp_string;
             }
             $adjective_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                in_array(trim($tagged_phrase[$cur_node]['tag']),
-                self::$adjective_phrases)) {
-                $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
+            while (isset($tagged_phrase[$cur_node]["tag"]) &&
+                in_array($tagged_phrase[$cur_node]["tag"],
+                    self::$adjective_phrases)) {
+                $adjective_string .= " " .
+                    $tagged_phrase[$cur_node]["token"];
                 $cur_node++;
             }
             if (!empty($adjective_string)) {
-                $tree["JJ_$index"] = $adjective_string;
+                $tree_pp["JJ_$index"] = $adjective_string;
             }
-            $prep_noun_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                in_array(trim($tagged_phrase[$cur_node]['tag']),
-                self::$noun_phrases)) {
-                $prep_noun_string .= " ". $tagged_phrase[$cur_node]['token'];
+            $nn_string = "";
+            while (isset($tagged_phrase[$cur_node]["tag"]) &&
+                in_array($tagged_phrase[$cur_node]["tag"],
+                    self::$noun_phrases)) {
+                $nn_string .= " " . $tagged_phrase[$cur_node]["token"];
                 $cur_node++;
             }
-            if ($prep_noun_string) {
-                $tree["NP_$index"] = $prep_noun_string;
+            if (!empty($nn_string)) {
+                $tree_pp["NN_$index"] = $nn_string;
             }
-            $tree_next = self::extractPostposition($tagged_phrase,
-                    ["cur_node" => $cur_node], $index + 1);
+            $tree_pp["cur_node"] = $cur_node;
+            $tree_next = self::extractPostpositionPhrase($tagged_phrase,
+                $tree_pp, $index+1);
+            $tree_pp = array_merge ($tree_pp, $tree_next);
         }
-        $tree['cur_node'] = $cur_node;
+        $tree["cur_node"] = $tree_pp["cur_node"];
+        unset ($tree_pp["cur_node"]);
+        $tree["POST"] = $tree_pp;
         return $tree;
     }
     /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * Takes a part-of-speech tagged phrase and parse-tree with a
      * parse-from position and builds a parse tree for a noun phrase if possible
      *
      * @param array $tagged_phrase
@@ -438,30 +393,26 @@ class Tokenizer
      *      current parse position in $tagged_phrase]
      * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "JJ" with value an adjective subtree
-     *       "POST" with value a post position subtree
+     *      "JJ" with value an Adjective subtree
+     *      "NN" with value of a Noun Subtree
      */
     public static function extractNounPhrase($tagged_phrase, $tree)
     {
-        $cur_node = $tree['cur_node'];
-        $tree_jj = self::extractAdjective($tagged_phrase,
-            ['cur_node' => $tree['cur_node']]);
-         $tree_nn =self::extractNoun($tagged_phrase,
-            ['cur_node' => $tree_jj['cur_node']]);
-        $tree_post = self::extractPostposition($tagged_phrase,
-            ['cur_node' => $tree_nn['cur_node']]);
-        if ($tree_nn['cur_node'] == $cur_node) {
-            $tree['NP'] = "";
+        $cur_node = $tree["cur_node"];
+        $tree_jj = self::extractAdjective($tagged_phrase,
+            ["cur_node" => $tree["cur_node"]]);
+        $tree_nn = self::extractNoun($tagged_phrase,
+            ["cur_node" => $tree_jj["cur_node"]]);
+        if ($tree_nn["cur_node"] == $cur_node) {
+            $tree["NP"] = "";
         } else {
-            $cur_node = $tree_post['cur_node'];
-            unset($tree_jj['cur_node']);
-            $tree_new_sub['JJ'] = $tree_jj;
-            unset($tree_nn['cur_node']);
-            $tree_new_sub['NN'] = $tree_nn;
-            unset($tree_post['cur_node']);
-            $tree_new_sub['POST'] = $tree_post;
-            $tree_new['cur_node'] = $cur_node;
-            $tree_new['NP'] = $tree_new_sub;
+            $cur_node = $tree_nn["cur_node"];
+            unset($tree_jj["cur_node"]);
+            $tree_new_sub["JJ"] = $tree_jj;
+            unset($tree_nn["cur_node"]);
+            $tree_new_sub["NN"] = $tree_nn;
+            $tree_new["cur_node"] = $cur_node;
+            $tree_new["NP"] = $tree_new_sub;
             return $tree_new;
         }
         return $tree;
@@ -482,24 +433,18 @@ class Tokenizer
      */
     public static function extractVerb($tagged_phrase, $tree)
     {
-        $cur_node = $tree['cur_node'];
-        // skip stuff before verb (intensifiers and adverbs)
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            !in_array(trim($tagged_phrase[$cur_node]['tag']),
-            self::$verb_phrases)) {
-            $cur_node++;
-        }
+        $cur_node = $tree["cur_node"];
         $verb_string = "";
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            in_array(trim($tagged_phrase[$cur_node]['tag']),
+        while (isset($tagged_phrase[$cur_node]["tag"]) &&
+            in_array(trim($tagged_phrase[$cur_node]["tag"]),
             self::$verb_phrases)) {
-            $verb_string .= " " . $tagged_phrase[$cur_node]['token'];
+            $verb_string .= " " . $tagged_phrase[$cur_node]["token"];
             $cur_node++;
         }
         if (!empty($verb_string)) {
             $tree["VB"] = $verb_string;
         }
-        $tree['cur_node'] = $cur_node;
+        $tree["cur_node"] = $cur_node;
         return $tree;
     }
     /**
@@ -518,39 +463,40 @@ class Tokenizer
      */
     public static function extractVerbPhrase($tagged_phrase, $tree)
     {
-        $cur_node = $tree['cur_node'];
-        $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]);
-        if ($tree_vb['cur_node'] == $cur_node) {
+        $cur_node = $tree["cur_node"];
+        $tree_vb = self::extractVerb($tagged_phrase, ["cur_node" => $cur_node]);
+        if ($tree_vb["cur_node"] == $cur_node) {
+            $tree["VP"] = [];
             return $tree;
         }
-        $cur_node = $tree_vb['cur_node'];
-        $preposition_string = "";
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            in_array(trim($tagged_phrase[$cur_node]['tag']),
+        $cur_node = $tree_vb["cur_node"];
+        $postposition_string = "";
+        while (isset($tagged_phrase[$cur_node]["tag"]) &&
+            in_array(trim($tagged_phrase[$cur_node]["tag"]),
                 self::$postpositional_phrases)) {
-            $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
+            $postposition_string .= " ". $tagged_phrase[$cur_node]["token"];
             $cur_node++;
         }
-        if (!empty($preposition_string)) {
-            $tree_vb["IN"] = $preposition_string;
+        if (!empty($postposition_string)) {
+            $tree_vb["IN"] = $postposition_string;
         }
         $tree_np = self::extractNounPhrase($tagged_phrase,
-            ['cur_node' => $cur_node]);
+            ["cur_node" => $cur_node]);
         $tree_new = [];
         $tree_new_sub = [];
-        if ($tree_np['cur_node'] !=  $cur_node) {
-            $cur_node = $tree_np['cur_node'];
-            unset($tree_vb['cur_node'], $tree_np['cur_node']);
-            $tree_new_sub['VB'] = $tree_vb;
-            $tree_new_sub['NP'] = $tree_np['NP'];
-            $tree_new['cur_node'] = $cur_node;
-            $tree_new['VP'] = $tree_new_sub;
+        if ($tree_np["cur_node"] !=  $cur_node) {
+            $cur_node = $tree_np["cur_node"];
+            unset($tree_vb["cur_node"], $tree_np["cur_node"]);
+            $tree_new_sub["VB"] = $tree_vb;
+            $tree_new_sub["NP"] = $tree_np["NP"];
+            $tree_new["cur_node"] = $cur_node;
+            $tree_new["VP"] = $tree_new_sub;
             return $tree_new;
         }
-        unset($tree_vb['cur_node']);
-        $tree_new_sub['VB'] = $tree_vb;
-        $tree_new['cur_node'] = $cur_node;
-        $tree_new['VP'] = $tree_new_sub;
+        unset($tree_vb["cur_node"]);
+        $tree_new_sub["VB"] = $tree_vb;
+        $tree_new["cur_node"] = $cur_node;
+        $tree_new["VP"] = $tree_new_sub;
         return $tree_new;
     }
     /**
@@ -570,17 +516,17 @@ class Tokenizer
     public static function extractAdjective($tagged_phrase, $tree)
     {
         $adjective_string = "";
-        $cur_node = $tree['cur_node'];
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            in_array(trim($tagged_phrase[$cur_node]['tag']),
+        $cur_node = $tree["cur_node"];
+        while (isset($tagged_phrase[$cur_node]["tag"]) &&
+            in_array(trim($tagged_phrase[$cur_node]["tag"]),
             self::$adjective_phrases)) {
-            $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
+            $adjective_string .= " " . $tagged_phrase[$cur_node]["token"];
             $cur_node++;
         }
         if (!empty($adjective_string)) {
             $tree["JJ"] = $adjective_string;
         }
-        $tree['cur_node'] = $cur_node;
+        $tree["cur_node"] = $cur_node;
         return $tree;
     }
     /**
@@ -592,24 +538,23 @@ class Tokenizer
      *     "tag"=> part_of_speech_tag_for_term)
      * @return array used to represent a tree. The array has up to three fields
      *      $tree["cur_node"] index of how far we parsed our$tagged_phrase
-     *      $tree["NP"] contains a subtree for a noun phrase
-     *      $tree["VP"] contains a subtree for a verb phrase
+     *      $tree["NP"] contains a subtree for a subject phrase
+     *      $tree["POST"] contains a subtree for an object phrase
+     *      $tree["VP"] contains a subtree for a predicate phrase
      */
     public static function generatePhraseParseTree($tagged_phrase)
     {
         $tree = [];
-        $tree_np = self::extractNounPhrase($tagged_phrase,['cur_node' => 0]);
-        $tree = ["cur_node" => $tree_np['cur_node']];
+        $tree_np = self::extractNounPhrase($tagged_phrase,["cur_node" => 0]);
+        $tree = ["cur_node" => $tree_np["cur_node"]];
+        $tree_pp = self::extractPostpositionPhrase($tagged_phrase, $tree);
+        $tree["cur_node"] = $tree_pp["cur_node"];
         $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree);
-        $tree['cur_node'] = $tree_vp['cur_node'];
-        if ($tree == $tree_vp) {
-            unset($tree_np['cur_node'], $tree_vp['cur_node']);
-            $tree['NP'] = $tree_np['NP'];
-            return $tree;
-        }
-        unset($tree_np['cur_node'], $tree_vp['cur_node']);
-        $tree['NP'] = $tree_np['NP'];
-        $tree['VP'] = $tree_vp['VP'];
+        $tree["cur_node"] = $tree_vp["cur_node"];
+        unset($tree_np["cur_node"], $tree_pp["cur_node"], $tree_vp["cur_node"]);
+        $tree["NP"] = $tree_np["NP"];
+        $tree["POST"] = $tree_pp["POST"];
+        $tree["VP"] = $tree_vp["VP"];
         return $tree;
     }
     /**
@@ -628,26 +573,29 @@ class Tokenizer
         $triplets_list = [];
         $question_list = [];
         $question_answer_list = [];
-        $triplet_types = ['CONCISE', 'RAW'];
+        $triplet_types = ["CONCISE", "RAW"];
         foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
-            $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase);
+            $sentence = $word_and_phrase;
+            $sentence = preg_replace("/\s+/u", " ", $word_and_phrase);
+            $sentence = trim($sentence);
+            $tagged_phrase = self::tagTokenizePartOfSpeech($sentence);
             $parse_tree = self::generatePhraseParseTree($tagged_phrase);
             $triplets = self::extractTripletsParseTree($parse_tree);
             $extracted_triplets = self::rearrangeTripletsByType($triplets);
             foreach ($triplet_types as $type) {
                 if (!empty($extracted_triplets[$type])) {
                     $triplets = $extracted_triplets[$type];
-                    $questions = $triplets['QUESTION_LIST'];
+                    $questions = $triplets["QUESTION_LIST"];
                     foreach ($questions as $question) {
                         $question_list[$question] = $position_list;
                     }
                     $question_answer_list = array_merge($question_answer_list,
-                        $triplets['QUESTION_ANSWER_LIST']);
+                        $triplets["QUESTION_ANSWER_LIST"]);
                 }
             }
         }
-        $out_triplets['QUESTION_LIST'] = $question_list;
-        $out_triplets['QUESTION_ANSWER_LIST'] = $question_answer_list;
+        $out_triplets["QUESTION_LIST"] = $question_list;
+        $out_triplets["QUESTION_ANSWER_LIST"] = $question_answer_list;
         return $out_triplets;
     }
     /**
@@ -681,19 +629,19 @@ class Tokenizer
     public static function extractSubjectParseTree($tree)
     {
         $subject = [];
-        if (!empty($tree['NP'])) {
-            $subject['CONCISE'] = self::extractDeepestSpeechPartPhrase(
-                $tree['NP'], "NN");
+        if (!empty($tree["NP"])) {
+            $subject["CONCISE"] = self::extractDeepestSpeechPartPhrase(
+                $tree["NP"], "NN");
             $raw_subject = "";
             $it = new \RecursiveIteratorIterator(
-                new \RecursiveArrayIterator($tree['NP']));
+                new \RecursiveArrayIterator($tree["NP"]));
             foreach ($it as $v) {
                 $raw_subject .= $v . " ";
             }
-            $subject['RAW']= $raw_subject;
+            $subject["RAW"]= $raw_subject;
         } else {
-            $subject['CONCISE'] = "";
-            $subject['RAW'] = "";
+            $subject["CONCISE"] = "";
+            $subject["RAW"] = "";
         }
         return $subject;
     }
@@ -709,23 +657,23 @@ class Tokenizer
     public static function extractPredicateParseTree($tree)
     {
         $predicate = [];
-        if (!empty($tree['VP'])) {
-            $tree_vp = $tree['VP'];
-            $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase(
+        if (!empty($tree["VP"])) {
+            $tree_vp = $tree["VP"];
+            $predicate["CONCISE"] = self::extractDeepestSpeechPartPhrase(
                 $tree_vp, "VB");
             $raw_predicate = "";
-            if (!empty($tree_vp['VB'])) {
-                $tree_vb = $tree_vp['VB'];
+            if (!empty($tree_vp["VB"])) {
+                $tree_vb = $tree_vp["VB"];
                 $it = new \RecursiveIteratorIterator(
                     new \RecursiveArrayIterator($tree_vb));
                 foreach ($it as $v) {
                     $raw_predicate .= $v . " ";
                 }
-                $predicate['RAW'] = $raw_predicate;
+                $predicate["RAW"] = $raw_predicate;
             }
         } else {
-            $predicate['CONCISE'] = "";
-            $predicate['RAW'] = "";
+            $predicate["CONCISE"] = "";
+            $predicate["RAW"] = "";
         }
         return $predicate;
     }
@@ -741,26 +689,25 @@ class Tokenizer
     public static function extractObjectParseTree($tree)
     {
         $object = [];
-        if (!empty($tree['VP'])) {
-            $tree_vp = $tree['VP'];
-            if (!empty($tree_vp['NP'])) {
-                $nb = $tree_vp['NP'];
-                $object['CONCISE'] = self::extractDeepestSpeechPartPhrase($nb,
+        if (!empty($tree["POST"])) {
+            $tree_pp = $tree["POST"];
+            if (!empty($tree_pp["NP"])) {
+                $np = $tree_pp["NP"];
+                $object["CONCISE"] = self::extractDeepestSpeechPartPhrase($np,
                     "NN");
-                $raw_object = "";
-                $it = new \RecursiveIteratorIterator(
-                    new \RecursiveArrayIterator($nb));
-                foreach ($it as $v) {
-                    $raw_object .= $v . " ";
-                }
-                $object['RAW'] = $raw_object;
             } else {
-                $object['CONCISE'] = "";
-                $object['RAW'] = "";
+                $object["CONCISE"] = "";
             }
+            $raw_object = "";
+            $it = new \RecursiveIteratorIterator(
+                new \RecursiveArrayIterator($tree_pp));
+            foreach ($it as $v) {
+                $raw_object .= $v . " ";
+            }
+            $object["RAW"] = $raw_object;
         } else {
-            $object['CONCISE'] = "";
-            $object['RAW'] = "";
+            $object["CONCISE"] = "";
+            $object["RAW"] = "";
         }
         return $object;
     }
@@ -771,15 +718,15 @@ class Tokenizer
      * original phrase and RAW to the case where extraneous words have been
      * removed
      *
-     * @param are $tree a parse tree for a sentence
+     * @param  array $parse_tree a parse tree for a sentence
      * @return array triplet array
      */
     public static function extractTripletsParseTree($parse_tree)
     {
         $triplets = [];
-        $triplets['subject'] = self::extractSubjectParseTree($parse_tree);
-        $triplets['object'] = self::extractObjectParseTree($parse_tree);
-        $triplets['predicate'] = self::extractPredicateParseTree($parse_tree);
+        $triplets["subject"] = self::extractSubjectParseTree($parse_tree);
+        $triplets["object"] = self::extractObjectParseTree($parse_tree);
+        $triplets["predicate"] = self::extractPredicateParseTree($parse_tree);
         return $triplets;
     }
     /**
@@ -794,17 +741,17 @@ class Tokenizer
     public static function rearrangeTripletsByType($sub_pred_obj_triplets)
     {
         $processed_triplet = [];
-        $processed_triplets['CONCISE'] =
-            self::extractTripletByType($sub_pred_obj_triplets, 'CONCISE');
-        $processed_triplets['RAW'] =
-            self::extractTripletByType($sub_pred_obj_triplets, 'RAW');
+        $processed_triplets["CONCISE"] =
+            self::extractTripletByType($sub_pred_obj_triplets, "CONCISE");
+        $processed_triplets["RAW"] =
+            self::extractTripletByType($sub_pred_obj_triplets, "RAW");
         return $processed_triplets;
     }
     /**
      * Takes a triplets array with subject, predicate, object fields with
-     * CONCISE, RAW subfields and produces a triplits with $type subfield (where
-     * $type is one of CONCISE and RAW) and with subject, predicate, object,
-     * and QUESTION_ANSWER_LIST subfields
+     * CONCISE, RAW subfields and produces triplets with $type subfield
+     * where $type is one of CONCISE and RAW and with subject, predicate,
+     * object and QUESTION_ANSWER_LIST subfields
      *
      * @param array $sub_pred_obj_triplets  in format described above
      * @param string $type either CONCISE or RAW
@@ -813,27 +760,30 @@ class Tokenizer
     public static function extractTripletByType($sub_pred_obj_triplets, $type)
     {
         $triplets = [];
-        if (!empty($sub_pred_obj_triplets['subject'][$type])
-            && !empty($sub_pred_obj_triplets['predicate'][$type])
-            && !empty($sub_pred_obj_triplets['object'][$type])) {
+        if (!empty($sub_pred_obj_triplets["subject"][$type])
+            && !empty($sub_pred_obj_triplets["predicate"][$type])
+            && !empty($sub_pred_obj_triplets["object"][$type])) {
             $question_answer_triplets = [];
             $question_marker = self::$question_marker;
-            $sentence = [$sub_pred_obj_triplets['subject'][$type],
-                    $sub_pred_obj_triplets['predicate'][$type],
-                    $sub_pred_obj_triplets['object'][$type]];
+            $sentence = [$sub_pred_obj_triplets["subject"][$type],
+                    $sub_pred_obj_triplets["object"][$type],
+                    $sub_pred_obj_triplets["predicate"][$type]];
             $question_triplets = [];
             for ($j = 0; $j < 2; $j++) {
                 for ($i = 0; $i < 3; $i++) {
-                    $q_sentence = $sentence;
-                    $q_sentence[$i] = $question_marker;
-                    $q_sentence_string = implode(" ", $q_sentence);
-                    $question_triplets[] = $q_sentence_string;
-                    $question_answer_triplets[$q_sentence_string] =
-                        preg_replace('/\s+/u', ' ',$sentence[$i]);
+                    $question = $sentence;
+                    $question[$i] = $question_marker;
+                    $question_string = implode(" ", $question);
+                    $question_string = trim($question_string);
+                    $question_string = preg_replace("/\s+/u", " ",
+                        $question_string);
+                    $question_triplets[] = $question_string;
+                    $question_answer_triplets[$question_string] =
+                        preg_replace("/\s+/u", " ", $sentence[$i]);
                 }
             }
-            $triplets['QUESTION_LIST'] = $question_triplets;
-            $triplets['QUESTION_ANSWER_LIST'] = $question_answer_triplets;
+            $triplets["QUESTION_LIST"] = $question_triplets;
+            $triplets["QUESTION_ANSWER_LIST"] = $question_answer_triplets;
         }
         return $triplets;
     }
@@ -845,33 +795,26 @@ class Tokenizer
      * @param int $index current index in statement
      * @return array parsed triplet
      */
-    public static function parseWhoQuestion($tagged_question, $index)
+    public static function parseQuestion($tagged_question, $index)
     {
-        $start_pos = 0;
-        if ($index == 0)
-            $start_pos = $index + 1;
         $generated_questions = [];
-        $question_marker = self::getQuestionMarker();
+        $question_marker = trim(self::getQuestionMarker());
         $triplets = [];
-        $tree_np = self::extractNounPhrase($tagged_question, ["cur_node" =>
-            $start_pos]);
-        $triplets['subject'] = self::extractSubjectParseTree($tree_np);
-        $tree = ["cur_node" => $index];
-        $tree['NP'] = $tagged_question[$index]['token'];
-        $tree_vp = self::extractVerbPhrase($tagged_question, $tree);
-        $triplets['predicate'] = self::extractPredicateParseTree($tree_vp);
-        $triplet_types = ['CONCISE', 'RAW'];
+        $tree_np = self::extractNounPhrase($tagged_question,
+            ["cur_node" => 0]);
+        $triplets["subject"] = self::extractSubjectParseTree($tree_np);
+        $tree_vp = self::extractVerbPhrase($tagged_question,
+            ["cur_node" => $index+1]);
+        $triplets["predicate"] = self::extractPredicateParseTree($tree_vp);
+        $triplet_types = ["CONCISE", "RAW"];
         foreach ($triplet_types as $type) {
-            if (!empty($triplets['subject'][$type])
-                && !empty($triplets['predicate'][$type])) {
-                $generated_questions[$type][] =
-                    trim($triplets['subject'][$type]) .
-                    " " . trim($triplets['predicate'][$type]) . " " .
-                    $question_marker;
-                $generated_questions[$type][] =
-                    trim($triplets['subject'][$type]) .
+            if (!empty($triplets["subject"][$type])
+                && !empty($triplets["predicate"][$type])) {
+                $question = trim (trim($triplets["subject"][$type]) .
                     " " . $question_marker .
-                    " " . trim($triplets['predicate'][$type]);
+                    " " . trim($triplets["predicate"][$type]));
+                $question = preg_replace("/\s+/u", " ", $question);
+                $generated_questions[$type][] = $question;
             }
         }
         return $generated_questions;
@@ -885,10 +828,11 @@ class Tokenizer
      */
     public function isQuestion($phrase)
     {
-        $who_question = "कौन";
         $phrase = trim($phrase);
-        if (mb_strpos($phrase, $who_question) !== false) {
-            return true;
+        for ($i = 0; $i < count(self::$questions); $i++) {
+            if (mb_strpos($phrase, trim(self::$questions[$i])) !== false) {
+                return true;
+            }
         }
         return false;
     }
@@ -902,27 +846,25 @@ class Tokenizer
         return self::$question_marker;
     }
     /**
-     * Takes WH questions and returns the triplet from the question
+     * Takes questions and returns the triplet from the question
      *
      * @param string $question question to parse
      * @return array question triplet
      */
     public static function questionParser($question)
     {
-        /*
-         * Array of 'wh' questions: What, When, Where, Why, Who, Which, Whom,
-         * Whose
-         */
-        $wh_questions = array( "क्या", "कब", "कहा", "क्यों", "कौन", "जिसे",
-            "जिसका", "कहाँ");
+        $question = trim($question);
+        $question = preg_replace("/\s+/u", " ", $question);
         $tagged_question = self::tagTokenizePartOfSpeech($question);
         $index = -1;
         foreach ($tagged_question as $i => $term_pos) {
-            if (in_array($term_pos['token'], $wh_questions)) {
+            if (in_array($term_pos["token"], self::$questions)) {
                 $index = $i;
+                $term_pos["tag"] = "p_wh";
+                $tagged_question[$i] = $term_pos;
                 break;
             }
         }
-        return self::parseWhoQuestion($tagged_question, $index);
+        return self::parseQuestion($tagged_question, $index);
     }
 }
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 3b31e613f..ac3f19fac 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -390,6 +390,10 @@ class ProfileModel extends Model
                 ACCESS_COUNT INTEGER,
                 PRIMARY KEY(ADDRESS, PAGE_NAME))",
             "VERSION" => "CREATE TABLE VERSION(ID INTEGER PRIMARY KEY)",
+            "LEXICON" => "CREATE TABLE LEXICON(
+                TERM VARCHAR(". C\LONG_NAME_LEN ."),
+                LOCALE VARCHAR(" . C\NAME_LEN . "),
+                PART_OF_SPEECH VARCHAR(16), PRIMARY KEY(TERM, LOCALE))",
             ];
     }
     /**

ViewGit