diff --git a/src/configs/Config.php b/src/configs/Config.php
index 0a010845a..d42a5e7bd 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -976,3 +976,5 @@ nsconddefine('AD_LOGO','resources/adv-logo.png');
nsconddefine('SENTENCE_COMPRESSION_ENABLED', false);
/** Define cipher to be used in AES */
nsconddefine('AES_256_CBC', 'aes-256-cbc');
+/** Maximum number of lexicon rows sent in a single bulk INSERT statement */
+nsconddefine('NUM_LEX_BULK_INSERTS',100000);
diff --git a/src/configs/Createdb.php b/src/configs/Createdb.php
index efd9bf31f..a2086903e 100755
--- a/src/configs/Createdb.php
+++ b/src/configs/Createdb.php
@@ -264,26 +264,25 @@ foreach ($locales as $locale) {
foreach ($lines as $line) {
$line = trim($line, " ");
$line = explode(" ", $line);
+ // blank or one-token lines have no term/part-of-speech pair to insert
if (empty($line[0]) || empty($line[1]))
continue;
$insert_values .= '(\'' . trim($line[0]) . '\',\'' . $locale[0] .
'\',\'' . trim($line[1]) . '\'),';
$count++;
- if ($count >= 10000) {
+ if ($count >= C\NUM_LEX_BULK_INSERTS) {
$insert_values = rtrim($insert_values, ',');
- $query = 'INSERT INTO LEXICON (WORD, LOCALE, POS) VALUES
- {$insert_values}';
+ $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH)
+ VALUES {$insert_values}";
$db->exec($query);
$insert_values = "";
$count = 0;
- if ($db->affectedRows() == 0) {
- continue;
- }
}
}
+
if ($count > 0) {
$insert_values = rtrim($insert_values, ',');
- $query = "INSERT INTO LEXICON (WORD, LOCALE, POS) VALUES
+ $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH) VALUES
{$insert_values}";
$db->exec($query);
}
diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php
index 6131a9454..afa228859 100644
--- a/src/library/VersionFunctions.php
+++ b/src/library/VersionFunctions.php
@@ -1564,6 +1564,67 @@ function upgradeDatabaseVersion54(&$db)
*/
function upgradeDatabaseVersion55(&$db)
{
- $db->execute("CREATE TABLE LEXICON(WORD VARCHAR, LOCALE VARCHAR,
- POS VARCHAR, PRIMARY KEY(WORD, LOCALE))");
+    $db->execute("CREATE TABLE LEXICON(
+        TERM VARCHAR(". C\LONG_NAME_LEN ."),
+        LOCALE VARCHAR(" . C\NAME_LEN . "),
+        PART_OF_SPEECH VARCHAR(16), PRIMARY KEY(TERM, LOCALE))");
+    /*
+     * Gather the tag of every row in LOCALE. fetchArray() yields a single
+     * row per call, so keep calling it until it stops returning rows
+     * (presumably false/empty at the end of the result set -- confirm).
+     */
+    $locale_tags = [];
+    $result = $db->execute("SELECT LOCALE_TAG FROM LOCALE");
+    if ($result) {
+        while ($row = $db->fetchArray($result)) {
+            if (!empty($row["LOCALE_TAG"])) {
+                $locale_tags[] = $row["LOCALE_TAG"];
+            }
+        }
+    }
+    $insert_sql = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH) " .
+        "VALUES ";
+    /*
+     * Go through the locales, check if there is a lexicon file and, if
+     * present, bulk load it into LEXICON as
+     * (term, locale, part_of_speech) rows
+     */
+    foreach ($locale_tags as $locale_tag) {
+        /*
+         * The folder name is the tag with "-" replaced by "_".
+         * NOTE(review): assumes folders look like en_US -- confirm
+         */
+        $folder_name = str_replace("-", "_", $locale_tag);
+        $lexicon_file = C\LOCALE_DIR . "/" . $folder_name .
+            "/resources/lexicon.txt.gz";
+        if (!file_exists($lexicon_file)) {
+            continue;
+        }
+        $lines = gzfile($lexicon_file);
+        if ($lines === false) {
+            continue;
+        }
+        // string literals are SQL-quoted by doubling embedded quotes
+        $quoted_locale = "'" . str_replace("'", "''", $locale_tag) . "'";
+        $rows = [];
+        foreach ($lines as $line) {
+            $parts = explode(" ", trim($line));
+            $term = trim($parts[0]);
+            $part_of_speech = isset($parts[1]) ? trim($parts[1]) : "";
+            // blank or one-token lines carry no usable entry
+            if ($term === "" || $part_of_speech === "") {
+                continue;
+            }
+            $rows[] = "('" . str_replace("'", "''", $term) . "'," .
+                $quoted_locale . ",'" .
+                str_replace("'", "''", $part_of_speech) . "')";
+            if (count($rows) >= C\NUM_LEX_BULK_INSERTS) {
+                $db->execute($insert_sql . implode(",", $rows));
+                $rows = [];
+            }
+        }
+        if (!empty($rows)) {
+            $db->execute($insert_sql . implode(",", $rows));
+        }
+    }
}
diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php
index f0df85681..08da7bfce 100755
--- a/src/locale/hi/resources/Tokenizer.php
+++ b/src/locale/hi/resources/Tokenizer.php
@@ -47,12 +47,13 @@ class Tokenizer
* List of verb-like parts of speech that might appear in lexicon
* @array
*/
- public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"];
+ public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
+ "RB"];
/**
* List of noun-like parts of speech that might appear in lexicon
* @array
*/
- public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"];
+ public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "DT"];
/**
* List of adjective-like parts of speech that might appear in lexicon
* @array
@@ -62,8 +63,13 @@ class Tokenizer
* List of postpositional-like parts of speech that might appear in lexicon
* @array
*/
- public static $postpositional_phrases = ["inj", "PREP", "proNN", "CONJ",
- "INT", "particle", "case", "PSP"];
+ public static $postpositional_phrases = ["IN", "inj", "PREP", "proNN",
+ "CONJ", "INT", "particle", "case", "PSP", "direct_DT", "PRP"];
+ /**
+ * List of Hindi question words used to detect and parse questions
+ */
+ public static $questions = ["क्या", "कब", "कहा", "क्यों", "कौन", "जिसे",
+ "जिसका", "कहाँ", "कहां"];
/**
* Any unique identifier corresponding to the component of a triplet which
* can be answered using a question answer list
@@ -88,40 +94,6 @@ class Tokenizer
{
return $pre_segment;
}
- /**
- * Removes the stop words from the page (used for Word Cloud generation)
- *
- * @param string $page the page to remove stop words from.
- * @return string $page with no stop words
- */
- public static function stopwordsRemover($page)
- {
- $stop_words = [
- "पर ", "इन ", "वह ", "यिह ", "वुह ", "जिन्हें", "जिन्हों",
- "तिन्हें", "तिन्हों", "किन्हों", "किन्हें", "इत्यादि", "द्वारा",
- "इन्हें", "इन्हों", "उन्हों", "बिलकुल", "निहायत", "ऱ्वासा",
- "इन्हीं", "उन्हीं", "उन्हें", "इसमें", "जितना", "दुसरा",
- "कितना", "दबारा", "साबुत", "वग़ैरह", "दूसरे", "कौनसा", "लेकिन",
- "होता", "करने", "किया", "लिये", "अपने", "नहीं", "दिया", "इसका",
- "करना", "वाले", "सकते", "इसके", "सबसे", "होने", "करते", "बहुत",
- "वर्ग", "करें", "होती", "अपनी", "उनके", "कहते", "होते", "करता",
- "उनकी", "इसकी", "सकता", "रखें", "अपना", "उसके", "जिसे",
- "तिसे", "किसे", "किसी", "काफ़ी", "पहले", "नीचे", "बाला", "यहाँ",
- "जैसा", "जैसे", "मानो", "अंदर", "भीतर", "पूरा", "सारा", "होना",
- "उनको", "वहाँ", "वहीं", "जहाँ", "जीधर","उनका", "इनका", "के",
- "हैं", "गया", "बनी", "एवं", "हुआ", "साथ", "बाद", "लिए", "कुछ",
- "कहा", "यदि", "हुई", "इसे", "हुए", "अभी", "सभी", "कुल", "रहा",
- "रहे", "इसी", "उसे", "जिस", "जिन", "तिस", "तिन", "कौन", "किस",
- "कोई", "ऐसे", "तरह", "किर", "साभ", "संग", "यही", "बही", "उसी",
- "फिर", "मगर", "का", "एक", "यह", "से", "को", "इस", "कि", "जो",
- "कर", "मे", "ने", "तो", "ही", "या", "हो", "था", "तक", "आप", "ये",
- "थे", "दो", "वे", "थी", "जा", "ना", "उस", "एस", "पे", "उन", "सो",
- "भी", "और", "घर", "तब", "जब", "अत", "व", "न"
- ];
- $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '',
- $page);
- return $page;
- }
/**
* Computes the stem of an Hindi word
*
@@ -130,10 +102,6 @@ class Tokenizer
*/
public static function stem($word)
{
- if (in_array($word, self::$no_stem_list)) {
- return $word;
- }
- $word = self::removeSuffix($word);
return $word;
}
/**
@@ -144,30 +112,6 @@ class Tokenizer
*/
private static function removeSuffix($word)
{
- $length = mb_strlen($word);
- if ($length > 5) {
- $last_three = mb_substr($word, -3);
- if (in_array($last_three, ["िया", "ियो"])) {
- $word = mb_substr($word, 0, -3);
- return $word;
- }
- }
- if ($length > 4) {
- $last_two = mb_substr($word, -2);
- if (in_array($last_two, ["ाए", " ाओ", " ुआ", " ुओ",
- "ये", " ेन", " ेण", " ीय", "टी", "ार", "ाई"])) {
- $word = mb_substr($word, 0, -2);
- return $word;
- }
- }
- if ($length > 3) {
- $last_one = mb_substr($word, -1);
- if (in_array($last_one, [" ा", " े", " ी", " ो", "ि ",
- "अ"])) {
- $word = mb_substr($word, 0, -1);
- return $word;
- }
- }
return $word;
}
/**
@@ -195,7 +139,7 @@ class Tokenizer
*/
public static function tagTokenizePartofSpeech($text)
{
- $tokens = preg_split("/[\s]+/", $text);
+ $tokens = preg_split("/\s+/u", $text);
$result = [];
$tag_list = [];
$i = 0;
@@ -204,25 +148,29 @@ class Tokenizer
{
//Tag the tokens as found in the Lexicon
$token = trim($token);
- $current = ['token' => $token, 'tag' => 'UNKNOWN'];
- $word = $current['token'];
- $sql = "SELECT * FROM LEXICON WHERE WORD = '{$word}'
- AND LOCALE = 'hi'";
+ $current = ["token" => $token, "tag" => "UNKNOWN"];
+ // tokens come from arbitrary text, so double any single quote
+ // before splicing the term into the SQL string literal
+ $term = str_replace("'", "''", $current["token"]);
+ $sql = "SELECT PART_OF_SPEECH FROM LEXICON WHERE TERM = '{$term}'
+ AND LOCALE = 'hi'";
$queryResult = @$model->db->execute($sql);
if ($queryResult !== false) {
$row = $model->db->fetchArray($queryResult);
- $current['tag'] = $row['POS'];
+ // a term with no lexicon row keeps the UNKNOWN tag
+ $current["tag"] = isset($row["PART_OF_SPEECH"]) ?
+ $row["PART_OF_SPEECH"] : "UNKNOWN";
}
if (is_numeric($token)) {
- $current['tag'] = "NN";
+ $current["tag"] = "NN";
} else if (strcmp($token,"है") == 0 ||
strcmp($token, "हैं") == 0) {
- $current['tag'] = "VB";
+ $current["tag"] = "VB";
}
- if (!isset($current['tag'])) {
- $current['tag'] = "UNKNOWN";
+ if (!isset($current["tag"])) {
+ $current["tag"] = "UNKNOWN";
}
$result[$i] = $current;
@@ -236,65 +180,75 @@ class Tokenizer
public static function tagUnknownWords($partiallyTaggedText)
{
$result = $partiallyTaggedText;
- $verbs = ['VBZ','VBD','VBN'];
+ $verbs = ["VBZ","VBD","VBN"];
$length = count($result);
$previous = $result[0];
for ($i = 1; $i < $length; $i++)
{
$current = $result[$i];
- $current['token'] = trim($current['token']);
- $current['tag'] = trim($current['tag']);
- if ($current['tag'] == "UNKNOWN" || $previous['tag'] == "UNKNOWN")
- {
- //RULE 1: If the previous word tagged is a Adjective / Pronoun
- // Postposition then the current word is likely to be a noun
- if ($previous['tag'] == 'JJ' ||
- $previous['tag'] == 'PRO_NN' ||
- $previous['tag'] == 'POST_POS') {
- $current['tag'] = 'NN';
+ $current["token"] = trim($current["token"]);
+ $current["tag"] = trim($current["tag"]);
+ if ($current["tag"] == "UNKNOWN" || $previous["tag"] == "UNKNOWN") {
+ /**
+ * RULE 1: If the previous word is tagged as an adjective, pronoun or
+ * postposition then the current word is likely to be a noun
+ */
+ if ($previous["tag"] == "JJ" ||
+ $previous["tag"] == "PRO_NN" ||
+ $previous["tag"] == "POST_POS") {
+ $current["tag"] = "NN";
$result[$i] = $current;
}
- //RULE 2: If the current word is a verb then the previous word is
- //likely to be a noun
- if (in_array($current['tag'], $verbs)) {
- $previous['tag'] = 'NN';
- $result[$i] = $previous;
+ /**
+ * RULE 2: If the current word is a verb then the previous
+ * word is likely to be a noun
+ */
+ if (in_array($current["tag"], $verbs)) {
+ $previous["tag"] = "NN";
+ $result[$i-1] = $previous;
}
- //PRONOUN IDENTIFICATION
- //RULE 3: If the previous word is unknown and cuurent word is a
- //noun then the previous word is most likely to be a pronoun
- if ($previous['tag'] == 'UNKNOWN' &&
- $current['tag'] == 'NN') {
- $previous['tag'] = 'PRP';
+ /**
+ * PRONOUN IDENTIFICATION
+ * RULE 3: If the previous word is unknown and the current word
+ * is a noun then the previous word is most likely to be a
+ * pronoun
+ */
+ if ($previous["tag"] == "UNKNOWN" &&
+ $current["tag"] == "NN") {
+ $previous["tag"] = "PRP";
$result[$i-1] = $previous;
}
- //VERB IDENTIFICATION
- //RULE 4: If the current word is tagged as Auxilary verb and
- //previous word is tagged as Unknown then most likely that the
- //previous word is a verb
- if ($current['tag'] == 'VAUX' &&
- $previous['tag'] == 'UNKNOWN') {
- $previous['tag'] = 'VB';
+ /**
+ * VERB IDENTIFICATION
+ * RULE 4: If the current word is tagged as an auxiliary verb and
+ * the previous word is tagged as unknown then the previous word
+ * is most likely a verb
+ */
+ if ($current["tag"] == "VAUX" &&
+ $previous["tag"] == "UNKNOWN") {
+ $previous["tag"] = "VB";
$result[$i-1] = $previous;
- }
- //ADJECTIVE IDENTIFIATION
- //RULE 5: if the currennt word ends with 'तम' or 'इक' or 'िक'
- //or 'तर' then the word is an adjective
- if(mb_substr($current['token'], -2, 2) == "इक" ||
- mb_substr($current['token'], -2, 2) == "िक" ||
- mb_substr($current['token'], -2, 2) == "तर" ||
- mb_substr($current['token'], -2, 2) == "तम") {
- $current['tag'] = 'AJ';
+ }
+ /**
+ * ADJECTIVE IDENTIFICATION
+ * RULE 5: if the current word ends with "तम" or "इक" or "िक"
+ * or "तर" then the word is an adjective
+ */
+ if(mb_substr($current["token"], -2, 2) == "इक" ||
+ mb_substr($current["token"], -2, 2) == "िक" ||
+ mb_substr($current["token"], -2, 2) == "तर" ||
+ mb_substr($current["token"], -2, 2) == "तम") {
+ $current["tag"] = "JJ";
$result[$i] = $current;
}
- if ($current['tag'] == "UNKNOWN") {
- $current['tag'] = 'NN';
+ if ($current["tag"] == "UNKNOWN") {
+ $current["tag"] = "NN";
$result[$i] = $current;
}
- if ($previous['tag'] == "UNKNOWN"){
- $previous['tag'] = 'NN';
+ if ($previous["tag"] == "UNKNOWN"){
+ $previous["tag"] = "NN";
$result[$i-1] = $previous;
}
}
@@ -327,10 +281,10 @@ class Tokenizer
"direct_DT" => "DT",
];
foreach ($tagged_tokens as $t) {
- $tag = trim($t['tag']);
+ $tag = trim($t["tag"]);
$tag = (isset($simplified_parts_of_speech[$tag])) ?
$simplified_parts_of_speech[$tag] : $tag;
- $token = ($with_tokens) ? $t['token'] . "~" : "";
+ $token = ($with_tokens) ? $t["token"] . "~" : "";
$tagged_phrase .= $token . $tag . " ";
}
return $tagged_phrase;
@@ -353,21 +307,21 @@ class Tokenizer
{
//Combining multiple noun into one
$noun_string = "";
- $cur_node = $tree['cur_node'];
- while (isset($tagged_phrase[$cur_node]['tag']) &&
- (in_array(trim($tagged_phrase[$cur_node]['tag']),
+ $cur_node = $tree["cur_node"];
+ while (isset($tagged_phrase[$cur_node]["tag"]) &&
+ (in_array(trim($tagged_phrase[$cur_node]["tag"]),
self::$noun_phrases))) {
- $noun_string .= " " . $tagged_phrase[$cur_node]['token'];
+ $noun_string .= " " . $tagged_phrase[$cur_node]["token"];
$cur_node++;
}
if (!empty($noun_string)) {
$tree["NN"] = $noun_string;
}
- $tree['cur_node'] = $cur_node;
+ $tree["cur_node"] = $cur_node;
return $tree;
}
/**
- * Takes a part-of-speech tagged phrase and pre-tree with a
+ * Takes a part-of-speech tagged phrase and parse-tree with a
* parse-from position and builds a parse tree for a sequence of
* postpositional phrases if possible
*
@@ -379,56 +333,57 @@ class Tokenizer
* @return array has fields
* "cur_node" index of how far we parsed $tagged_phrase
*/
- public static function extractPostposition($tagged_phrase, $tree,
+ public static function extractPostpositionPhrase($tagged_phrase, $tree,
$index = 1)
{
- $cur_node = $tree['cur_node'];
- if (isset($tagged_phrase[$cur_node]['tag']) &&
- in_array(trim($tagged_phrase[$cur_node]['tag']),
+ $cur_node = $tree["cur_node"];
+ $tree_pp["cur_node"] = $tree["cur_node"];
+ if (isset ($tagged_phrase[$cur_node]["tag"]) &&
+ in_array($tagged_phrase[$cur_node]["tag"],
+ self::$postpositional_phrases)) {
+ $pp_string ="";
+ while (isset($tagged_phrase[$cur_node]["tag"]) &&
+ in_array($tagged_phrase[$cur_node]["tag"],
self::$postpositional_phrases)) {
- $preposition_string = "";
- while (isset($tagged_phrase[$cur_node]['tag']) &&
- in_array(trim($tagged_phrase[$cur_node]['tag']),
- self::$postpositional_phrases)) {
- $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
+ $pp_string .= " " . $tagged_phrase[$cur_node]["token"];
$cur_node++;
}
- if (!empty($preposition_string)) {
- $tree["IN_$index"] = $preposition_string;
- }
- if (isset($tagged_phrase[$cur_node]['tag']) &&
- trim($tagged_phrase[$cur_node]['tag']) == "DT") {
- $tree['DT_$index'] = $tagged_phrase[$cur_node]['token'];
- $cur_node++;
+ if (!empty($pp_string)) {
+ $tree_pp["IN_$index"] = $pp_string;
}
$adjective_string = "";
- while (isset($tagged_phrase[$cur_node]['tag']) &&
- in_array(trim($tagged_phrase[$cur_node]['tag']),
- self::$adjective_phrases)) {
- $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
+ while (isset($tagged_phrase[$cur_node]["tag"]) &&
+ in_array($tagged_phrase[$cur_node]["tag"],
+ self::$adjective_phrases)) {
+ $adjective_string .= " " .
+ $tagged_phrase[$cur_node]["token"];
$cur_node++;
}
if (!empty($adjective_string)) {
- $tree["JJ_$index"] = $adjective_string;
+ $tree_pp["JJ_$index"] = $adjective_string;
}
- $prep_noun_string = "";
- while (isset($tagged_phrase[$cur_node]['tag']) &&
- in_array(trim($tagged_phrase[$cur_node]['tag']),
- self::$noun_phrases)) {
- $prep_noun_string .= " ". $tagged_phrase[$cur_node]['token'];
+ $nn_string = "";
+ while (isset($tagged_phrase[$cur_node]["tag"]) &&
+ in_array($tagged_phrase[$cur_node]["tag"],
+ self::$noun_phrases)) {
+ $nn_string .= " " . $tagged_phrase[$cur_node]["token"];
$cur_node++;
}
- if ($prep_noun_string) {
- $tree["NP_$index"] = $prep_noun_string;
+ if (!empty($nn_string)) {
+ $tree_pp["NN_$index"] = $nn_string;
}
- $tree_next = self::extractPostposition($tagged_phrase,
- ["cur_node" => $cur_node], $index + 1);
+ $tree_pp["cur_node"] = $cur_node;
+ $tree_next = self::extractPostpositionPhrase($tagged_phrase,
+ $tree_pp, $index+1);
+ $tree_pp = array_merge ($tree_pp, $tree_next);
}
- $tree['cur_node'] = $cur_node;
+ $tree["cur_node"] = $tree_pp["cur_node"];
+ unset ($tree_pp["cur_node"]);
+ $tree["POST"] = $tree_pp;
return $tree;
}
/**
- * Takes a part-of-speech tagged phrase and pre-tree with a
+ * Takes a part-of-speech tagged phrase and parse-tree with a
* parse-from position and builds a parse tree for a noun phrase if possible
*
* @param array $tagged_phrase
@@ -438,30 +393,26 @@ class Tokenizer
* current parse position in $tagged_phrase]
* @return array has fields
* "cur_node" index of how far we parsed $tagged_phrase
- * "JJ" with value an adjective subtree
- * "POST" with value a post position subtree
+ * "JJ" with value an Adjective subtree
+ * "NN" with value of a Noun Subtree
*/
public static function extractNounPhrase($tagged_phrase, $tree)
{
- $cur_node = $tree['cur_node'];
- $tree_jj = self::extractAdjective($tagged_phrase,
- ['cur_node' => $tree['cur_node']]);
- $tree_nn =self::extractNoun($tagged_phrase,
- ['cur_node' => $tree_jj['cur_node']]);
- $tree_post = self::extractPostposition($tagged_phrase,
- ['cur_node' => $tree_nn['cur_node']]);
- if ($tree_nn['cur_node'] == $cur_node) {
- $tree['NP'] = "";
+ $cur_node = $tree["cur_node"];
+ $tree_jj = self::extractAdjective($tagged_phrase,
+ ["cur_node" => $tree["cur_node"]]);
+ $tree_nn = self::extractNoun($tagged_phrase,
+ ["cur_node" => $tree_jj["cur_node"]]);
+ if ($tree_nn["cur_node"] == $cur_node) {
+ $tree["NP"] = "";
} else {
- $cur_node = $tree_post['cur_node'];
- unset($tree_jj['cur_node']);
- $tree_new_sub['JJ'] = $tree_jj;
- unset($tree_nn['cur_node']);
- $tree_new_sub['NN'] = $tree_nn;
- unset($tree_post['cur_node']);
- $tree_new_sub['POST'] = $tree_post;
- $tree_new['cur_node'] = $cur_node;
- $tree_new['NP'] = $tree_new_sub;
+ $cur_node = $tree_nn["cur_node"];
+ unset($tree_jj["cur_node"]);
+ $tree_new_sub["JJ"] = $tree_jj;
+ unset($tree_nn["cur_node"]);
+ $tree_new_sub["NN"] = $tree_nn;
+ $tree_new["cur_node"] = $cur_node;
+ $tree_new["NP"] = $tree_new_sub;
return $tree_new;
}
return $tree;
@@ -482,24 +433,18 @@ class Tokenizer
*/
public static function extractVerb($tagged_phrase, $tree)
{
- $cur_node = $tree['cur_node'];
- // skip stuff before verb (intensifiers and adverbs)
- while (isset($tagged_phrase[$cur_node]['tag']) &&
- !in_array(trim($tagged_phrase[$cur_node]['tag']),
- self::$verb_phrases)) {
- $cur_node++;
- }
+ $cur_node = $tree["cur_node"];
$verb_string = "";
- while (isset($tagged_phrase[$cur_node]['tag']) &&
- in_array(trim($tagged_phrase[$cur_node]['tag']),
+ while (isset($tagged_phrase[$cur_node]["tag"]) &&
+ in_array(trim($tagged_phrase[$cur_node]["tag"]),
self::$verb_phrases)) {
- $verb_string .= " " . $tagged_phrase[$cur_node]['token'];
+ $verb_string .= " " . $tagged_phrase[$cur_node]["token"];
$cur_node++;
}
if (!empty($verb_string)) {
$tree["VB"] = $verb_string;
}
- $tree['cur_node'] = $cur_node;
+ $tree["cur_node"] = $cur_node;
return $tree;
}
/**
@@ -518,39 +463,40 @@ class Tokenizer
*/
public static function extractVerbPhrase($tagged_phrase, $tree)
{
- $cur_node = $tree['cur_node'];
- $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]);
- if ($tree_vb['cur_node'] == $cur_node) {
+ $cur_node = $tree["cur_node"];
+ $tree_vb = self::extractVerb($tagged_phrase, ["cur_node" => $cur_node]);
+ if ($tree_vb["cur_node"] == $cur_node) {
+ $tree["VP"] = [];
return $tree;
}
- $cur_node = $tree_vb['cur_node'];
- $preposition_string = "";
- while (isset($tagged_phrase[$cur_node]['tag']) &&
- in_array(trim($tagged_phrase[$cur_node]['tag']),
+ $cur_node = $tree_vb["cur_node"];
+ $postposition_string = "";
+ while (isset($tagged_phrase[$cur_node]["tag"]) &&
+ in_array(trim($tagged_phrase[$cur_node]["tag"]),
self::$postpositional_phrases)) {
- $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
+ $postposition_string .= " ". $tagged_phrase[$cur_node]["token"];
$cur_node++;
}
- if (!empty($preposition_string)) {
- $tree_vb["IN"] = $preposition_string;
+ if (!empty($postposition_string)) {
+ $tree_vb["IN"] = $postposition_string;
}
$tree_np = self::extractNounPhrase($tagged_phrase,
- ['cur_node' => $cur_node]);
+ ["cur_node" => $cur_node]);
$tree_new = [];
$tree_new_sub = [];
- if ($tree_np['cur_node'] != $cur_node) {
- $cur_node = $tree_np['cur_node'];
- unset($tree_vb['cur_node'], $tree_np['cur_node']);
- $tree_new_sub['VB'] = $tree_vb;
- $tree_new_sub['NP'] = $tree_np['NP'];
- $tree_new['cur_node'] = $cur_node;
- $tree_new['VP'] = $tree_new_sub;
+ if ($tree_np["cur_node"] != $cur_node) {
+ $cur_node = $tree_np["cur_node"];
+ unset($tree_vb["cur_node"], $tree_np["cur_node"]);
+ $tree_new_sub["VB"] = $tree_vb;
+ $tree_new_sub["NP"] = $tree_np["NP"];
+ $tree_new["cur_node"] = $cur_node;
+ $tree_new["VP"] = $tree_new_sub;
return $tree_new;
}
- unset($tree_vb['cur_node']);
- $tree_new_sub['VB'] = $tree_vb;
- $tree_new['cur_node'] = $cur_node;
- $tree_new['VP'] = $tree_new_sub;
+ unset($tree_vb["cur_node"]);
+ $tree_new_sub["VB"] = $tree_vb;
+ $tree_new["cur_node"] = $cur_node;
+ $tree_new["VP"] = $tree_new_sub;
return $tree_new;
}
/**
@@ -570,17 +516,17 @@ class Tokenizer
public static function extractAdjective($tagged_phrase, $tree)
{
$adjective_string = "";
- $cur_node = $tree['cur_node'];
- while (isset($tagged_phrase[$cur_node]['tag']) &&
- in_array(trim($tagged_phrase[$cur_node]['tag']),
+ $cur_node = $tree["cur_node"];
+ while (isset($tagged_phrase[$cur_node]["tag"]) &&
+ in_array(trim($tagged_phrase[$cur_node]["tag"]),
self::$adjective_phrases)) {
- $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
+ $adjective_string .= " " . $tagged_phrase[$cur_node]["token"];
$cur_node++;
}
if (!empty($adjective_string)) {
$tree["JJ"] = $adjective_string;
}
- $tree['cur_node'] = $cur_node;
+ $tree["cur_node"] = $cur_node;
return $tree;
}
/**
@@ -592,24 +538,23 @@ class Tokenizer
* "tag"=> part_of_speech_tag_for_term)
* @return array used to represent a tree. The array has up to three fields
* $tree["cur_node"] index of how far we parsed our$tagged_phrase
- * $tree["NP"] contains a subtree for a noun phrase
- * $tree["VP"] contains a subtree for a verb phrase
+ * $tree["NP"] contains a subtree for a subject phrase
+ * $tree["POST"] contains a subtree for a object phrase
+ * $tree["VP"] contains a subtree for a predicate phrase
*/
public static function generatePhraseParseTree($tagged_phrase)
{
$tree = [];
- $tree_np = self::extractNounPhrase($tagged_phrase,['cur_node' => 0]);
- $tree = ["cur_node" => $tree_np['cur_node']];
+ $tree_np = self::extractNounPhrase($tagged_phrase,["cur_node" => 0]);
+ $tree = ["cur_node" => $tree_np["cur_node"]];
+ $tree_pp = self::extractPostpositionPhrase($tagged_phrase, $tree);
+ $tree["cur_node"] = $tree_pp["cur_node"];
$tree_vp = self::extractVerbPhrase($tagged_phrase, $tree);
- $tree['cur_node'] = $tree_vp['cur_node'];
- if ($tree == $tree_vp) {
- unset($tree_np['cur_node'], $tree_vp['cur_node']);
- $tree['NP'] = $tree_np['NP'];
- return $tree;
- }
- unset($tree_np['cur_node'], $tree_vp['cur_node']);
- $tree['NP'] = $tree_np['NP'];
- $tree['VP'] = $tree_vp['VP'];
+ $tree["cur_node"] = $tree_vp["cur_node"];
+ unset($tree_np["cur_node"], $tree_pp["cur_node"], $tree_vp["cur_node"]);
+ $tree["NP"] = $tree_np["NP"];
+ $tree["POST"] = $tree_pp["POST"];
+ $tree["VP"] = $tree_vp["VP"];
return $tree;
}
/**
@@ -628,26 +573,29 @@ class Tokenizer
$triplets_list = [];
$question_list = [];
$question_answer_list = [];
- $triplet_types = ['CONCISE', 'RAW'];
+ $triplet_types = ["CONCISE", "RAW"];
foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
- $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase);
+ $sentence = $word_and_phrase;
+ $sentence = preg_replace("/\s+/u", " ", $word_and_phrase);
+ $sentence = trim($sentence);
+ $tagged_phrase = self::tagTokenizePartOfSpeech($sentence);
$parse_tree = self::generatePhraseParseTree($tagged_phrase);
$triplets = self::extractTripletsParseTree($parse_tree);
$extracted_triplets = self::rearrangeTripletsByType($triplets);
foreach ($triplet_types as $type) {
if (!empty($extracted_triplets[$type])) {
$triplets = $extracted_triplets[$type];
- $questions = $triplets['QUESTION_LIST'];
+ $questions = $triplets["QUESTION_LIST"];
foreach ($questions as $question) {
$question_list[$question] = $position_list;
}
$question_answer_list = array_merge($question_answer_list,
- $triplets['QUESTION_ANSWER_LIST']);
+ $triplets["QUESTION_ANSWER_LIST"]);
}
}
}
- $out_triplets['QUESTION_LIST'] = $question_list;
- $out_triplets['QUESTION_ANSWER_LIST'] = $question_answer_list;
+ $out_triplets["QUESTION_LIST"] = $question_list;
+ $out_triplets["QUESTION_ANSWER_LIST"] = $question_answer_list;
return $out_triplets;
}
/**
@@ -681,19 +629,19 @@ class Tokenizer
public static function extractSubjectParseTree($tree)
{
$subject = [];
- if (!empty($tree['NP'])) {
- $subject['CONCISE'] = self::extractDeepestSpeechPartPhrase(
- $tree['NP'], "NN");
+ if (!empty($tree["NP"])) {
+ $subject["CONCISE"] = self::extractDeepestSpeechPartPhrase(
+ $tree["NP"], "NN");
$raw_subject = "";
$it = new \RecursiveIteratorIterator(
- new \RecursiveArrayIterator($tree['NP']));
+ new \RecursiveArrayIterator($tree["NP"]));
foreach ($it as $v) {
$raw_subject .= $v . " ";
}
- $subject['RAW']= $raw_subject;
+ $subject["RAW"]= $raw_subject;
} else {
- $subject['CONCISE'] = "";
- $subject['RAW'] = "";
+ $subject["CONCISE"] = "";
+ $subject["RAW"] = "";
}
return $subject;
}
@@ -709,23 +657,23 @@ class Tokenizer
public static function extractPredicateParseTree($tree)
{
$predicate = [];
- if (!empty($tree['VP'])) {
- $tree_vp = $tree['VP'];
- $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase(
+ if (!empty($tree["VP"])) {
+ $tree_vp = $tree["VP"];
+ $predicate["CONCISE"] = self::extractDeepestSpeechPartPhrase(
$tree_vp, "VB");
$raw_predicate = "";
- if (!empty($tree_vp['VB'])) {
- $tree_vb = $tree_vp['VB'];
+ if (!empty($tree_vp["VB"])) {
+ $tree_vb = $tree_vp["VB"];
$it = new \RecursiveIteratorIterator(
new \RecursiveArrayIterator($tree_vb));
foreach ($it as $v) {
$raw_predicate .= $v . " ";
}
- $predicate['RAW'] = $raw_predicate;
+ $predicate["RAW"] = $raw_predicate;
}
} else {
- $predicate['CONCISE'] = "";
- $predicate['RAW'] = "";
+ $predicate["CONCISE"] = "";
+ $predicate["RAW"] = "";
}
return $predicate;
}
@@ -741,26 +689,36 @@ class Tokenizer
public static function extractObjectParseTree($tree)
{
$object = [];
- if (!empty($tree['VP'])) {
- $tree_vp = $tree['VP'];
- if (!empty($tree_vp['NP'])) {
- $nb = $tree_vp['NP'];
- $object['CONCISE'] = self::extractDeepestSpeechPartPhrase($nb,
+ if (!empty($tree["POST"])) {
+ $tree_pp = $tree["POST"];
+ if (!empty($tree_pp["NP"])) {
+ $np = $tree_pp["NP"];
+ $object["CONCISE"] = self::extractDeepestSpeechPartPhrase($np,
"NN");
- $raw_object = "";
- $it = new \RecursiveIteratorIterator(
- new \RecursiveArrayIterator($nb));
- foreach ($it as $v) {
- $raw_object .= $v . " ";
- }
- $object['RAW'] = $raw_object;
} else {
- $object['CONCISE'] = "";
- $object['RAW'] = "";
+ $object["CONCISE"] = "";
+ /*
+ * extractPostpositionPhrase() files nouns under "NN_<index>"
+ * keys and never sets "NP", so fall back to the first such
+ * noun string at the top level of the POST subtree
+ */
+ foreach ($tree_pp as $key => $value) {
+ if (is_string($value) && strpos($key, "NN_") === 0) {
+ $object["CONCISE"] = $value;
+ break;
+ }
+ }
}
+ $raw_object = "";
+ $it = new \RecursiveIteratorIterator(
+ new \RecursiveArrayIterator($tree_pp));
+ foreach ($it as $v) {
+ $raw_object .= $v . " ";
+ }
+ $object["RAW"] = $raw_object;
} else {
- $object['CONCISE'] = "";
- $object['RAW'] = "";
+ $object["CONCISE"] = "";
+ $object["RAW"] = "";
}
return $object;
}
@@ -771,15 +718,15 @@ class Tokenizer
* original phrase and RAW to the case where extraneous words have been
* removed
*
- * @param are $tree a parse tree for a sentence
+ * @param array $parse_tree a parse tree for a sentence
* @return array triplet array
*/
public static function extractTripletsParseTree($parse_tree)
{
$triplets = [];
- $triplets['subject'] = self::extractSubjectParseTree($parse_tree);
- $triplets['object'] = self::extractObjectParseTree($parse_tree);
- $triplets['predicate'] = self::extractPredicateParseTree($parse_tree);
+ $triplets["subject"] = self::extractSubjectParseTree($parse_tree);
+ $triplets["object"] = self::extractObjectParseTree($parse_tree);
+ $triplets["predicate"] = self::extractPredicateParseTree($parse_tree);
return $triplets;
}
/**
@@ -794,17 +741,17 @@ class Tokenizer
public static function rearrangeTripletsByType($sub_pred_obj_triplets)
{
$processed_triplet = [];
- $processed_triplets['CONCISE'] =
- self::extractTripletByType($sub_pred_obj_triplets, 'CONCISE');
- $processed_triplets['RAW'] =
- self::extractTripletByType($sub_pred_obj_triplets, 'RAW');
+ $processed_triplets["CONCISE"] =
+ self::extractTripletByType($sub_pred_obj_triplets, "CONCISE");
+ $processed_triplets["RAW"] =
+ self::extractTripletByType($sub_pred_obj_triplets, "RAW");
return $processed_triplets;
}
/**
* Takes a triplets array with subject, predicate, object fields with
- * CONCISE, RAW subfields and produces a triplits with $type subfield (where
- * $type is one of CONCISE and RAW) and with subject, predicate, object,
- * and QUESTION_ANSWER_LIST subfields
+ * CONCISE, RAW subfields and produces triplets with $type subfield
+ * where $type is one of CONCISE and RAW and with subject, predicate,
+ * object and QUESTION_ANSWER_LIST subfields
*
* @param array $sub_pred_obj_triplets in format described above
* @param string $type either CONCISE or RAW
@@ -813,27 +760,30 @@ class Tokenizer
public static function extractTripletByType($sub_pred_obj_triplets, $type)
{
$triplets = [];
- if (!empty($sub_pred_obj_triplets['subject'][$type])
- && !empty($sub_pred_obj_triplets['predicate'][$type])
- && !empty($sub_pred_obj_triplets['object'][$type])) {
+ if (!empty($sub_pred_obj_triplets["subject"][$type])
+ && !empty($sub_pred_obj_triplets["predicate"][$type])
+ && !empty($sub_pred_obj_triplets["object"][$type])) {
$question_answer_triplets = [];
$question_marker = self::$question_marker;
- $sentence = [$sub_pred_obj_triplets['subject'][$type],
- $sub_pred_obj_triplets['predicate'][$type],
- $sub_pred_obj_triplets['object'][$type]];
+ $sentence = [$sub_pred_obj_triplets["subject"][$type],
+ $sub_pred_obj_triplets["object"][$type],
+ $sub_pred_obj_triplets["predicate"][$type]];
$question_triplets = [];
for ($j = 0; $j < 2; $j++) {
for ($i = 0; $i < 3; $i++) {
- $q_sentence = $sentence;
- $q_sentence[$i] = $question_marker;
- $q_sentence_string = implode(" ", $q_sentence);
- $question_triplets[] = $q_sentence_string;
- $question_answer_triplets[$q_sentence_string] =
- preg_replace('/\s+/u', ' ',$sentence[$i]);
+ $question = $sentence;
+ $question[$i] = $question_marker;
+ $question_string = implode(" ", $question);
+ $question_string = trim($question_string);
+ $question_string = preg_replace("/\s+/u", " ",
+ $question_string);
+ $question_triplets[] = $question_string;
+ $question_answer_triplets[$question_string] =
+ preg_replace("/\s+/u", " ", $sentence[$i]);
}
}
- $triplets['QUESTION_LIST'] = $question_triplets;
- $triplets['QUESTION_ANSWER_LIST'] = $question_answer_triplets;
+ $triplets["QUESTION_LIST"] = $question_triplets;
+ $triplets["QUESTION_ANSWER_LIST"] = $question_answer_triplets;
}
return $triplets;
}
@@ -845,33 +795,26 @@ class Tokenizer
* @param int $index current index in statement
* @return array parsed triplet
*/
- public static function parseWhoQuestion($tagged_question, $index)
+ public static function parseQuestion($tagged_question, $index)
{
- $start_pos = 0;
- if ($index == 0)
- $start_pos = $index + 1;
$generated_questions = [];
- $question_marker = self::getQuestionMarker();
+ $question_marker = trim(self::getQuestionMarker());
$triplets = [];
- $tree_np = self::extractNounPhrase($tagged_question, ["cur_node" =>
- $start_pos]);
- $triplets['subject'] = self::extractSubjectParseTree($tree_np);
- $tree = ["cur_node" => $index];
- $tree['NP'] = $tagged_question[$index]['token'];
- $tree_vp = self::extractVerbPhrase($tagged_question, $tree);
- $triplets['predicate'] = self::extractPredicateParseTree($tree_vp);
- $triplet_types = ['CONCISE', 'RAW'];
+ $tree_np = self::extractNounPhrase($tagged_question,
+ ["cur_node" => 0]);
+ $triplets["subject"] = self::extractSubjectParseTree($tree_np);
+ $tree_vp = self::extractVerbPhrase($tagged_question,
+ ["cur_node" => $index+1]);
+ $triplets["predicate"] = self::extractPredicateParseTree($tree_vp);
+ $triplet_types = ["CONCISE", "RAW"];
foreach ($triplet_types as $type) {
- if (!empty($triplets['subject'][$type])
- && !empty($triplets['predicate'][$type])) {
- $generated_questions[$type][] =
- trim($triplets['subject'][$type]) .
- " " . trim($triplets['predicate'][$type]) . " " .
- $question_marker;
- $generated_questions[$type][] =
- trim($triplets['subject'][$type]) .
+ if (!empty($triplets["subject"][$type])
+ && !empty($triplets["predicate"][$type])) {
+ $question = trim (trim($triplets["subject"][$type]) .
" " . $question_marker .
- " " . trim($triplets['predicate'][$type]);
+ " " . trim($triplets["predicate"][$type]));
+ $question = preg_replace("/\s+/u", " ", $question);
+ $generated_questions[$type][] = $question;
}
}
return $generated_questions;
@@ -885,10 +828,11 @@ class Tokenizer
*/
public function isQuestion($phrase)
{
- $who_question = "कौन";
$phrase = trim($phrase);
- if (mb_strpos($phrase, $who_question) !== false) {
- return true;
+ foreach (self::$questions as $question_word) {
+ if (mb_strpos($phrase, trim($question_word)) !== false) {
+ return true;
+ }
}
return false;
}
@@ -902,27 +846,25 @@ class Tokenizer
return self::$question_marker;
}
/**
- * Takes WH questions and returns the triplet from the question
+ * Takes questions and returns the triplet from the question
*
* @param string $question question to parse
* @return array question triplet
*/
public static function questionParser($question)
{
- /*
- * Array of 'wh' questions: What, When, Where, Why, Who, Which, Whom,
- * Whose
- */
- $wh_questions = array( "क्या", "कब", "कहा", "क्यों", "कौन", "जिसे",
- "जिसका", "कहाँ");
+ $question = trim($question);
+ $question = preg_replace("/\s+/u", " ", $question);
$tagged_question = self::tagTokenizePartOfSpeech($question);
$index = -1;
foreach ($tagged_question as $i => $term_pos) {
- if (in_array($term_pos['token'], $wh_questions)) {
+ if (in_array($term_pos["token"], self::$questions)) {
$index = $i;
+ $term_pos["tag"] = "p_wh";
+ $tagged_question[$i] = $term_pos;
break;
}
}
- return self::parseWhoQuestion($tagged_question, $index);
+ return self::parseQuestion($tagged_question, $index);
}
}
diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php
index 3b31e613f..ac3f19fac 100755
--- a/src/models/ProfileModel.php
+++ b/src/models/ProfileModel.php
@@ -390,6 +390,10 @@ class ProfileModel extends Model
ACCESS_COUNT INTEGER,
PRIMARY KEY(ADDRESS, PAGE_NAME))",
"VERSION" => "CREATE TABLE VERSION(ID INTEGER PRIMARY KEY)",
+ "LEXICON" => "CREATE TABLE LEXICON(
+ TERM VARCHAR(". C\LONG_NAME_LEN ."),
+ LOCALE VARCHAR(" . C\NAME_LEN . "),
+ PART_OF_SPEECH VARCHAR(16), PRIMARY KEY(TERM, LOCALE))",
];
}
/**