Hindi Triplet Extractor Changes for patch

SalilShenoy [2017-11-07 03:Nov:th]

Hindi Triplet Extractor Changes for patch

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
src/configs/Createdb.php
src/library/PhraseParser.php
src/locale/hi/resources/Tokenizer.php

diff --git a/src/configs/Createdb.php b/src/configs/Createdb.php
index 3430501fb..6c9637895 100755
--- a/src/configs/Createdb.php
+++ b/src/configs/Createdb.php
@@ -39,6 +39,7 @@ use seekquarry\yioop\library\Cipher;
 use seekquarry\yioop\models\Model;
 use seekquarry\yioop\models\ProfileModel;
 use seekquarry\yioop\models\GroupModel;
+use seekquarry\yioop\configs as C;

 if (!empty($_SERVER['DOCUMENT_ROOT'])) {
     echo "BAD REQUEST";
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index ca8b23b5f..9239badf1 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -401,6 +401,9 @@ class PhraseParser
         } else {
             mb_internal_encoding("UTF-8");
             $string = mb_strtolower($string);
+            if ($lang == "hi") {
+                $string = preg_replace('/(,:)\p{P}/u', "", $string);
+            }
             $string = mb_ereg_replace("\s+|".C\PUNCT, " ", $string);
             $terms = self::segmentSegment($string, $lang);
             $terms = self::charGramTerms($terms, $lang);
diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php
index 992573af4..f0df85681 100755
--- a/src/locale/hi/resources/Tokenizer.php
+++ b/src/locale/hi/resources/Tokenizer.php
@@ -43,40 +43,57 @@ use seekquarry\yioop\models as M;
  */
 class Tokenizer
 {
+    /**
+     * List of verb-like parts of speech that might appear in lexicon
+     * @var array
+     */
     public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"];
-
+    /**
+     * List of noun-like parts of speech that might appear in lexicon
+     * @var array
+     */
     public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"];
-
+    /**
+     * List of adjective-like parts of speech that might appear in lexicon
+     * @var array
+     */
     public static $adjective_phrases = ["JJ", "JJR", "JJS"];
-
+    /**
+     * List of postpositional-like parts of speech that might appear in lexicon
+     * @var array
+     */
     public static $postpositional_phrases = ["inj", "PREP", "proNN", "CONJ",
         "INT", "particle", "case", "PSP"];
-
+    /**
+     * Any unique identifier corresponding to the component of a triplet which
+     * can be answered using a question answer list
+     * @var string
+     */
     public static $question_marker = "qqq";
     /**
-    * Words we don't want to be stemmed
-    * @var array
-    */
+     * Words we don't want to be stemmed
+     * @var array
+     */
     public static $no_stem_list = [];
     /**
-    * Stub function which could be used for a word segmenter.
-    * Such a segmenter on input thisisabunchofwords would output
-    * this is a bunch of words
-    *
-    * @param string $pre_segment  before segmentation
-    * @return string should return string with words separated by space
-    *     in this case does nothing
-    */
+     * Stub function which could be used for a word segmenter.
+     * Such a segmenter on input thisisabunchofwords would output
+     * this is a bunch of words
+     *
+     * @param string $pre_segment  before segmentation
+     * @return string should return string with words separated by space
+     *     in this case does nothing
+     */
     public static function segment($pre_segment)
     {
         return $pre_segment;
     }
     /**
-    * Removes the stop words from the page (used for Word Cloud generation)
-    *
-    * @param string $page the page to remove stop words from.
-    * @return string $page with no stop words
-    */
+     * Removes the stop words from the page (used for Word Cloud generation)
+     *
+     * @param string $page the page to remove stop words from.
+     * @return string $page with no stop words
+     */
     public static function stopwordsRemover($page)
     {
         $stop_words = [
@@ -106,11 +123,11 @@ class Tokenizer
         return $page;
     }
     /**
-    * Computes the stem of an Hindi word
-    *
-    * @param string $word the string to stem
-    * @return string the stem of $word
-    */
+     * Computes the stem of a Hindi word
+     *
+     * @param string $word the string to stem
+     * @return string the stem of $word
+     */
     public static function stem($word)
     {
         if (in_array($word, self::$no_stem_list)) {
@@ -120,11 +137,11 @@ class Tokenizer
         return $word;
     }
     /**
-    * Removes common Hindi suffixes
-    *
-    * @param string $word to remove suffixes from
-    * @return string result of suffix removal
-    */
+     * Removes common Hindi suffixes
+     *
+     * @param string $word to remove suffixes from
+     * @return string result of suffix removal
+     */
     private static function removeSuffix($word)
     {
         $length = mb_strlen($word);
@@ -154,14 +171,14 @@ class Tokenizer
         return $word;
     }
     /**
-    * The method takes as input a phrase and returns a string with each
-    * term tagged with a part of speech.
-    *
-    * @param string $phrase text to add parts speech tags to
-    * @param bool $with_tokens whether to include the terms and the tags
-    *      in the output string or just the part of speech tags
-    * @return string $tagged_phrase which is a string of format term~pos
-    */
+     * The method takes as input a phrase and returns a string with each
+     * term tagged with a part of speech.
+     *
+     * @param string $phrase text to add part-of-speech tags to
+     * @param bool $with_tokens whether to include the terms and the tags
+     *      in the output string or just the part of speech tags
+     * @return string $tagged_phrase which is a string of format term~pos
+     */
     public static function tagPartsOfSpeechPhrase($phrase, $with_tokens = true)
     {
         $tagged_tokens = self::tagTokenizePartOfSpeech($phrase);
@@ -170,12 +187,12 @@ class Tokenizer
         return $tagged_phrase;
     }
     /**
-    * Uses the lexicon to assign a tag to each token and then uses a rule
-    * based approach to assign the most likely of tags to each token
-    *
-    * @param string $text input phrase which is to be tagged
-    * @return string $result which is an array of token => tag
-    */
+     * Uses the lexicon to assign a tag to each token and then uses a rule
+     * based approach to assign the most likely of tags to each token
+     *
+     * @param string $text input phrase which is to be tagged
+     * @return array $result which is an array of token => tag
+     */
     public static function tagTokenizePartofSpeech($text)
     {
         $tokens = preg_split("/[\s]+/", $text);
@@ -214,8 +231,8 @@ class Tokenizer
         return self::tagUnknownWords($result);
     }
     /**
-    *    This method tags the remaining words from the text.
-    */
+     *    This method tags the remaining words from the text.
+     */
     public static function tagUnknownWords($partiallyTaggedText)
     {
         $result = $partiallyTaggedText;
@@ -286,14 +303,14 @@ class Tokenizer
         return $result;
      }
     /**
-    * This method is used to simplify the different tags of speech to a
-    * common form
-    *
-    * @param array $tagged_tokens which is an array of tokens assigned tags.
-    * @param bool $with_tokens whether to include the terms and the tags
-    *      in the output string or just the part of speech tags
-    * @return string $tagged_phrase which is a string fo form token~pos
-    */
+     * This method is used to simplify the different tags of speech to a
+     * common form
+     *
+     * @param array $tagged_tokens which is an array of tokens assigned tags.
+     * @param bool $with_tokens whether to include the terms and the tags
+     *      in the output string or just the part of speech tags
+     * @return string $tagged_phrase which is a string of the form token~pos
+     */
     public static function taggedPartOfSpeechTokensToString($tagged_tokens,
         $with_tokens = true)
     {
@@ -319,19 +336,19 @@ class Tokenizer
         return $tagged_phrase;
     }
     /**
-    * Takes a part-of-speech tagged phrase and pre-tree with a
-    * parse-from position and builds a parse tree for a noun if possible
-    *
-    * @param array $tagged_phrase
-    *      an array of pairs of the form ("token" => token_for_term,
-    *     "tag"=> part_of_speech_tag_for_term)
-    * @param array $tree that consists of ["curnode" =>
-    *      current parse position in $tagged_phrase]
-    * @return array has fields
-    *      "cur_node" index of how far we parsed $tagged_phrase
-    *      "NN" a subarray with a token node for the noun string that was
-    *      parsed
-    */
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a noun if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "NN" a subarray with a token node for the noun string that was
+     *      parsed
+     */
     public static function extractNoun($tagged_phrase, $tree)
     {
         //Combining multiple noun into one
@@ -350,18 +367,18 @@ class Tokenizer
         return $tree;
     }
     /**
-    * Takes a part-of-speech tagged phrase and pre-tree with a
-    * parse-from position and builds a parse tree for a sequence of
-    * postpositional phrases if possible
-    *
-    * @param array $tagged_phrase
-    *      an array of pairs of the form ("token" => token_for_term,
-    *     "tag"=> part_of_speech_tag_for_term)
-    * @param array $tree that consists of ["cur_node" =>
-    *      current parse position in $tagged_phrase]
-    * @return array has fields
-    *      "cur_node" index of how far we parsed $tagged_phrase
-    */
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a sequence of
+     * postpositional phrases if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     */
     public static function extractPostposition($tagged_phrase, $tree,
         $index = 1)
     {
@@ -411,19 +428,19 @@ class Tokenizer
         return $tree;
     }
     /**
-    * Takes a part-of-speech tagged phrase and pre-tree with a
-    * parse-from position and builds a parse tree for a noun phrase if possible
-    *
-    * @param array $tagged_phrase
-    *      an array of pairs of the form ("token" => token_for_term,
-    *     "tag"=> part_of_speech_tag_for_term)
-    * @param array $tree that consists of ["curnode" =>
-    *      current parse position in $tagged_phrase]
-    * @return array has fields
-    *      "cur_node" index of how far we parsed $tagged_phrase
-    *      "JJ" with value an adjective subtree
-    *       "POST" with value a post position subtree
-    */
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a noun phrase if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "JJ" with value an adjective subtree
+     *       "POST" with value a post position subtree
+     */
     public static function extractNounPhrase($tagged_phrase, $tree)
     {
         $cur_node = $tree['cur_node'];
@@ -450,19 +467,19 @@ class Tokenizer
         return $tree;
     }
     /**
-    * Takes a part-of-speech tagged phrase and pre-tree with a
-    * parse-from position and builds a parse tree for a verb if possible
-    *
-    * @param array $tagged_phrase
-    *      an array of pairs of the form ("token" => token_for_term,
-    *     "tag"=> part_of_speech_tag_for_term)
-    * @param array $tree that consists of ["curnode" =>
-    *      current parse position in $tagged_phrase]
-    * @return array has fields
-    *      "cur_node" index of how far we parsed $tagged_phrase
-    *      "VB" a subarray with a token node for the verb string that was
-    *      parsed
-    */
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a verb if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "VB" a subarray with a token node for the verb string that was
+     *      parsed
+     */
     public static function extractVerb($tagged_phrase, $tree)
     {
         $cur_node = $tree['cur_node'];
@@ -486,19 +503,19 @@ class Tokenizer
         return $tree;
     }
     /**
-    * Takes a part-of-speech tagged phrase and pre-tree with a
-    * parse-from position and builds a parse tree for a verb phrase if possible
-    *
-    * @param array $tagged_phrase
-    *      an array of pairs of the form ("token" => token_for_term,
-    *     "tag"=> part_of_speech_tag_for_term)
-    * @param array $tree that consists of ["curnode" =>
-    *      current parse position in $tagged_phrase]
-    * @return array has fields
-    *      "cur_node" index of how far we parsed $tagged_phrase
-    *      "VP" a subarray with possible fields
-    *      "VB" with value a verb subtree
-    */
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a verb phrase if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "VP" a subarray with possible fields
+     *      "VB" with value a verb subtree
+     */
     public static function extractVerbPhrase($tagged_phrase, $tree)
     {
         $cur_node = $tree['cur_node'];
@@ -537,19 +554,19 @@ class Tokenizer
         return $tree_new;
     }
     /**
-    * Takes a part-of-speech tagged phrase and pre-tree with a
-    * parse-from position and builds a parse tree for an adjective if possible
-    *
-    * @param array $tagged_phrase
-    *      an array of pairs of the form ("token" => token_for_term,
-    *     "tag"=> part_of_speech_tag_for_term)
-    * @param array $tree that consists of ["cur_node" =>
-    *      current parse position in $tagged_phrase]
-    * @return array has fields
-    *      "cur_node" index of how far we parsed $tagged_phrase
-    *      "JJ" a subarray with a token node for the adjective that was
-    *      parsed
-    */
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for an adjective if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "JJ" a subarray with a token node for the adjective that was
+     *      parsed
+     */
     public static function extractAdjective($tagged_phrase, $tree)
     {
         $adjective_string = "";
@@ -567,17 +584,17 @@ class Tokenizer
         return $tree;
     }
     /**
-    * Given a part-of-speeech tagged phrase array generates a parse tree
-    * for the phrase using a recursive descent parser.
-    *
-    * @param array $tagged_phrase
-    *      an array of pairs of the form ("token" => token_for_term,
-    *     "tag"=> part_of_speech_tag_for_term)
-    * @return array used to represent a tree. The array has up to three fields
-    *      $tree["cur_node"] index of how far we parsed our$tagged_phrase
-    *      $tree["NP"] contains a subtree for a noun phrase
-    *      $tree["VP"] contains a subtree for a verb phrase
-    */
+     * Given a part-of-speech tagged phrase array generates a parse tree
+     * for the phrase using a recursive descent parser.
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @return array used to represent a tree. The array has up to three fields
+     *      $tree["cur_node"] index of how far we parsed $tagged_phrase
+     *      $tree["NP"] contains a subtree for a noun phrase
+     *      $tree["VP"] contains a subtree for a verb phrase
+     */
     public static function generatePhraseParseTree($tagged_phrase)
     {
         $tree = [];
@@ -596,16 +613,16 @@ class Tokenizer
         return $tree;
     }
     /**
-    * Scans a word list for phrases. For phrases found generate
-    * a list of question and answer pairs at two levels of granularity:
-    * CONCISE (using all terms in orginal phrase) and RAW (removing
-    * (adjectives, etc).
-    *
-    * @param array $word_and_phrase_list of statements
-    * @return array with two fields: QUESTION_LIST consisting of
-    *      (SUBJECT, COMPLEMENT) where one of the components has been
-    *      replaced with a question marker.
-    */
+     * Scans a word list for phrases. For phrases found generate
+     * a list of question and answer pairs at two levels of granularity:
+     * CONCISE (using all terms in original phrase) and RAW (removing
+     * adjectives, etc).
+     *
+     * @param array $word_and_phrase_list of statements
+     * @return array with two fields: QUESTION_LIST consisting of
+     *      (SUBJECT, COMPLEMENT) where one of the components has been
+     *      replaced with a question marker.
+     */
     public static function extractTripletsPhrases($word_and_phrase_list)
     {
         $triplets_list = [];
@@ -633,6 +650,14 @@ class Tokenizer
         $out_triplets['QUESTION_ANSWER_LIST'] = $question_answer_list;
         return $out_triplets;
     }
+    /**
+     * Takes a phrase tree $tree and a part-of-speech $pos and returns
+     * the deepest $pos-only path in the tree.
+     *
+     * @param array $tree phrase to extract type from
+     * @param string $pos the part of speech to extract
+     * @return string the label of deepest $pos only path in $tree
+     */
     public static function extractDeepestSpeechPartPhrase($tree, $pos)
     {
         $extract = "";
@@ -644,6 +669,15 @@ class Tokenizer
         }
         return $extract;
     }
+    /**
+     * Takes a parse tree of a phrase or statement and returns an array
+     * with two fields CONCISE and RAW the former having the subject of
+     * the original phrase (as a string) the latter having the important
+     * parts of the subject
+     *
+     * @param array $tree representation of a parse tree of a phrase
+     * @return array with two fields CONCISE and RAW as described above
+     */
     public static function extractSubjectParseTree($tree)
     {
         $subject = [];
@@ -663,13 +697,22 @@ class Tokenizer
         }
         return $subject;
     }
+    /**
+     * Takes a parse tree of a phrase or statement and returns an array
+     * with two fields CONCISE and RAW the former having the predicate of
+     * the original phrase (as a string) the latter having the important
+     * parts of the predicate
+     *
+     * @param array $tree representation of a parse tree of a phrase
+     * @return array with two fields CONCISE and RAW as described above
+     */
     public static function extractPredicateParseTree($tree)
     {
         $predicate = [];
         if (!empty($tree['VP'])) {
             $tree_vp = $tree['VP'];
             $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase(
-                $tree_vp, "VB")
+                $tree_vp, "VB");
             $raw_predicate = "";
             if (!empty($tree_vp['VB'])) {
                 $tree_vb = $tree_vp['VB'];
@@ -686,6 +729,15 @@ class Tokenizer
         }
         return $predicate;
     }
+    /**
+     * Takes a parse tree of a phrase or statement and returns an array
+     * with two fields CONCISE and RAW the former having the object of
+     * the original phrase (as a string) the latter having the important
+     * parts of the object
+     *
+     * @param array $tree representation of a parse tree of a phrase
+     * @return array with two fields CONCISE and RAW as described above
+     */
     public static function extractObjectParseTree($tree)
     {
         $object = [];
@@ -713,15 +765,15 @@ class Tokenizer
         return $object;
     }
     /**
-    * Takes a parse tree of a phrase and computes subject, predicate, and
-    * object arrays. Each of these array consists of two components CONCISE and
-    * RAW, CONCISE corresponding to something more similar to the words in the
-    * original phrase and RAW to the case where extraneous words have been
-    * removed
-    *
-    * @param are $tree a parse tree for a sentence
-    * @return array triplet array
-    */
+     * Takes a parse tree of a phrase and computes subject, predicate, and
+     * object arrays. Each of these array consists of two components CONCISE and
+     * RAW, CONCISE corresponding to something more similar to the words in the
+     * original phrase and RAW to the case where extraneous words have been
+     * removed
+     *
+     * @param array $parse_tree a parse tree for a sentence
+     * @return array triplet array
+     */
     public static function extractTripletsParseTree($parse_tree)
     {
         $triplets = [];
@@ -730,6 +782,15 @@ class Tokenizer
         $triplets['predicate'] = self::extractPredicateParseTree($parse_tree);
         return $triplets;
     }
+    /**
+     * Takes a triplets array with subject, predicate, object fields with
+     * CONCISE and RAW subfields and rearranges it to have two fields CONCISE
+     * and RAW with subject, predicate, object, and QUESTION_ANSWER_LIST
+     * subfields
+     *
+     * @param array $sub_pred_obj_triplets in format described above
+     * @return array $processed_triplets in format described above
+     */
     public static function rearrangeTripletsByType($sub_pred_obj_triplets)
     {
         $processed_triplet = [];
@@ -739,12 +800,22 @@ class Tokenizer
             self::extractTripletByType($sub_pred_obj_triplets, 'RAW');
         return $processed_triplets;
     }
+    /**
+     * Takes a triplets array with subject, predicate, object fields with
+     * CONCISE, RAW subfields and produces triplets with $type subfield (where
+     * $type is one of CONCISE and RAW) and with subject, predicate, object,
+     * and QUESTION_ANSWER_LIST subfields
+     *
+     * @param array $sub_pred_obj_triplets  in format described above
+     * @param string $type either CONCISE or RAW
+     * @return array $triplets in format described above
+     */
     public static function extractTripletByType($sub_pred_obj_triplets, $type)
     {
         $triplets = [];
-        if (!empty($sub_pred_obj_triplets['subject'])
-            && !empty($sub_pred_obj_triplets['predicate'])
-            && !empty($sub_pred_obj_triplets['object'])) {
+        if (!empty($sub_pred_obj_triplets['subject'][$type])
+            && !empty($sub_pred_obj_triplets['predicate'][$type])
+            && !empty($sub_pred_obj_triplets['object'][$type])) {
             $question_answer_triplets = [];
             $question_marker = self::$question_marker;
             $sentence = [$sub_pred_obj_triplets['subject'][$type],
@@ -767,33 +838,38 @@ class Tokenizer
         return $triplets;
     }
     /**
-    * Takes tagged question string starts with Who
-    * and returns question triplet from the question string
-    *
-    * @param string $tagged_question part-of-speech tagged question
-    * @param int $index current index in statement
-    * @return array parsed triplet
-    */
+     * Takes a tagged question that starts with Who
+     * and returns the question triplet from the question
+     *
+     * @param array $tagged_question part-of-speech tagged question
+     * @param int $index current index in statement
+     * @return array parsed triplet
+     */
     public static function parseWhoQuestion($tagged_question, $index)
     {
+        $start_pos = 0;
+        if ($index == 0)
+            $start_pos = $index + 1;
         $generated_questions = [];
         $question_marker = self::getQuestionMarker();
         $triplets = [];
-        $tree_np = self::extractNounPhrase($tagged_question, ["cur_node" => 0]);
+        $tree_np = self::extractNounPhrase($tagged_question, ["cur_node" =>
+            $start_pos]);
         $triplets['subject'] = self::extractSubjectParseTree($tree_np);
         $tree = ["cur_node" => $index];
-        $tree['NP'] = "कौन";
+        $tree['NP'] = $tagged_question[$index]['token'];
         $tree_vp = self::extractVerbPhrase($tagged_question, $tree);
         $triplets['predicate'] = self::extractPredicateParseTree($tree_vp);
-        $triplets['object'] = self::extractObjectParseTree($tree_vp);
         $triplet_types = ['CONCISE', 'RAW'];
         foreach ($triplet_types as $type) {
-            if (!empty($triplets['object'][$type])
+            if (!empty($triplets['subject'][$type])
                 && !empty($triplets['predicate'][$type])) {
-                $generated_questions[] = trim($triplets['object'][$type]) .
+                $generated_questions[$type][] =
+                    trim($triplets['subject'][$type]) .
                     " " . trim($triplets['predicate'][$type]) . " " .
                     $question_marker;
-                $generated_questions[] = trim($triplets['object'][$type]) .
+                $generated_questions[$type][] =
+                    trim($triplets['subject'][$type]) .
                     " " . $question_marker .
                     " " . trim($triplets['predicate'][$type]);
             }
@@ -801,12 +877,12 @@ class Tokenizer
         return $generated_questions;
     }
     /**
-    * Takes a phrase query entered by user and return true if it is question
-    * and false if not
-    *
-    * @param $phrase any statement
-    * @return bool returns true if statement is question
-    */
+     * Takes a phrase query entered by user and return true if it is question
+     * and false if not
+     *
+     * @param $phrase any statement
+     * @return bool returns true if statement is question
+     */
     public function isQuestion($phrase)
     {
         $who_question = "कौन";
@@ -816,22 +892,37 @@ class Tokenizer
         }
         return false;
     }
+    /**
+     * The function returns the question marker for the locale
+     *
+     * @return string the question marker
+     */
     public static function getQuestionMarker()
     {
         return self::$question_marker;
     }
+    /**
+     * Takes WH questions and returns the triplet from the question
+     *
+     * @param string $question question to parse
+     * @return array question triplet
+     */
     public static function questionParser($question)
     {
+        /*
+         * Array of 'wh' questions: What, When, Where, Why, Who, Which, Whom,
+         * Whose
+         */
+        $wh_questions = array( "क्या", "कब", "कहा", "क्यों", "कौन", "जिसे",
+            "जिसका", "कहाँ");
         $tagged_question = self::tagTokenizePartOfSpeech($question);
-        $generated_questions = "";
         $index = -1;
-        foreach ($tagged_question as $key => $value) {
-            if (strcmp("कौन", $value['token']) == 0) {
-                $index = $key;
+        foreach ($tagged_question as $i => $term_pos) {
+            if (in_array($term_pos['token'], $wh_questions)) {
+                $index = $i;
                 break;
             }
         }
-        $generated_questions = self::parseWhoQuestion($tagged_question, $index);
-        return $generated_questions;
+        return self::parseWhoQuestion($tagged_question, $index);
     }
 }

ViewGit