viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/library/TripletExtractor.php |
diff --git a/src/library/TripletExtractor.php b/src/library/TripletExtractor.php index 6bb38874f..77ff6a1d6 100644 --- a/src/library/TripletExtractor.php +++ b/src/library/TripletExtractor.php @@ -107,17 +107,15 @@ class TripletExtractor $line = trim(substr($lex_string, $token_pos, $cur_pos - $token_pos)); $tag_list = explode(' ', $line); - $dictionary[strtolower(rtrim($token, "."))] = - array_slice($tag_list, 1); + $dictionary[$token] = array_slice($tag_list, 1); $cur_pos++; } } // now using our dictionary we tag $i = 0; - $tag_list = array(); + $tag_list = []; + $prev_tag_list = []; foreach ($matches[0] as $token) { - $prev_tag_list = $tag_list; - $tag_list = []; // default to a common noun $current = ['token' => $token, 'tag' => 'NN']; // remove trailing full stops @@ -193,6 +191,8 @@ class TripletExtractor $i++; $previous = $current; $previous_token = $token; + $prev_tag_list = $tag_list; + $tag_list = []; } return $result; } @@ -243,7 +243,7 @@ class TripletExtractor * @param $text any statement * @return array words tagged with POS tags */ - public static function partOfSpeechTagger_Brill($text) + public static function partOfSpeechTaggerBrill($text) { static $dict = null; $lexicon = C\LOCALE_DIR . "/en_US/resources/lexicon.txt"; @@ -509,6 +509,10 @@ class TripletExtractor } /** + * Takes current tree and returns + * tree by adding auxiliary verb + * node to it + * * @param $tagger_array POS tagged array * @param $tree current tree * @return mixed VP added tree @@ -539,6 +543,9 @@ class TripletExtractor } /** + * Takes current tree and returns + * tree by adding Verb node to it. + * * @param $tagger_array POS tagged tree * @param $tree current tree * @return mixed VB added tree @@ -572,6 +579,9 @@ class TripletExtractor return $tree; } /** + * Takes current tree and returns + * a triplet extracted from the tree. + * * @param $tree fully generated tree * @return array triplet array */ @@ -586,6 +596,10 @@ class TripletExtractor } /** + * Takes triplet tree and returns + * the processed triplet from the + * tree. + * * @param $triplet_tree any statement * @return array processed triplet */ @@ -600,6 +614,10 @@ class TripletExtractor } /** + * Takes triplet tree and returns + * tree an array of raw + * triplets. + * * @param $triplet_tree triplet array * @return array raw triplet array */ @@ -610,9 +628,12 @@ class TripletExtractor if (isset($triplet_tree['subject']['RAW']) && isset($triplet_tree['predicate']['RAW']) && isset($triplet_tree['object']['RAW']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['RAW']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['RAW']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['RAW']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['subject']['RAW']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['predicate']['RAW']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['object']['RAW']) ) { $SUBJECT = trim($triplet_tree['subject']['RAW']); @@ -620,21 +641,31 @@ class TripletExtractor $OBJECT = trim($triplet_tree['object']['RAW']); $raw_triplet['SUBJECT'] = - TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT; + TripletExtractor::$question_word . " " . + $PREDICATE . " " . $OBJECT; $raw_triplet['PREDICATE'] = - $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT; + $SUBJECT . " " . TripletExtractor::$question_word . " " . + $OBJECT; $raw_triplet['OBJECT'] = - $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word; + $SUBJECT . " " . $PREDICATE . " " . + TripletExtractor::$question_word; - $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT; - $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE; - $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word] = $OBJECT; + $question_answer_triplet[TripletExtractor::$question_word . " " . + $PREDICATE . " " . $OBJECT] = $SUBJECT; + $question_answer_triplet[$SUBJECT . " " . + TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE; + $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . + TripletExtractor::$question_word] = $OBJECT; $raw_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet; } return $raw_triplet; } /** + * Takes triplet tree and returns + * tree an array of featured + * triplets. + * * @param $triplet_tree triplet array * @return array featured triplet array */ @@ -645,24 +676,34 @@ class TripletExtractor if (isset($triplet_tree['subject']['FEATURED']) && isset($triplet_tree['predicate']['FEATURED']) && isset($triplet_tree['object']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['FEATURED']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['subject']['FEATURED']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['predicate']['FEATURED']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['object']['FEATURED']) ) { $SUBJECT = trim($triplet_tree['subject']['FEATURED']); $PREDICATE = trim($triplet_tree['predicate']['FEATURED']); $OBJECT = trim($triplet_tree['object']['FEATURED']); $featured_triplet['SUBJECT'] = - TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT; + TripletExtractor::$question_word . " " . $PREDICATE . + " " . $OBJECT; $featured_triplet['PREDICATE'] = - $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT; + $SUBJECT . " " . TripletExtractor::$question_word . + " " . $OBJECT; $featured_triplet['OBJECT'] = - $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word; + $SUBJECT . " " . $PREDICATE . " " . + TripletExtractor::$question_word; - $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT; - $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE; - $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word] = $OBJECT; + $question_answer_triplet[TripletExtractor::$question_word . " " . + $PREDICATE . " " . $OBJECT] = $SUBJECT; + $question_answer_triplet[$SUBJECT . " " . + TripletExtractor::$question_word . + " " . $OBJECT] = $PREDICATE; + $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . + TripletExtractor::$question_word] = $OBJECT; $featured_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet; } @@ -670,15 +711,22 @@ class TripletExtractor } /** + * Takes a string and checks if + * it is set or empty. + * * @param $string any string * @return bool true if null of empty string */ - public static function IsNullOrEmptyString($string) + public static function isNullOrEmptyString($string) { return (!isset($string) || trim($string) === ''); } /** + * Takes current tree and returns + * the array of text tagged as + * Subject. + * * @param $tree generated tree * @return array subject array */ @@ -690,7 +738,8 @@ class TripletExtractor $value = TripletExtractor::extractFirstNounFromNPTree($tree_np); $subject['RAW'] = $value; $featured_subject = ""; - $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_np)); + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($tree_np)); foreach ($it as $v) { $featured_subject .= $v . " "; } @@ -703,6 +752,10 @@ class TripletExtractor } /** + * Takes current tree and returns + * the array of text tagged as + * Predicate. + * * @param $tree generated tree * @return array predicate array */ @@ -716,7 +769,8 @@ class TripletExtractor $featured_predicate = ""; if (isset($tree_vp['VB']) && $tree_vp['VB'] != null) { $tree_vb = $tree_vp['VB']; - $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_vb)); + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($tree_vb)); foreach ($it as $v) { $featured_predicate .= $v . " "; } @@ -730,6 +784,10 @@ class TripletExtractor } /** + * Takes current tree and returns + * the array of text tagged as + * Object. + * * @param $tree generated tree * @return array object array */ @@ -743,7 +801,8 @@ class TripletExtractor $value = TripletExtractor::extractFirstNounFromNPTree($nb); $object['RAW'] = $value; $featured_object = ""; - $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($nb)); + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($nb)); foreach ($it as $v) { $featured_object .= $v . " "; } @@ -760,6 +819,9 @@ class TripletExtractor } /** + * Takes noun phrase tree and return + * the first noun from the tree. + * * @param $tree_np noun phrase subtree * @return string first noun */ @@ -776,6 +838,9 @@ class TripletExtractor } /** + * Takes verb phrase tree and returns + * the base form of the verb. + * * @param $tree_vp verb phrase subtree * @return string deepest verb */ @@ -792,6 +857,10 @@ class TripletExtractor } /** + * Takes current tree and return + * attribute maps for noun, adjectives, + * preposition. + * * @param $tree generated tree * @return array attributes array */ @@ -822,6 +891,11 @@ class TripletExtractor } /** + * Takes the statement and apply + * the rules in the defined in the + * lexicon, assign parts of speech + * and generate a triplet tree. + * * @param $statement any statement * @return array processed triplet */ @@ -829,7 +903,7 @@ class TripletExtractor { try { $tagged_statement = - TripletExtractor::partOfSpeechTagger_Brill($statement); + TripletExtractor::partOfSpeechTaggerBrill($statement); $statement_tree = TripletExtractor::generateParseTreeUsingRDP($tagged_statement); $triplet_tree = TripletExtractor::extractTriplet($statement_tree); @@ -840,6 +914,11 @@ class TripletExtractor } /** + * Process individual statements + * from the statement array. Generate + * a list of question and answer + * pairs. + * * @param $statement_array array of statements * @return array list of triplets */ @@ -851,23 +930,32 @@ class TripletExtractor foreach ($statement_array as $key => $value) { try { if (str_word_count($key) >= 3) { - $extracted_triplet = TripletExtractor::storeStatementAsTriplet($key); + $extracted_triplet = + TripletExtractor::storeStatementAsTriplet($key); if (isset($extracted_triplet['RAW']) && sizeof($extracted_triplet['RAW']) > 0) { - $question_list[$extracted_triplet['RAW']['SUBJECT']] = $value; - $question_list[$extracted_triplet['RAW']['PREDICATE']] = $value; - $question_list[$extracted_triplet['RAW']['OBJECT']] = $value; - $question_answer_list = array_merge($question_answer_list, + $question_list[$extracted_triplet['RAW']['SUBJECT']] + = $value; + $question_list[$extracted_triplet['RAW']['PREDICATE']] + = $value; + $question_list[$extracted_triplet['RAW']['OBJECT']] + = $value; + $question_answer_list = + array_merge($question_answer_list, $extracted_triplet['RAW']['QUESTION_ANSWER_LIST']); } if (isset($extracted_triplet['FEATURED']) && sizeof($extracted_triplet['FEATURED']) > 0) { - $question_list[$extracted_triplet['FEATURED']['SUBJECT']] = $value; - $question_list[$extracted_triplet['FEATURED']['PREDICATE']] = $value; - $question_list[$extracted_triplet['FEATURED']['OBJECT']] = $value; - $question_answer_list = array_merge($question_answer_list, + $question_list[$extracted_triplet['FEATURED']['SUBJECT']] + = $value; + $question_list[$extracted_triplet['FEATURED']['PREDICATE']] + = $value; + $question_list[$extracted_triplet['FEATURED']['OBJECT']] + = $value; + $question_answer_list = + array_merge($question_answer_list, $extracted_triplet['FEATURED']['QUESTION_ANSWER_LIST']); } } @@ -890,33 +978,30 @@ class TripletExtractor */ public static function questionParser($question_string) { - $question_string_tagged = TripletExtractor::partOfSpeechTagger_Brill( + $question_string_tagged = TripletExtractor::partOfSpeechTaggerBrill( $question_string); $index = 0; $generated_question_array = []; - if (isset($question_string_tagged[$index]) && - ("WRB" == trim($question_string_tagged[$index]['tag']) || - "WP" == trim($question_string_tagged[$index]['tag'])) - ) { - if ("WHO" == strtoupper( - trim($question_string_tagged[$index]['token']))) { - $index = $index + 1; - $generated_question_array = - TripletExtractor::parseWHOQuestion( - $question_string_tagged, $index); - } else { - if ("WHERE" == strtoupper( - trim($question_string_tagged[$index]['token'])) || - "WHEN" == strtoupper( - trim($question_string_tagged[$index]['token'])) || - "WHAT" == strtoupper( - trim($question_string_tagged[$index]['token'])) - ) { + if (isset($question_string_tagged[$index])) { + $tag = trim($question_string_tagged[$index]['tag']); + if ("WRB" == $tag || "WP" == $tag) { + $token = strtoupper( + trim($question_string_tagged[$index]['token'])); + if ("WHO" == $token) { $index = $index + 1; $generated_question_array = + TripletExtractor::parseWHOQuestion( + $question_string_tagged, $index); + } else { + if ("WHERE" == $token || + "WHEN" == $token || + "WHAT" == $token) { + $index = $index + 1; + $generated_question_array = TripletExtractor::parseWHPlusQuestion_New( $question_string_tagged, $index); + } } } } @@ -945,9 +1030,9 @@ class TripletExtractor $tree_vp); if (isset($triplet['object']['RAW']) && isset($triplet['predicate']['RAW']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::sNullOrEmptyString( $triplet['object']['RAW']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['predicate']['RAW']) ) { $generated_question_array['RAW']['1'] = @@ -961,9 +1046,9 @@ class TripletExtractor } if (isset($triplet['object']['FEATURED']) && isset($triplet['predicate']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['object']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['predicate']['FEATURED']) ) { $generated_question_array['FEATURED']['1'] = @@ -990,15 +1075,18 @@ class TripletExtractor { $generated_question_array = []; $aux_verb = ""; - while (isset($question_string_tagged[$index]) && - ("VB" == trim($question_string_tagged[$index]['tag']) || - "VBD" == trim($question_string_tagged[$index]['tag']) || - "VBG" == trim($question_string_tagged[$index]['tag']) || - "VBN" == trim($question_string_tagged[$index]['tag']) || - "VBP" == trim($question_string_tagged[$index]['tag']) || - "VBZ" == trim($question_string_tagged[$index]['tag']))) { - $aux_verb .= " " . trim($question_string_tagged[$index]['token']); - $index = $index + 1; + while (isset($question_string_tagged[$index])) { + $tag = trim($question_string_tagged[$index]['tag']); + if ("VB" == $tag || + "VBD" == $tag || + "VBG" == $tag || + "VBN" == $tag || + "VBP" == $tag || + "VBZ" == $tag) { + $token = trim($question_string_tagged[$index]['token']); + $aux_verb .= " " . $token; + $index = $index + 1; + } } $tree = ["cur_node" => $index]; $tree['NP'] = "WHPlus"; @@ -1014,7 +1102,7 @@ class TripletExtractor $triplet['object'] = TripletExtractor::extractObjectFromTree( $tree_vp); if (isset($aux_verb) - && !TripletExtractor::IsNullOrEmptyString($aux_verb) + && !TripletExtractor::isNullOrEmptyString($aux_verb) ) { $triplet['predicate']['RAW'] = trim($aux_verb) . " " . $triplet['predicate']['RAW']; @@ -1026,9 +1114,9 @@ class TripletExtractor } if (isset($triplet['subject']['RAW']) && isset($triplet['predicate']['RAW']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['subject']['RAW']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['predicate']['RAW']) ) { $generated_question_array['RAW']['1'] = @@ -1042,9 +1130,9 @@ class TripletExtractor } if (isset($triplet['subject']['FEATURED']) && isset($triplet['predicate']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['subject']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['predicate']['FEATURED']) ) { $generated_question_array['FEATURED']['1'] =