diff --git a/src/configs/Config.php b/src/configs/Config.php index 293cacc7f..84ba574a3 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -806,7 +806,7 @@ nsconddefine('THUMB_DIM', 128); nsconddefine('THUMB_SIZE', 1000000); /** Characters we view as not part of words, not same as POSIX [:punct:]*/ nsconddefine('PUNCT', "\.|\,|\:|\;|\"|\'|\[|\/|\%|\?|-|" . - "\]|\{|\}|\(|\)|\!|\||\&|\`|" . + "\]|\{|\}|\(|\)|\!|\||।|\&|\`|" . "\’|\‘|©|®|™|℠|…|\/|\>|,|\=|。|)|:|、|" . "”|“|《|》|(|「|」|★|【|】|·|\+|\*|;". "|!|—|―|?|!|،|؛|؞|؟|٪|٬|٭"); diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php index 47115cf67..6843c84ee 100644 --- a/src/configs/TokenTool.php +++ b/src/configs/TokenTool.php @@ -204,7 +204,7 @@ function makeNWordGramsFiles($args) } if (!isset($args[5]) && $args[3] == "all" && $args[4] == NWordGrams::PAGE_COUNT_WIKIPEDIA) { - $args[5] = 100000; + $args[5] = 75000; } else { $args[5] = -1; } diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index faecd6d31..ab94ce27f 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -1526,9 +1526,8 @@ class CrawlComponent extends Component implements CrawlConstants if ($site[self::TITLE] != "" ) { $lang = L\guessLocaleFromString($site[self::TITLE], $lang); } else { - $lang = L\guessLocaleFromString( - substr($site[self::DESCRIPTION], 0, - C\AD_HOC_TITLE_LENGTH), $lang); + $lang = L\guessLocaleFromString($site[self::DESCRIPTION], + $lang); } $word_lists = PhraseParser::extractPhrasesInLists( $phrase_string, $lang); diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 9e0c88cd7..8db9c043b 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -48,8 +48,7 @@ if (php_sapi_name() != 'cli' || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) { echo "BAD REQUEST"; exit(); } -ini_set("memory_limit","2000M"); /* - reindex sometimes takes more 
than the default 128M, 850 to be safe */ +ini_set("memory_limit","2500M"); /** This tool does not need logging*/ $_SERVER["LOG_TO_FILES"] = false; /** USE_CACHE false rules out file cache as well*/ @@ -874,8 +873,7 @@ class ArcTool implements CrawlConstants intval(file_get_contents($shard_count_file)); echo "Restarting rebuild index from $start_generation\n"; } else { - $start_generation= 0; - file_put_contents($shard_count_file, $start_generation); + $start_generation = 0; } } $info = $archive_name::getArchiveInfo($archive_path); @@ -885,11 +883,12 @@ class ArcTool implements CrawlConstants file_get_contents("$archive_path/generation.txt")); $num_generations = $generation_info['ACTIVE'] + 1; $archive = new WebArchiveBundle($archive_path."/summaries"); + $dictionary_path = $archive_path . "/dictionary"; $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; $db = new $dbms_manager(); $db->unlinkRecursive($archive_path . "/dictionary", false); - IndexDictionary::makePrefixLetters($archive_path . "/dictionary"); - $dictionary = new IndexDictionary($archive_path . "/dictionary"); + IndexDictionary::makePrefixLetters($dictionary_path); + $dictionary = new IndexDictionary($dictionary_path); $seen = 0; $generation = $start_generation; $keypad = "\x00\x00\x00\x00"; diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 6b5c16d63..d9d7fb7ea 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2809,13 +2809,14 @@ class Fetcher implements CrawlConstants self::LANG]])) { $phrase_string = $site[self::DESCRIPTION]; } else { - $phrase_string = $host_words." .. ". + $phrase_string = $host_words . " .. ". $site[self::TITLE] . " .. ". $path_words . " .. ". $site[self::DESCRIPTION]; } } else { - $phrase_string = $host_words." ".$site[self::TITLE] . - " ". $path_words . " ". $site[self::DESCRIPTION]; + $phrase_string = $host_words . " " . + $site[self::TITLE] . " ". $path_words . " ". 
+ $site[self::DESCRIPTION]; } } if (empty($site[self::LANG])) { diff --git a/src/library/BloomFilterFile.php b/src/library/BloomFilterFile.php index 6d9d3c6f9..917d14968 100755 --- a/src/library/BloomFilterFile.php +++ b/src/library/BloomFilterFile.php @@ -76,8 +76,12 @@ class BloomFilterFile extends PersistentStructure $save_frequency = self::DEFAULT_SAVE_FREQUENCY) { $log2 = log(2); + $log2_sq = $log2 * $log2; + /* choose number of keys so that the odds of false positive + is 1/$num_values. + */ $this->num_keys = ceil(log($num_values)/$log2); - $this->filter_size = ceil( ($this->num_keys) * $num_values/$log2 ); + $this->filter_size = ceil( ($this->num_keys) * $num_values/$log2_sq); $mem_before = memory_get_usage(true); $this->filter = pack("x". ceil(0.125 * $this->filter_size)); // 1/8 =.125 = num bits/bytes, want to make things floats diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php index fae5f8348..b2539991b 100644 --- a/src/library/IndexDictionary.php +++ b/src/library/IndexDictionary.php @@ -163,22 +163,30 @@ class IndexDictionary implements CrawlConstants if (!is_dir($this->dir_name)) { mkdir($this->dir_name); IndexDictionary::makePrefixLetters($this->dir_name); + $this->active_tiers = []; $this->max_tier = 0; } else { $this->max_tier = unserialize( file_get_contents($this->dir_name."/max_tier.txt")); - $this->read_tier = $this->max_tier; - $tiers = glob($this->dir_name."/0/*A.dic"); - natsort($tiers); - $this->active_tiers = []; - foreach ($tiers as $tier) { - $path = pathinfo($tier); - array_unshift($this->active_tiers, - substr($path["filename"], 0, -1)); - } + $this->calculateActiveTiers(); } $this->parent_archive_bundle = $parent_archive_bundle; } + /** + * + */ + public function calculateActiveTiers() + { + $this->read_tier = $this->max_tier; + $tiers = glob($this->dir_name."/0/*A.dic"); + natsort($tiers); + $this->active_tiers = []; + foreach ($tiers as $tier) { + $path = pathinfo($tier); + 
array_unshift($this->active_tiers, + substr($path["filename"], 0, -1)); + } + } /** * Makes dictionary sub-directories for each of the 256 possible first * hash characters that crawHash in raw mode code output. @@ -205,7 +213,7 @@ class IndexDictionary implements CrawlConstants public function addShardDictionary($index_shard, $callback = null) { $out_slot = "A"; - if (file_exists($this->dir_name."/0/0A.dic")) { + if (file_exists($this->dir_name . "/0/0A.dic")) { $out_slot ="B"; } crawlLog("Adding shard data to index dictionary files..."); @@ -273,10 +281,10 @@ class IndexDictionary implements CrawlConstants $callback->join(); } $out_slot = "A"; - if (file_exists($this->dir_name."/0/".($tier + 1)."A.dic")) { + if (file_exists($this->dir_name . "/0/" . ($tier + 1) . "A.dic")) { $out_slot ="B"; } - crawlLog("..Merging index $tier to ".($tier +1).$out_slot); + crawlLog("..Merging index $tier to " . ($tier +1) . $out_slot); $this->mergeTier($tier, $out_slot); $tier++; if ($tier > $this->max_tier) { @@ -285,6 +293,7 @@ class IndexDictionary implements CrawlConstants serialize($this->max_tier)); } } + $this->calculateActiveTiers(); crawlLog("...Done Incremental Merging of Index Dictionary Tiers"); return true; } @@ -725,7 +734,10 @@ class IndexDictionary implements CrawlConstants * $start_generation * @param bool $with_remaining_total * @return mixed an array of entries of the form - * generation, first offset, last offset, count + * generation, first offset, last offset, count, matched_key + * If also have with remaining true, then get a pair, with second + * element as above and first element the estimated total number of + * of docs */ public function getWordInfo($word_id, $raw = false, $shift = 0, $threshold = -1, $start_generation = -1, $num_distinct_generations = -1, @@ -798,8 +810,8 @@ class IndexDictionary implements CrawlConstants * to return information about * @return mixed a pair(total_count, max_found_generation, * an array of entries of the form - * 
generation, first offset, last offset, count) or false if - * no data + * generation, first offset, last offset, count, matched_key) or + * false if no data */ public function getWordInfoTier($word_id, $raw, $tier, $shift = 0, $threshold = -1, $start_generation = -1, $num_distinct_generations = -1) diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 93f37dd4e..bb87d56cf 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -521,6 +521,8 @@ class IndexShard extends PersistentStructure implements $this->mergeWordPostingsToString(); $this->packWords(null); $this->outputPostingLists(); + } else if ($this->read_only_from_disk && empty($this->num_docs)) { + $this->getShardHeader(); } $num_docs_so_far = 0; $results = []; @@ -1247,9 +1249,9 @@ class IndexShard extends PersistentStructure implements crawlLog("Saving index shard .. done merge postings to string"); } $this->prepareWordsAndPrefixes($with_logging); - if ($with_logging) { - crawlLog("Saving index shard .. make prefixes"); - } + if ($with_logging) { + crawlLog("Saving index shard .. 
make prefixes"); + } $header = pack("N*", $this->prefixes_len, $this->words_len, $this->word_docs_len, @@ -1322,6 +1324,7 @@ class IndexShard extends PersistentStructure implements if($with_logging) { crawlLog("..without dictionary version of shard header written"); } + $this->packWords(null, $with_logging); $remaining = $this->word_docs_len; $offset = 0; $buffer_size = 16 * self::SHARD_BLOCK_SIZE; @@ -1747,7 +1750,7 @@ class IndexShard extends PersistentStructure implements */ public function getShardHeader() { - if (isset($this->num_docs) && $this->num_docs > 0) { + if (!empty($this->num_docs)) { return true; // if $this->num_docs > 0 assume have read in } $header = substr($this->readBlockShardAtOffset(0, false), diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php index 49dd94d30..44412ebde 100755 --- a/src/library/LocaleFunctions.php +++ b/src/library/LocaleFunctions.php @@ -100,41 +100,40 @@ function guessLocale() */ function guessLocaleFromString($phrase_string, $locale_tag = null) { - $original_phrase_string = $phrase_string; + $original_phrase_string = mb_substr($phrase_string, 0, + C\AD_HOC_TITLE_LENGTH); $locale_tag = ($locale_tag == null) ? 
getLocaleTag() : $locale_tag; $sub = C\PUNCT."|[0-9]|\s"; $phrase_string = preg_replace('/'.$sub.'/u', "", $phrase_string); $phrase_string = mb_convert_encoding($phrase_string, "UTF-32", "UTF-8"); $len = strlen($phrase_string); - $guess['zh-CN'] = 0; - $guess['ru'] = 0; - $guess['he'] = 0; - $guess['ar'] = 0; - $guess['th'] = 0; - $guess['ja'] = 0; - $guess['ko'] = 0; + $guess = ['ar' => 0, 'he' => 0, 'hi' => 0, 'ko' => 0, 'ja' => 0, 'ru' => 0, + 'th' => 0, 'zh-CN' => 0]; $guess[$locale_tag] = 1; for ($i = 0; $i < $len; $i += 4) { $start = ord($phrase_string[$i+2]); $next = ord($phrase_string[$i+3]); - if ($start >= 78 && $start <= 159) { - $guess['zh-CN'] += 4; - } else if ($start == 4 || ($start == 5 && $next < 48)) { - $guess['ru']++; - } else if ($start == 5 && $next >= 144) { - $guess['he'] += 2; - } else if ($start >= 6 && $start <= 7) { + if ($start >= 6 && $start <= 7) { if ($locale_tag == "fa") { $guess[$locale_tag] +=2; } else { $guess['ar'] += 2; } - } else if ($start == 14 && $next < 128) { - $guess['th'] += 2; - } else if ($start >= 48 && $start <= 49) { - $guess['ja'] += 3; + } else if ($start == 5 && $next >= 144) { + $guess['he'] += 2; + } else if (($start == 9 && $next < 128) || ($start == 168 && + $next >= 224)) { + $guess['hi'] += 2; } else if ($start == 17 || $start >= 172 && $start < 215) { $guess['ko'] += 2; + } else if ($start >= 48 && $start <= 49) { + $guess['ja'] += 3; + } else if ($start == 4 || ($start == 5 && $next < 48)) { + $guess['ru']++; + } else if ($start == 14 && $next < 128) { + $guess['th'] += 2; + } else if ($start >= 78 && $start <= 159) { + $guess['zh-CN'] += 4; } else if ($start == 0 && $next < 128) { $guess[$locale_tag]++; // assume ascii is from $locale_tag } diff --git a/src/library/NWordGrams.php b/src/library/NWordGrams.php index 2a54cac26..0348c8e05 100644 --- a/src/library/NWordGrams.php +++ b/src/library/NWordGrams.php @@ -200,7 +200,7 @@ class NWordGrams * @param int $ngram_type where in Wiki Dump to extract grams 
from * @param int $max_terms maximum number of n-grams to compute and put in * file - * @return int $num_ngrams_found count of bigrams in text file. + * @return int $num_ngrams_found count of n-grams in text file. */ public static function makeNWordGramsTextFile($wiki_file, $lang, $locale, $num_gram = 2, $ngram_type = self::PAGE_COUNT_WIKIPEDIA, @@ -310,6 +310,10 @@ class NWordGrams } $ngram_num_words = mb_substr_count($ngram, " ") + 1; + if ($lang == 'en' && preg_match( + '/^(a\s|the\s|of\s|if\s)/', $ngram)) { + $ngram_num_words--; + } if (($is_all && $ngram_num_words > 1) || (!$is_all && $ngram_num_words == $num_gram)) { @@ -330,7 +334,7 @@ class NWordGrams } if ($is_count_type && count($ngrams) > 4 * $max_terms && $max_terms > 0) { - echo "..pruning results to $max_terms many\n"; + echo "..pruning results to $max_terms terms.\n"; arsort($ngrams); $ngrams = array_slice($ngrams, 0, $max_terms); } diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 097963ce8..adf6437df 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -134,7 +134,7 @@ class PhraseParser $string = trim(substr($string, strlen($control_word) + 1)); } else { self::canonicalizePunctuatedTerms($string, $lang); - self::underscoreEntities($string, $lang); + self::hyphenateEntities($string, $lang); } $terms = self::stemCharGramSegment($string, $lang); $num = count($terms); @@ -154,9 +154,8 @@ class PhraseParser return $terms; } $tokenizer = self::getTokenizer($lang); - if (method_exists($tokenizer, "getQuestionMarker") && - stristr($whole_phrase, $tokenizer::getQuestionMarker()) - !== false) { + if (!empty($tokenizer::$question_token) && + stristr($whole_phrase, $tokenizer::$question_token) !== false) { $terms = [$whole_phrase, $terms[0]]; return $terms; } @@ -238,7 +237,7 @@ class PhraseParser 'QUESTION_ANSWER_EXTRACT' => 0]]; if (!isset(self::$programming_language_map[$lang])) { self::canonicalizePunctuatedTerms($string, $lang); - 
self::underscoreEntities($string, $lang); + self::hyphenateEntities($string, $lang); $phrase_list['TIMES']['CANONICALIZE'] = changeInMicrotime($start_time); } @@ -271,7 +270,7 @@ class PhraseParser return $phrase_list; } /** - * This functions tries to convert acronyms, e-mail, urls, etc into + * This method tries to convert acronyms, e-mail, urls, etc into * a format that does not involved punctuation that will be stripped * as we extract phrases. * @@ -282,31 +281,21 @@ class PhraseParser */ public static function canonicalizePunctuatedTerms(&$string, $lang = null) { - $acronym_pattern = "/\b[A-Za-z](\.\s*[A-Za-z])+(\.|\b)/"; + $acronym_pattern = "/\b\p{L}(\.\s*\p{L})+(\.|\b)/u"; $string = preg_replace_callback($acronym_pattern, function($matches) { - $result = "_".mb_strtolower( - mb_ereg_replace("\.\s*", "", $matches[0])); + $result = "_" . mb_ereg_replace("\.\s*", "", $matches[0]); return $result; }, $string); $ap = "(\'|\u{2019}|\u{02BC})"; - $ampersand_pattern = "/[A-Za-z]+". - "(\s*(\s({$ap}n|{$ap}N)\s|\&)\s*[A-Za-z])+/u"; + $ampersand_pattern = "/\p{L}+". + "(\s*(\s({$ap}n|{$ap}N)\s|\&)\s*\p{L})+/u"; $string = preg_replace_callback($ampersand_pattern, function($matches) { $ap = "(\'|\u{2019}|\u{02BC})"; - $result = mb_strtolower( - mb_ereg_replace("\s*(" . $ap . "n|" . $ap . "N|\&)\s*", - "_and_", $matches[0])); - return $result; - }, $string); - $contraction_pattern = "/\b[A-Za-z]+" . - "({$ap}[A-Za-z]+|\s*{$ap}\s*(s|t))\b/u"; - $string = preg_replace_callback($contraction_pattern, - function($matches) { - $result = mb_strtolower( - mb_ereg_replace("\s*\'|\u2019|\u02BC\s*", - "_ap_", $matches[0])); + $result = mb_ereg_replace( + "\s*(" . $ap . "n|" . $ap . 
"N|\&)\s*", + "_and_", $matches[0]); return $result; }, $string); $url_or_email_pattern = @@ -314,21 +303,16 @@ class PhraseParser '([A-Z0-9._%-]+\@[A-Z0-9.-]+\.[A-Z]{2,4})@i'; $string = preg_replace_callback($url_or_email_pattern, function($matches) { - $result = mb_ereg_replace("\.", "_d_",$matches[0]); - $result = mb_ereg_replace("\:", "_c_",$result); - $result = mb_ereg_replace("\/", "_s_",$result); - $result = mb_ereg_replace("\@", "_at_",$result); - $result = mb_ereg_replace("\[", "_bo_",$result); - $result = mb_ereg_replace("\]", "_bc_",$result); - $result = mb_ereg_replace("\(", "_po_",$result); - $result = mb_ereg_replace("\)", "_pc_",$result); - $result = mb_ereg_replace("\?", "_q_",$result); - $result = mb_ereg_replace("\=", "_e_",$result); - $result = mb_ereg_replace("\&", "_and_",$result); - $result = mb_strtolower($result); - return $result; + return preg_replace(['/\./', "/\:/", "/\//", "/\@/", + "/\[/", "/\]/", "/\(/", "/\)/", "/\?/", "/\=/", "/\&/"], + ["_d_", "_c_", "_s_", "_at_", "_bo_", "_bc_", "_po_", + "_pc_", "_q_", "_e_", "_and_"], $matches[0]); }, $string); + $tokenizer = self::getTokenizer($lang); + if (method_exists($tokenizer, "canonicalizePunctuatedTerms")) { + $tokenizer->canonicalizePunctuatedTerms($string); + } } /** * @param string& $string a string of words, etc which might involve such @@ -336,16 +320,16 @@ class PhraseParser * @param $lang a language tag to use as part of the canonicalization * process not used right now */ - public static function underscoreEntities(&$string, $lang = null) + public static function hyphenateEntities(&$string, $lang = null) { if (!$lang) { return; } - $string = mb_strtolower($string); $parts = preg_split("/\s+/u", $string); $parts = array_filter($parts); $num_parts = count($parts); $current_entity = ""; + $lower_entity = ""; $out_string = ""; $space = ""; $i = 0; @@ -355,24 +339,36 @@ class PhraseParser $j++; $current_entity = trim(implode(" ", array_slice($parts, $i, $j - $i))); + $lower_entity = 
mb_strtolower($current_entity); if ($j - $i > 1) { if (NWordGrams::ngramsContains( - $current_entity, $lang, "all")) { + $lower_entity, $lang, "all")) { $last_entity = $current_entity; + $lower_last_entity = $lower_entity; $k = $j; } if (!NWordGrams::ngramsContains( - $current_entity . "*", $lang, "all")) { - $out_string .= $space . str_replace(" ", "_", - trim($last_entity)); + $lower_entity . "*", $lang, "all")) { + $last_entity = trim($last_entity); + $lower_last_entity = trim($lower_last_entity); + // extra checks as Bloom filter not 100% + if (strpos(substr($last_entity, 4), " ") > 0 && + !preg_match('/\-|\(|\)|\[|\]|,|\./', $last_entity) && + NWordGrams::ngramsContains($lower_last_entity, $lang, + "all")) { + $last_entity = str_replace(" ", "-", $last_entity); + } + $out_string .= $space . $last_entity; $space = " "; $current_entity = ""; $last_entity = ""; + $lower_last_entity =""; $i = $k; $j = $k - 1; } } else { $last_entity = $current_entity; + $lower_last_entity = $lower_entity; $k = $j; } } @@ -456,6 +452,10 @@ class PhraseParser public static function stemCharGramSegment($string, $lang, $to_string = false) { + static $non_hyphens = ""; + if (empty($non_hyphens)) { + $non_hyphens = str_replace("-|", "", C\PUNCT); + } if (isset(self::$programming_language_map[$lang])) { mb_internal_encoding("UTF-8"); $tokenizer_name = self::$programming_language_map[$lang] . @@ -467,7 +467,7 @@ class PhraseParser if ($lang == "hi") { $string = preg_replace('/(,:)\p{P}/u', "", $string); } - $string = mb_ereg_replace("\s+|" . C\PUNCT, " ", $string); + $string = mb_ereg_replace("\s+|$non_hyphens", " ", $string); $terms = self::segmentSegment($string, $lang); $terms = self::charGramTerms($terms, $lang); $terms = self::stemTerms($terms, $lang); @@ -812,6 +812,9 @@ class PhraseParser * not contain spaces), this function segments them according to the given * locales segmenter * + * Note: this method is not used when trying to extract keywords from urls. 
+ * Instead, UrlParser::getWordsInHostUrl($url) is used. + * * @param string $segment string to split into terms * @param string $lang IANA tag to look up segmenter under * from some other language @@ -819,21 +822,24 @@ class PhraseParser */ public static function segmentSegment($segment, $lang) { - if ($segment == "") { + static $non_hyphens = ""; + if (empty($non_hyphens)) { + $non_hyphens = str_replace("-|", "", C\PUNCT); + } + if (empty($segment) || empty($lang)) { return []; } + $segment_obj = self::getTokenizer($lang); $term_string = ""; - if ($lang != null) { - $segment_obj = self::getTokenizer($lang); - } else { - $segment_obj = null; - } - if ($segment_obj != null && method_exists($segment_obj, "segment")) { + if (!empty($segment_obj) && method_exists($segment_obj, "segment") + && strpos($segment, '-') === false) { $term_string .= $segment_obj->segment($segment); } else { $term_string = $segment; } - $terms = mb_split("\s+", trim($term_string)); + $terms = preg_split("/(\s|$non_hyphens)+/u", + mb_strtolower(trim($term_string))); + $terms = array_values(array_filter($terms)); return $terms; } /** diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index 88939871e..14d224e1a 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -499,11 +499,11 @@ class UrlParser return ""; } array_pop($host_parts); // get rid of tld - if (stristr($host_parts[0],"www")) { + if (stristr($host_parts[0], "www")) { array_shift($host_parts); } $words = array_merge($words, $host_parts); - $word_string = " ".implode(" ", $words). " "; + $word_string = " " . implode(" ", $words). 
" "; return $word_string; } /** diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php index 1aeb650cb..7f738dafe 100755 --- a/src/library/processors/HtmlProcessor.php +++ b/src/library/processors/HtmlProcessor.php @@ -156,10 +156,11 @@ class HtmlProcessor extends TextProcessor $location = self::relCanonical($dom, $url); if ($location) { $summary[self::LINKS] = []; - $summary[self::LINKS][$location] = "location:".$url; + $summary[self::LINKS][$location] = "location:" . $url; $summary[self::LOCATION] = true; if (!$summary[self::DESCRIPTION]) { - $summary[self::DESCRIPTION].=$url." => ".$location; + $summary[self::DESCRIPTION] .= + $url." => ".$location; } if (!$summary[self::TITLE]) { $summary[self::TITLE] = $url; diff --git a/src/library/processors/TextProcessor.php b/src/library/processors/TextProcessor.php index 901e4ff02..0f4be9cca 100755 --- a/src/library/processors/TextProcessor.php +++ b/src/library/processors/TextProcessor.php @@ -93,17 +93,18 @@ class TextProcessor extends PageProcessor { $summary = null; if (is_string($page)) { - $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', + $remove_styles_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', $page); - $dom = self::dom($dom_page); + $dom = self::dom($remove_styles_page); $summary[self::TITLE] = ""; - $summary[self::LANG] = self::calculateLang($dom_page); + $summary[self::LANG] = self::calculateLang($remove_styles_page); list($summary[self::DESCRIPTION], $summary[self::WORD_CLOUD]) = - $this->summarizer->getSummary($dom, $dom_page, + $this->summarizer->getSummary($dom, $remove_styles_page, $summary[self::LANG]); - $summary[self::LINKS] = self::extractHttpHttpsUrls($dom_page); - $summary[self::PAGE] = "<html><body><div><pre>". - strip_tags($dom_page)."</pre></div></body></html>"; + $summary[self::LINKS] = self::extractHttpHttpsUrls( + $remove_styles_page); + $summary[self::PAGE] = "<html><body><div><pre>" . + strip_tags($remove_styles_page) . 
"</pre></div></body></html>"; } return $summary; } @@ -121,7 +122,9 @@ class TextProcessor extends PageProcessor { if ($url != null) { $lang = UrlParser::getLang($url); - if ($lang != null) { return $lang; } + if ($lang && !in_array($lang, ["en", "en-US"])) { + return $lang; + } } if ($sample_text != null) { $lang = L\guessLocaleFromString($sample_text); diff --git a/src/library/summarizers/CentroidSummarizer.php b/src/library/summarizers/CentroidSummarizer.php index fc743a7b7..ccba98f85 100644 --- a/src/library/summarizers/CentroidSummarizer.php +++ b/src/library/summarizers/CentroidSummarizer.php @@ -44,29 +44,6 @@ use seekquarry\yioop\library\processors\PageProcessor; */ class CentroidSummarizer extends Summarizer { - /** - * Number of bytes in a sentence before it is considered long - * We use strlen rather than mbstrlen. This might actually be - * a better metric of the potential of a sentence to have info. - */ - const LONG_SENTENCE_LEN = 50; - /** - * Number of sentences in a document before only consider longer - * sentences in centroid - */ - const LONG_SENTENCE_THRESHOLD = 100; - /** - * Number of distinct terms to use in generating summary - */ - const MAX_DISTINCT_TERMS = 1000; - /** - * Number of words in word cloud - */ - const WORD_CLOUD_LEN = 5; - /** - * Number of nonzero centroid components - */ - const CENTROID_COMPONENTS = 50; /** * whether to output the results to the disk or not */ @@ -92,94 +69,20 @@ class CentroidSummarizer extends Summarizer alphanumerics. 
*/ $formatted_doc = self::formatDoc($page); - $stop_obj = PhraseParser::getTokenizer($lang); - /* Splitting into sentences */ $out_sentences = self::getSentences($page); - $n = count($out_sentences); - $sentences = []; - if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) { - for ($i = 0; $i < $n; $i++ ) { - $sentences[$i] = $stop_obj->stopwordsRemover( - self::formatDoc($out_sentences[$i])); - } - } else { - $sentences = $out_sentences; - } + $stop_obj = PhraseParser::getTokenizer($lang); + $sentences = self::removeStopWords($out_sentences, $stop_obj); /* Splitting into terms */ $terms = []; foreach ($sentences as $sentence) { $terms = array_merge($terms, PhraseParser::segmentSegment($sentence, $lang)); } + $n = count($out_sentences); $terms = array_filter($terms); - $terms_counts = array_count_values($terms); - arsort($terms_counts); - $terms_counts = array_slice($terms_counts, 0, - self::MAX_DISTINCT_TERMS); - $terms = array_unique(array_keys($terms_counts)); - $t = count($terms); - if ($t == 0) { - return ["", ""]; - } - /* Initialize Nk [Number of sentences the term occurs] */ - $nk = []; - $nk = array_fill(0, $t, 0); - $nt = []; - /* Count TF for each word */ - for ($i = 0; $i < $n; $i++) { - for ($j = 0; $j < $t; $j++) { - if (strpos($sentences[$i], $terms[$j]) !== false) { - $nk[$j]++; - } - } - } - /* Calculate weights of each term for every sentence */ - $w = []; - $idf = []; - $idf_temp = 0; - for ($k = 0; $k < $t; $k++) { - if ($nk[$k] == 0) { - $idf_temp = 0; - $tmp = 0; - } else { - $idf_temp = $n / $nk[$k]; - $tmp = log($idf_temp); - } - $idf[$k] = $tmp; - } - /* Count TF for finding centroid */ - $wc = []; - $max_nt = -1; - $b = "\b"; - if (in_array($lang, ["zh-CN", "ja", "ko"])) { - $b = ""; - } - set_error_handler(null); - for ($j = 0; $j < $t; $j++) { - $quoted = preg_quote($terms[$j], '/'); - $nt = @preg_match_all("/$b" . $quoted . 
"$b/", $formatted_doc, - $matches); //$matches included for backwards compatibility - $wc[$j] = $nt * $idf[$j]; - if (is_nan($wc[$j]) || is_infinite($wc[$j])) { - $wc[$j] = 0; - } - } - set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); - /* Calculate centroid */ - arsort($wc); - $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true); - /* Initializing centroid weight array by 0 */ - $wc = array_fill(0, $t, 0); - /* Word cloud */ - $i = 0; - $word_cloud = []; - foreach ($centroid as $key => $value) { - $wc[$key] = $value; - if ($i < self::WORD_CLOUD_LEN) { - $word_cloud[$i] = $terms[$key]; - } - $i++; - } + list($word_cloud, $wc, $idf) = + self::wordCloudAndCountsFromTermsSentences($terms, $sentences, + $lang); if (strlen($page) < PageProcessor::$max_description_len || $n == 1) { //if input short only use above to get a word cloud @@ -212,15 +115,16 @@ class CentroidSummarizer extends Summarizer } arsort($sim); /* Getting how many sentences should be there in summary */ - $top = self::summarySentenceCount($out_sentences, $sim); + $top = self::numSentencesForSummary($out_sentences, $sim); $sum_array = []; $sum_array = array_keys(array_slice($sim, 0, $top - 1, true)); sort($sum_array); $summary = ''; + $eos = ($lang == 'hi') ? "।" : "."; //default end of sentence symbol foreach ($sum_array as $key) { $compressed_sentence = PhraseParser::compressSentence($out_sentences[$key], $lang); - $summary .= $compressed_sentence . ". "; + $summary .= rtrim($compressed_sentence, $eos) . "$eos "; } if (self::OUTPUT_TO_FILE) { $output_file_contents = ""; @@ -228,7 +132,8 @@ class CentroidSummarizer extends Summarizer $compressed_sentence = PhraseParser::compressSentence($out_sentences[$key], $lang); - $output_file_contents .= $compressed_sentence . ".\r\n"; + $output_file_contents .= rtrim($compressed_sentence, + $eos) . "$eos\r\n"; } file_put_contents(C\WORK_DIRECTORY . 
self::OUTPUT_FILE_PATH, $output_file_contents); @@ -245,7 +150,7 @@ class CentroidSummarizer extends Summarizer * similarity score to centroid (sorted from highest to lowest score). * @return int number of sentences */ - public static function summarySentenceCount($sentences, $sim) + public static function numSentencesForSummary($sentences, $sim) { $top = null; $count = 0; @@ -258,101 +163,4 @@ class CentroidSummarizer extends Summarizer } return $top; } - /** - * Breaks any content into sentences by splitting it on spaces or carriage - * returns - * @param string $content complete page. - * @return array array of sentences from that content. - */ - public static function getSentences($content) - { - $lines = preg_split( - '/(\.|\||\!|\?|!|?|。)\s+|(\n|\r)(\n|\r)+|\s{5}/ui', - $content, 0, PREG_SPLIT_NO_EMPTY); - $out = []; - $sentence = ""; - $count = 0; - $theshold_factor = 1; - foreach ($lines as $line) { - $sentence .= " " . $line; - if (strlen($line) < 2) { - continue; - } - if ($count < self::LONG_SENTENCE_THRESHOLD || - strlen($sentence) > $theshold_factor * - self::LONG_SENTENCE_LEN){ - $sentence = preg_replace("/\s+/ui", " ", $sentence); - $out[] = trim($sentence); - $count++; - $theshold_factor = - pow(1.5, floor($count/self::LONG_SENTENCE_THRESHOLD)); - } - $sentence = ""; - } - if (trim($sentence) != "") { - $sentence = preg_replace("/\s+/ui", " ", $sentence); - $out[] = trim($sentence); - } - return $out; - } - /** - * Formats the sentences to remove all characters except words, - * digits and spaces - * @param string $sent complete page. - * @return string formatted sentences. - */ - public static function formatSentence($sent) - { - $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u', - ' ', mb_strtolower($sent))); - return $sent; - } - /** - * Formats the document to remove carriage returns, hyphens and digits - * as we will not be using digits in word cloud. - * The formatted document generated by this function is only used to - * compute centroid. 
- * @param string $content formatted page. - * @return string formatted document. - */ - public static function formatDoc($content) - { - $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/ui']; - $content = preg_replace($substitute, ' ', mb_strtolower($content)); - return $content; - } - /** - * This function does an additional processing on the page - * such as removing all the tags from the page - * @param string $page complete page. - * @return string processed page. - */ - public static function pageProcessing($page) - { - $substitutions = ['@<script[^>]*?>.*?</script>@si', - '/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si', - '@<style[^>]*?>.*?</style>@si', '/[\^\(\)]/', - '/\[(.*?)\]/', '/\t\n/' - ]; - $page = preg_replace($substitutions, ' ', $page); - $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page); - $changed = false; - if ($new_page != $page) { - $changed = true; - $page = $new_page; - } - $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|". - "p|address|section)\s*\>/", "\n\n", $page); - $page = preg_replace("/\<a/", " <a", $page); - $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/", " ", $page); - $page = preg_replace("/\</", " <", $page); - $page = strip_tags($page); - if ($changed) { - $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page); - } - $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page); - $page = preg_replace("/\n\n\n+/", "\n\n", $page); - $page = preg_replace('/\s\s+/', ' ', $page); - return $page; - } } diff --git a/src/library/summarizers/CentroidWeightedSummarizer.php b/src/library/summarizers/CentroidWeightedSummarizer.php index 2b5892206..7f384ab88 100644 --- a/src/library/summarizers/CentroidWeightedSummarizer.php +++ b/src/library/summarizers/CentroidWeightedSummarizer.php @@ -45,29 +45,6 @@ use seekquarry\yioop\library\processors\PageProcessor; */ class CentroidWeightedSummarizer extends Summarizer { - /** - * Number of bytes in a sentence before it is considered long - * We use strlen rather than 
mbstrlen. This might actually be - * a better metric of the potential of a sentence to have info. - */ - const LONG_SENTENCE_LEN = 50; - /** - * Number of sentences in a document before only consider longer - * sentences in centroid - */ - const LONG_SENTENCE_THRESHOLD = 100; - /** - * Number of distinct terms to use in generating summary - */ - const MAX_DISTINCT_TERMS = 1000; - /** - * Number of words in word cloud - */ - const WORD_CLOUD_LEN = 5; - /** - * Number of nonzero centroid components - */ - const CENTROID_COMPONENTS = 50; /** * whether to output the results to the disk or not */ @@ -93,272 +70,31 @@ class CentroidWeightedSummarizer extends Summarizer /* Format the document to remove characters other than periods and alphanumerics. */ - $page = mb_strtolower($page); $formatted_doc = self::formatDoc($page); /* Splitting into sentences */ $out_sentences = self::getSentences($page); $stop_obj = PhraseParser::getTokenizer($lang); $sentences = self::removeStopWords($out_sentences, $stop_obj); - $sentence_array = self::splitSentences($sentences, $lang); - $terms = $sentence_array[0]; - $tf_per_sentence = $sentence_array[1]; - $tf_per_sentence_normalized = $sentence_array[2]; + list($terms, $tf_per_sentence, $tf_per_sentence_normalized) = + self::computeTermsAndStatistics($sentences, $lang); $tf_average_sentence = self::getAverageSentence($tf_per_sentence_normalized); $tf_dot_product_per_sentence = self::getDotProduct($tf_per_sentence_normalized, $tf_average_sentence); - usort($tf_dot_product_per_sentence, 'self::sortInAscendingOrder'); + usort($tf_dot_product_per_sentence, function($a, $b) { + return $b > $a ? 
1 : -1; + }); $summary = self::getSummaryFromProducts($tf_dot_product_per_sentence, $out_sentences, $lang); - $n = count($out_sentences); $terms = array_filter($terms); - $terms_counts = array_count_values($terms); - arsort($terms_counts); - $terms_counts = array_slice($terms_counts, 0, - self::MAX_DISTINCT_TERMS); - $terms = array_unique(array_keys($terms_counts)); - $t = count($terms); - if ($t == 0) { - return ["", ""]; - } - /* Initialize Nk [Number of sentences the term occurs] */ - $nk = []; - $nk = array_fill(0, $t, 0); - $nt = []; - /* Count TF for each word */ - for ($i = 0; $i < $n; $i++) { - for ($j = 0; $j < $t; $j++) { - if (strpos($sentences[$i], $terms[$j]) !== false) { - $nk[$j]++; - } - } - } - /* Calculate weights of each term for every sentence */ - $w = []; - $idf = []; - $idf_temp = 0; - for ($k = 0; $k < $t; $k++) { - if ($nk[$k] == 0) { - $idf_temp = 0; - $tmp = 0; - } else { - $idf_temp = $n / $nk[$k]; - $tmp = log($idf_temp); - } - $idf[$k] = $tmp; - } - /* Count TF for finding centroid */ - $wc = []; - $max_nt = -1; - $b = "\b"; - if (in_array($lang, ["zh-CN", "ja", "ko"])) { - $b = ""; - } - set_error_handler(null); - for ($j = 0; $j < $t; $j++) { - $quoted = preg_quote($terms[$j]); - $nt = @preg_match_all("/$b" . $quoted . "$b/", $formatted_doc, - $matches); //$matches included for backwards compatibility - $wc[$j] = $nt * $idf[$j]; - if (is_nan($wc[$j]) || is_infinite($wc[$j])) { - $wc[$j] = 0; - } - } - set_error_handler(C\NS_CONFIGS . 
"yioop_error_handler"); - /* Calculate centroid */ - arsort($wc); - $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true); - /* Initializing centroid weight array by 0 */ - $wc = array_fill(0, $t, 0); - /* Word cloud */ - $i = 0; - $word_cloud = []; - foreach ($centroid as $key => $value) { - $wc[$key] = $value; - if ($i < self::WORD_CLOUD_LEN) { - $word_cloud[$i] = $terms[$key]; - } - $i++; - } + list($word_cloud,) = + self::wordCloudAndCountsFromTermsSentences($terms, $sentences, + $lang); /* Summary of text summarization */ return [$summary, $word_cloud]; } - /** - * Calculates how many sentences to put in the summary to match the - * MAX_DESCRIPTION_LEN. - * - * @param array $sentences sentences in doc in their original order - * @param array $sim associative array of sentence-number-in-doc => - * similarity score to centroid (sorted from highest to lowest score). - * @return int number of sentences - */ - public static function summarySentenceCount($sentences, $sim) - { - $top = null; - $count = 0; - foreach ($sim as $key => $value) - { - if ($count < PageProcessor::$max_description_len) { - $count += strlen($sentences[$key]); - $top++; - } - } - return $top; - } - /** - * Breaks any content into sentences by splitting it on spaces or carriage - * returns - * @param string $content complete page. - * @return array array of sentences from that content. - */ - public static function getSentences($content) - { - $lines = preg_split( - '/(\.|\||\!|\?|!|?|。)\s+|(\n|\r)(\n|\r)+|\s{5}/', - $content, 0, PREG_SPLIT_NO_EMPTY); - $out = []; - $sentence = ""; - $count = 0; - $theshold_factor = 1; - $threshold = self::LONG_SENTENCE_THRESHOLD; - foreach ($lines as $line) { - $sentence .= " " . 
$line; - if (strlen($line) < 2) { - continue; - } - if ($count < $threshold || - strlen($sentence) > $theshold_factor * - self::LONG_SENTENCE_LEN) { - $sentence = preg_replace("/\s+/ui", " ", $sentence); - $out[] = trim($sentence); - $count++; - $theshold_factor = - pow(1.5, floor($count/$threshold)); - } - $sentence = ""; - } - if (trim($sentence) != "") { - $sentence = preg_replace("/\s+/ui", " ", $sentence); - $out[] = trim($sentence); - } - return $out; - } - /** - * Formats the sentences to remove all characters except words, - * digits and spaces - * @param string $sent complete page. - * @return string formatted sentences. - */ - public static function formatSentence($sent) - { - $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u', - ' ', $sent)); - return $sent; - } - /** - * Formats the document to remove carriage returns, hyphens and digits - * as we will not be using digits in word cloud. - * The formatted document generated by this function is only used to - * compute centroid. - * @param string $content formatted page. - * @return string formatted document. - */ - public static function formatDoc($content) - { - $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/']; - $content = preg_replace($substitute, ' ', $content); - return $content; - } - /** - * This function does an additional processing on the page - * such as removing all the tags from the page - * @param string $page complete page. - * @return string processed page. 
- */ - public static function pageProcessing($page) - { - $substitutions = ['@<script[^>]*?>.*?</script>@si', - '/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si', - '@<style[^>]*?>.*?</style>@si', '/[\^\(\)]/', - '/\[(.*?)\]/', '/\t\n/' - ]; - $page = preg_replace($substitutions, ' ', $page); - $page = preg_replace('/\s{2,}/', ' ', $page); - $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page); - $changed = false; - if ($new_page != $page) { - $changed = true; - $page = $new_page; - } - $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|". - "p|address|section)\s*\>/", "\n\n", $page); - $page = preg_replace("/\<a/", " <a", $page); - $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/", " ", $page); - $page = preg_replace("/\</", " <", $page); - $page = strip_tags($page); - if ($changed) { - $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page); - } - $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page); - $page = preg_replace("/\n\n\n+/", "\n\n", $page); - return $page; - } - /** - * Calculates an array with key terms and values their frequencies - * based on a supplied sentence - * - * @param array $terms the list of all terms in the doc - * @param array $sentence the sentences in the doc - * @return array a two dimensional array where the word is the key and - * the frequency is the value - */ - public static function getTermFrequencies($terms, $sentence) - { - $t = count($terms); - $nk = []; - $nk = array_fill(0, $t, 0); - $nt = []; - for ($j = 0; $j < $t; $j++) { - $nk[$j] += preg_match_all("/\b" . preg_quote($terms[$j],'/') . - "\b/iu", $sentence, $matches); - } - $term_frequencies = []; - for ($i = 0; $i < count($nk); $i++ ) { - $term_frequencies[$terms[$i]] = $nk[$i]; - } - return $term_frequencies; - } - /** - * Normalize the term frequencies based on the sum of the squares. 
- * @param array $term_frequencies the array with the terms as the key - * and its frequency as the value - * @return array array of term frequencies normalized - */ - public static function normalizeTermFrequencies($term_frequencies) - { - $sum_of_squares = 0; - $result_sum = 0; - if (count($term_frequencies) == 0) { - $result = []; - } else { - foreach ($term_frequencies as $k => $v) { - $sum_of_squares += ($v * $v); - } - $square_root = sqrt($sum_of_squares); - foreach ($term_frequencies as $k => $v) { - if ($square_root == 0) { - $result[$k] = 0; - } else { - $result[$k] = ($v / $square_root); - } - } - foreach ($result as $k => $v) { - $result_sum += $v; - } - } - return $result; - } + /** * Get the average sentence by adding up the values from each column and * dividing it by the rows in the array. @@ -404,55 +140,25 @@ class CentroidWeightedSummarizer extends Summarizer $result = []; $count = 0; foreach ($term_frequencies_normalized as $k => $v) { - $tempResult = 0; + $temp_result = 0; foreach ($v as $l => $w) { - if (@array_key_exists($l, $average_sentence)) { - $tempResult = $tempResult + - ($average_sentence[$l] * $w); + if (!empty($average_sentence[$l])) { + $temp_result += ($average_sentence[$l] * $w); } } - $result[$count] = $tempResult; + $result[$count] = $temp_result; $count++; } return $result; } /** - * Compare the two values and return if b is greater than a - * @param string $a the first value to compare - * @param string $b the second value to compare - * @return boolean if b is greater than a - */ - public static function sortInAscendingOrder($a, $b) - { - return $b > $a ? 
1 : -1; - } - /** - * Returns a new array of sentences without the stop words - * @param array $sentences the array of sentences to process - * @param object $stop_obj the class that has the stopworedRemover method - * @return array a new array of sentences without the stop words - */ - public static function removeStopWords($sentences, $stop_obj) - { - $n = count($sentences); - $result = []; - if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) { - for ($i = 0; $i < $n; $i++ ) { - $result[$i] = $stop_obj->stopwordsRemover( - self::formatDoc($sentences[$i])); - } - } else { - $result = $sentences; - } - return $result; - } - /** - * Split up the sentences and return an array with all of the needed parts + * Splits sentences into terms and returns [array of terms, + * array term frequencies, array normalized term frequencies] * @param array $sentences the array of sentences to process * @param string $lang the current locale * @return array an array with all of the needed parts */ - public static function splitSentences($sentences, $lang) + public static function computeTermsAndStatistics($sentences, $lang) { $result = []; $terms = []; @@ -468,10 +174,7 @@ class CentroidWeightedSummarizer extends Summarizer self::normalizeTermFrequencies($tf_per_sentence[$tf_index]); $tf_index++; } - $result[0] = $terms; - $result[1] = $tf_per_sentence; - $result[2] = $tf_per_sentence_normalized; - return $result; + return [$terms, $tf_per_sentence, $tf_per_sentence_normalized]; } /** * Split up the sentences and return an array with all of the needed parts @@ -489,9 +192,9 @@ class CentroidWeightedSummarizer extends Summarizer $result = ""; $result_length = 0; $i = 0; + $eos = ($lang == 'hi') ? 
"।" : "."; //default end of sentence symbol foreach ($tf_dot_product_per_sentence as $k => $v) { - $sentence = PhraseParser::compressSentence($sentences[$k], - $lang); + $sentence = PhraseParser::compressSentence($sentences[$k], $lang); if ($result_length + strlen($sentence) > PageProcessor::$max_description_len) { break; @@ -499,15 +202,15 @@ class CentroidWeightedSummarizer extends Summarizer $result_length += strlen($sentence); if ($i == 0) { $i = 1; - $result = $sentence . ". "; + $result = rtrim($sentence, $eos) . "$eos "; if (self::OUTPUT_TO_FILE) { - $output_file_contents = $sentence . ". "; + $output_file_contents = $sentence . "$eos "; } } else { - $result .= " " . $sentence . ". "; + $result .= " " . rtrim($sentence, $eos) . "$eos "; if (self::OUTPUT_TO_FILE) { $output_file_contents = $output_file_contents . - "\r\n" . $sentence . ". "; + "\r\n" . rtrim($sentence, $eos) . "$eos "; } } } diff --git a/src/library/summarizers/GraphBasedSummarizer.php b/src/library/summarizers/GraphBasedSummarizer.php index 87b27a562..408222d89 100644 --- a/src/library/summarizers/GraphBasedSummarizer.php +++ b/src/library/summarizers/GraphBasedSummarizer.php @@ -76,8 +76,8 @@ class GraphBasedSummarizer extends Summarizer $page = self::pageProcessing($page); $formatted_doc = self::formatDoc($page); //not filtering non-ascii characters - $sentences = self::getSentences($page . 
" ", true); - $sentences = self::removeStopWords($sentences, $lang); + $sentences_with_punctuation = self::getSentences($page); + $sentences = self::removeStopWords($sentences_with_punctuation, $lang); $sentences = self::removePunctuation($sentences); $sentences = PhraseParser::stemTermsK($sentences, $lang, true); $terms = self::getTerms($sentences, $lang); @@ -87,10 +87,9 @@ class GraphBasedSummarizer extends Summarizer $adjacency = self::computeAdjacency($term_frequencies_normalized, $sentences, $lang, $unmodified_doc); $p = self::getSentenceRanks($adjacency); - $sentences_with_punctuation = self::getSentences($page . " ", true); $summary = self::getFinalSummary($sentences_with_punctuation, $p, $lang); - return [$summary, []]; + return [$summary, self::wordCloudFromSummary($summary, $lang)]; } /** * Given as array of sentences and an array of their importance between 0 @@ -255,49 +254,6 @@ class GraphBasedSummarizer extends Summarizer } return $sentences; } - /** - * Remove the stop words from the array of sentences - * @param array $sentences the sentences in the doc - * @param string $lang locale tag for stemming - * @return array the array of sentences with the stop words removed - */ - public static function removeStopWords($sentences, $lang) - { - $n = count($sentences); - $stop_obj = PhraseParser::getTokenizer($lang); - if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) { - for ($i = 0; $i < $n; $i++ ) { - $sentences[$i] = $stop_obj->stopwordsRemover( - self::formatDoc($sentences[$i])); - } - } - return $sentences; - } - /** - * Calculate the term frequencies. 
- * @param array $terms the list of all terms in the doc - * @param array $sentences the sentences in the doc - * @return array a two dimensional array where the word is the key and - * the frequency is the value - */ - public static function getTermFrequencies($terms, $sentences) - { - $t = count($terms); - $n = count($sentences); - $nk = []; - $nk = array_fill(0, $t, 0); - $nt = []; - for ($j = 0; $j < $t; $j++) { - for ($i = 0; $i < $n; $i++) { - $nk[$j] += preg_match_all("/\b" . $terms[$j] . "\b/iu", - $sentences[$i], $matches); - } - } - for ($i = 0; $i < count($nk); $i++ ) { - $term_frequencies[$terms[$i]] = $nk[$i]; - } - return $term_frequencies; - } /** * Get the terms from an array of sentences * @param array $sentences the sentences in the doc @@ -323,62 +279,6 @@ class GraphBasedSummarizer extends Summarizer } return $terms; } - /** - * Breaks any content into sentences by splitting it on spaces or carriage - * returns - * @param string $content complete page. - * @param boolean $keep_punctuation whether to keep the punctuation or not. - * @return array array of sentences from that content. - */ - public static function getSentences($content, $keep_punctuation) - { - $result = []; - if ($keep_punctuation) { - $sentences = - preg_split('/(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)/u', - $content, 0, PREG_SPLIT_NO_EMPTY); - $n = count($sentences); - for ($i = 0; $i < $n; $i++ ) { - $sentences[$i] = trim($sentences[$i]); - } - $result = array_filter($sentences); - } else { - $sentences = preg_split( - '/(\.|\||\!|\?|!|?|。)\s+|(\n|\r)(\n|\r)+|\s{5}/u', - $content, 0, PREG_SPLIT_NO_EMPTY); - $result = array_filter($sentences); - } - return $result; - } - /** - * Normalize the term frequencies based on the sum of the squares. 
-     * @param array $term_frequencies the array with the terms as the key
-     *      and its frequency as the value
-     * @return array array of term frequencies normalized
-     */
-    public static function normalizeTermFrequencies($term_frequencies)
-    {
-        $sum_of_squares = 0;
-        $result_sum = 0;
-        foreach ($term_frequencies as $k => $v) {
-            $sum_of_squares += ($v * $v);
-        }
-        $square_root = sqrt($sum_of_squares);
-        if ($square_root == 0) {
-            $num_terms = count($term_frequencies);
-            if ($num_terms == 0) {
-                return false;
-            }
-            foreach ($term_frequencies as $k => $v) {
-                $result[$k] = 1/$num_terms;
-            }
-            return $result;
-        }
-        foreach ($term_frequencies as $k => $v) {
-            $result[$k] = ($v / $square_root);
-        }
-        return $result;
-    }
     /**
      * Calculate the distortion measure.
      * 1. Check each word in sentence1 to see if it exists in sentence2.
@@ -391,7 +291,7 @@
      * sum.
      * 3. Then check the sentence2 to find its not-common words
      * with sentence1, in case the word Y is not in sentence1,
-     * square the score of word Y and add to sum and increase
+     * square the score of word Y and add to sum and increase
      * the number of not-common words by one.
      * 4. At the end, calculate the distortion between sentence1 and
      * sentence2 by dividing sum by the number of not-common
@@ -407,8 +307,8 @@
         $term_frequencies, $lang, $doc)
     {
         $result = 0;
-        $first_sentence_split = preg_split('/ +/u', $first_sentence);
-        $second_sentence_split = preg_split('/ +/u', $second_sentence);
+        $first_sentence_split = preg_split('/\s+/u', $first_sentence);
+        $second_sentence_split = preg_split('/\s+/u', $second_sentence);
         $sum = 0;
         $non_common_words = 0;
         $n = count($first_sentence_split);
@@ -462,64 +362,4 @@
         }
         return $result;
     }
-    /**
-     * Formats the sentences to remove all characters except words,
-     * digits and spaces
-     * @param string $sent complete page.
-     * @return string formatted sentences. 
- */ - public static function formatSentence($sent) - { - $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u', - ' ', mb_strtolower($sent))); - return $sent; - } - /** - * Formats the document to remove carriage returns, hyphens and digits - * as we will not be using digits in word cloud. - * The formatted document generated by this function is only used to - * compute centroid. - * @param string $content formatted page. - * @return string formatted document. - */ - public static function formatDoc($content) - { - $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/u']; - $content = preg_replace($substitute, ' ', mb_strtolower($content)); - return $content; - } - /** - * This function does an additional processing on the page - * such as removing all the tags from the page - * @param string $page complete page. - * @return string processed page. - */ - public static function pageProcessing($page) - { - $substitutions = ['@<script[^>]*?>.*?</script>@si', - '/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si', - '@<style[^>]*?>.*?</style>@si', '/[\^\(\)]/', - '/\[(.*?)\]/', '/\t\n/' - ]; - $page = preg_replace($substitutions, ' ', $page); - $page = preg_replace('/\s{2,}/u', ' ', $page); - $new_page = preg_replace("/\<br\s*(\/)?\s*\>/u", "\n", $page); - $changed = false; - if ($new_page != $page) { - $changed = true; - $page = $new_page; - } - $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|". 
- "p|address|section)\s*\>/u", "\n\n", $page); - $page = preg_replace("/\<a/u", " <a", $page); - $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/u", " ", $page); - $page = preg_replace("/\</u", " <", $page); - $page = strip_tags($page); - if ($changed) { - $page = preg_replace("/(\r?\n[\t| ]*){2}/u", "\n", $page); - } - $page = preg_replace("/(\r?\n[\t| ]*)/u", "\n", $page); - $page = preg_replace("/\n\n\n+/u", "\n\n", $page); - return $page; - } } diff --git a/src/library/summarizers/ScrapeSummarizer.php b/src/library/summarizers/ScrapeSummarizer.php index e15353b3a..e179de9ca 100644 --- a/src/library/summarizers/ScrapeSummarizer.php +++ b/src/library/summarizers/ScrapeSummarizer.php @@ -64,7 +64,9 @@ class ScrapeSummarizer extends Summarizer */ public static function getSummary($dom, $page, $lang) { - return [self::description($dom, $page, $lang), []]; + $summary = self::description($dom, $page, $lang); + $word_cloud = self::wordCloudFromSummary($summary, $lang); + return [$summary, $word_cloud]; } /** * Returns descriptive text concerning a webpage based on its document diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php index 8d6f02796..10b1b3b6f 100644 --- a/src/library/summarizers/Summarizer.php +++ b/src/library/summarizers/Summarizer.php @@ -30,6 +30,9 @@ */ namespace seekquarry\yioop\library\summarizers; +use seekquarry\yioop\configs as C; +use seekquarry\yioop\library\PhraseParser; + /** For Yioop global defines used by subclasses*/ require_once __DIR__."/../../configs/Config.php"; /** @@ -40,6 +43,18 @@ require_once __DIR__."/../../configs/Config.php"; */ class Summarizer { + /** + * Number of distinct terms to use in generating summary + */ + const MAX_DISTINCT_TERMS = 1000; + /** + * Number of nonzero centroid components + */ + const CENTROID_COMPONENTS = 50; + /** + * Number of words in word cloud + */ + const WORD_CLOUD_LEN = 5; /** * The value to represent the weight for class one tags. 
*/ @@ -168,4 +183,235 @@ class Summarizer } return $result; } + /** + * Breaks any content into sentences by splitting it on spaces or carriage + * returns + * @param string $content complete page. + * @return array array of sentences from that content. + */ + public static function getSentences($content) + { + $content = preg_replace([ "/\n+(\.| |\t)+/u", + "/((\p{L}|\p{N}|\)|\}|\]){5,}\s?(\.|\|।|\!|\?|!|?|。))\s+/u", + "/।/u", "/(\n|\r)(\n|\r)+/", "/।./u"], ["\n", "$1.\n", "।\n\n", + "..\n", "।"], $content); + $lines = preg_split('/\.\n/', $content, 0, PREG_SPLIT_NO_EMPTY); + $lines = preg_replace("/\s+/", " ", $lines); + return $lines; + } + /** + * Formats the sentences to remove all characters except words, + * digits and spaces + * @param string $sentence complete page. + * @return string formatted sentences. + */ + public static function formatSentence($sentence) + { + $sentence = trim(preg_replace('/[^\p{L}\p{N}\s]+/u', + ' ', mb_strtolower($sentence))); + return $sentence; + } + /** + * Formats the document to remove carriage returns, hyphens and digits + * as we will not be using digits in word cloud. + * The formatted document generated by this function is only used to + * compute centroid. + * @param string $content formatted page. + * @return string formatted document. + */ + public static function formatDoc($content) + { + $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/\.+/']; + $content = preg_replace($substitute, ' ', $content); + return $content; + } + /** + * This function does an additional processing on the page + * such as removing all the tags from the page + * @param string $page complete page. + * @return string processed page. 
+ */ + public static function pageProcessing($page) + { + $substitutions = ['@<script[^>]*?>.*?</script>@si', + '/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si', + '@<style[^>]*?>.*?</style>@si', '/\t\n/', '/\s{2,}/' + ]; + $page = preg_replace($substitutions, ' ', $page); + $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page); + $changed = false; + if ($new_page != $page) { + $changed = true; + $page = $new_page; + } + $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|". + "p|address|section)\s*\>/iu", "\n\n", $page); + $page = preg_replace("/\<a/iu", " <a", $page); + $page = html_entity_decode($page); + $page = preg_replace("/\</", " <", $page); + $page = strip_tags($page); + if ($changed) { + $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page); + } + $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page); + $page = preg_replace("/\n\n\n+/", "\n\n", $page); + return $page; + } + /** + * Returns a new array of sentences without the stop words + * @param array $sentences the array of sentences to process + * @param object $stop_obj the class that has the stopworedRemover method + * @return array a new array of sentences without the stop words + */ + public static function removeStopWords($sentences, $stop_obj) + { + if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) { + $results = $stop_obj->stopwordsRemover($sentences); + } else { + $results = $sentences; + } + return $results; + } + /** + * Calculates an array with key terms and values their frequencies + * based on a supplied sentence + * + * @param array $terms the list of all terms in the doc + * @param array $sentence the sentences in the doc + * @return array a two dimensional array where the word is the key and + * the frequency is the value + */ + public static function getTermFrequencies($terms, $sentence) + { + $t = count($terms); + $nk = array_fill(0, $t, 0); + for ($j = 0; $j < $t; $j++) { + $nk[$j] += preg_match_all("/\b" . preg_quote($terms[$j], '/') . 
+ "\b/iu", $sentence); + } + return array_combine($terms, $nk); + } + /** + * Normalize the term frequency vector by dividing its entries by its + * L_2 norm. + * @param array $term_frequencies the array with the terms as the key + * and its frequency as the value + * @return array array of term frequencies normalized + */ + public static function normalizeTermFrequencies($term_frequencies) + { + $sum_of_squares = 0; + $result_sum = 0; + if (count($term_frequencies) == 0) { + return []; + } + foreach ($term_frequencies as $k => $v) { + $sum_of_squares += ($v * $v); + } + $square_root = sqrt($sum_of_squares); + foreach ($term_frequencies as $k => $v) { + if ($square_root == 0) { + $result[$k] = 0; + } else { + $result[$k] = ($v / $square_root); + } + } + return $result; + } + /** + * + */ + public static function wordCloudFromSummary($summary, $lang, + $term_frequencies = null) + { + if ($term_frequencies == null) { + $stop_obj = PhraseParser::getTokenizer($lang); + if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) { + $summary = $stop_obj->stopwordsRemover($summary); + } + $summary = mb_strtolower($summary); + $terms = PhraseParser::segmentSegment($summary, $lang); + $term_frequencies = self::getTermFrequencies($terms, $summary); + } + arsort($term_frequencies); + $top5 = array_slice($term_frequencies, 0 , 5); + return array_keys($top5); + } + /** + * + */ + public static function wordCloudAndCountsFromTermsSentences($terms, + $sentences, $lang) + { + $n = count($sentences); + $terms_counts = array_count_values($terms); + arsort($terms_counts); + $terms_counts = array_slice($terms_counts, 0, + self::MAX_DISTINCT_TERMS); + $terms = array_unique(array_keys($terms_counts)); + $t = count($terms); + if ($t == 0) { + return ["", ""]; + } + /* Initialize Nk [Number of sentences the term occurs] */ + $nk = []; + $nk = array_fill(0, $t, 0); + $nt = []; + /* Count TF for each word */ + for ($i = 0; $i < $n; $i++) { + for ($j = 0; $j < $t; $j++) { + if 
 (strpos($sentences[$i], $terms[$j]) !== false) {
+                    $nk[$j]++;
+                }
+            }
+        }
+        /* Calculate weights of each term for every sentence */
+        $w = [];
+        $idf = [];
+        $idf_temp = 0;
+        for ($k = 0; $k < $t; $k++) {
+            if ($nk[$k] == 0) {
+                $idf_temp = 0;
+                $tmp = 0;
+            } else {
+                $idf_temp = $n / $nk[$k];
+                $tmp = log($idf_temp);
+            }
+            $idf[$k] = $tmp;
+        }
+        /* Count TF for finding centroid */
+        $wc = [];
+        $max_nt = -1;
+        $b = "\b";
+        if (in_array($lang, ["zh-CN", "ja", "ko"])) {
+            $b = "";
+        }
+        set_error_handler(null);
+        for ($j = 0; $j < $t; $j++) {
+            $quoted = preg_quote($terms[$j]);
+            $nt = @preg_match_all("/$b" . $quoted . "$b/", implode(" ", $sentences),
+                $matches); //$matches included for backwards compatibility
+            $wc[$j] = $nt * $idf[$j];
+            if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
+                $wc[$j] = 0;
+            }
+        }
+        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
+        /* Calculate centroid */
+        arsort($wc);
+        $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
+        /* Initializing centroid weight array by 0 */
+        $wc = array_fill(0, $t, 0);
+        /* Word cloud */
+        $i = 0;
+        $word_cloud = [];
+        foreach ($centroid as $key => $value) {
+            $wc[$key] = $value;
+            if ($i < self::WORD_CLOUD_LEN) {
+                $word_cloud[$i] = $terms[$key];
+            }
+            $i++;
+        }
+        return [$word_cloud, $wc, $idf];
+    }
 }
diff --git a/src/locale/ar/resources/Tokenizer.php b/src/locale/ar/resources/Tokenizer.php
index 7709f64d5..cc3965737 100755
--- a/src/locale/ar/resources/Tokenizer.php
+++ b/src/locale/ar/resources/Tokenizer.php
@@ -61,12 +61,13 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from. 
- * @return string $page with no stop words + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words */ - public static function stopwordsRemover($page) + public static function stopwordsRemover($data) { - $stop_words = [ + static $stop_words = [ "ا", "أ", "،", "عشر", "عدد", "عدة","عشرة", "عدم", "عام", "عاما", "عن", "عند", "عندما", "على", "عليه", "عليها", "زيارة", "سنة", "سنوات", @@ -97,9 +98,12 @@ class Tokenizer "منها", "مليار", "لوكالة", "يكون", "يمكن", "مليون" ]; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '', - $page); - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; } /** * Computes the stem of an Arabic word @@ -193,4 +197,4 @@ class Tokenizer } return $word; } -} \ No newline at end of file +} diff --git a/src/locale/de/resources/Tokenizer.php b/src/locale/de/resources/Tokenizer.php index 545df2fbe..86d18de0c 100755 --- a/src/locale/de/resources/Tokenizer.php +++ b/src/locale/de/resources/Tokenizer.php @@ -112,17 +112,18 @@ class Tokenizer /** * Removes the stop words from the page (used for Word Cloud generation) * - * @param string $page the page to remove stop words from. 
- * @return string $page with no stop words + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words */ - public static function stopwordsRemover($page) + public static function stopwordsRemover($data) { - $stop_words = ['aber', 'alle', 'allem', 'allen', 'aller', 'alles', - 'als', 'as', 'also', 'am', 'an', 'ander', 'andere', 'anderem', - 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', - 'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis', 'bist', - 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das', - 'daß', 'derselbe', 'derselben', 'denselben', 'desselben', + static $stop_words = ['aber', 'alle', 'allem', 'allen', 'aller', + 'alles', 'als', 'as', 'also', 'am', 'an', 'ander', 'andere', + 'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', + 'anderr', 'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis', + 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', + 'das', 'daß', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe', 'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du', 'dies', 'diese', 'diesem', 'diesen', @@ -152,9 +153,12 @@ class Tokenizer 'wollte', 'würde', 'würden', 'zu', 'zum', 'zur', 'zwar', 'zwischen' ]; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '', - strtolower($page)); - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', $stop_words) . 
')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; } /** * Computes the stem of a German word diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index db7ecc92d..003ed54d8 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -73,22 +73,39 @@ class Tokenizer * can be answered using a question answer list * @string */ - public static $question_marker = "qqq"; + public static $question_token = "qqq"; /** - * List of verb-like parts of speech that might appear in lexicon file + * List of adjective-like parts of speech that might appear in lexicon file + * @array + */ + public static $adjective_type = ["JJ", "JJR", "JJS"]; + /** + * List of adverb-like parts of speech that might appear in lexicon file + * @array + */ + public static $adverb_type = ["RB", "RBR", "RBS"]; + /** + * List of conjunction-like parts of speech that might appear in lexicon + * file + * @array + */ + public static $conjunction_type = ["CC"]; + /** + * List of determiner-like parts of speech that might appear in lexicon + * file * @array */ - public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]; + public static $determiner_type = ["DT", "PDT"]; /** * List of noun-like parts of speech that might appear in lexicon file * @array */ - public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"]; + public static $noun_type = ["NN", "NNS", "NNP", "NNPS", "PRP"]; /** - * List of adjective-like parts of speech that might appear in lexicon file + * List of verb-like parts of speech that might appear in lexicon file * @array */ - public static $adjective_phrases = ["JJ", "JJR", "JJS"]; + public static $verb_type = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]; /** * storage used in computing the stem * @var string @@ -128,12 +145,13 @@ class Tokenizer /** * Removes the stop words from the page (used for Word Cloud generation) * - * @param string $page the page to remove 
stop words from. - * @return string $page with no stop words + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words */ - public static function stopwordsRemover($page) + public static function stopwordsRemover($data) { - $stop_words = ['a','able','about','above','abst', + static $stop_words = ['a','able','about','above','abst', 'accordance','according','based','accordingly','across','act', 'actually','added','adj','affected','affecting','affects','after', 'afterwards','again','against','ah','all','almost','alone','along', @@ -239,9 +257,62 @@ class Tokenizer 'without','wont','words','world', 'would','wouldnt','www','x','y','yes','yet','you','youd','youll', 'your','youre','yours','yourself','yourselves','youve','z','zero']; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '', - mb_strtolower($page)); - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', $stop_words) . ')\b/ui'; + } + $data = preg_replace($pattern, '', $data); + return $data; + } + /** + * This method tries to handle punctuation in terms specific to the + * English language such as abbreviations. + * + * @param string& $string a string of words, etc which might involve such + * terms + */ + public function canonicalizePunctuatedTerms(&$string) + { + static $substitutions = [ + //abbreviated titles + "/([mM]r|[mM]rs|[mM]s|[dD]r|[dD]rs|[iI]n|". + "[cC]apt|[cC]pl|[sS]t|[fF]t|[vV]s)\.(\s*)(\p{Lu}|\Z)/u" => '$1 $3', + "/,(\p{Lu})\.(\s*)(\p{Lu}|\Z)/u" => '$1_ $3', + "/gimme/i" => "give me", + "/gonna/i" => "going to", + "/gotta/i" => "got to", + "/ma\'am/i" => "madam", + "/\'tis/i" => "it is", + "/\'twas/i" => "it was", + "/y\'all/i" => "you all", + "/I\'m/" => "I am", + "/I ain\'t/" => "I am not", + "/You ain\'t/i" => "you are not", + "/(why|who|which|when|what|this|that|there|how|" . 
+ "it|everyone|one|he|she)\'s/i" => "$1 is", + "/is been/" => "has been", + "/is had/" => "has had", + // shan't + "/shan\'t/" => "shall not", + // contractions with not + "/\b(\p{L}+)\'d/" => ' $1 would', + // contractions with not + "/\b(\p{L}+)n\'t/" => ' $1 not', + // contractions with will + "/\b(\p{L}+)\'ll/" => ' $1 will', + // contractions with have + "/\b(\p{L}+)\'ve/" => ' $1 have', + // contractions with have + "/\b(\p{L}+)\'re/" => ' $1 are', + "/\b(\p{L}+)\'s/" => ' $1_pos_s' + ]; + static $patterns = []; + static $replacements = []; + if (empty($patterns)) { + $patterns = array_keys($substitutions); + $replacements = array_values($substitutions); + } + $string = preg_replace($patterns, $replacements, $string); } /** * Takes a phrase and tags each term in it with its part of speech. @@ -278,15 +349,15 @@ class Tokenizer */ public static function tagTokenizePartOfSpeech($text) { + /* if run as own server dictionary only loaded once for all requests */ static $dictionary = []; $lexicon_file = C\LOCALE_DIR . 
"/en_US/resources/lexicon.txt.gz"; if (empty($dictionary)) { if (file_exists($lexicon_file)) { - $lines = gzfile($lexicon_file); - foreach ($lines as $line) { - $tags = preg_split('/(\s+|\,)/u', trim($line)); - $dictionary[array_shift($tags)] = array_filter($tags); - } + $lex_data = gzdecode(file_get_contents($lexicon_file)); + preg_match_all("/([^\s\,]+)[\s|\,]+([^\n]+)/u", + $lex_data, $lex_parts); + $dictionary = array_combine($lex_parts[1], $lex_parts[2]); } } preg_match_all("/[\w\d]+/", $text, $matches); @@ -307,11 +378,12 @@ class Tokenizer // remove trailing full stops $token = strtolower($token); if (!empty($dictionary[$token])) { - $tag_list = $dictionary[$token]; + $tag_list = explode(" ", $dictionary[$token]); $current['tag'] = $tag_list[0]; } // Converts verbs after 'the' to nouns - if ($previous['tag'] == 'DT' && in_array($current['tag'], $verbs)){ + if ($previous['token'] == 'the' && + in_array($current['tag'], $verbs)){ $current['tag'] = 'NN'; } // Convert noun to number if . appears @@ -352,7 +424,7 @@ class Tokenizer $result[$i - 1]['tag'] = 'JJ'; $current['tag'] = 'NN'; } - /* If we get noun, and the second can be a verb, + /* If we have a noun, and the second can be a verb, * convert to verb; if noun noun and previous could be an * adjective convert to adjective */ @@ -426,12 +498,37 @@ class Tokenizer */ public static function compressSentence($sentence_to_compress) { - $result = $sentence_to_compress; - $result = self::compressSentenceStep2($result); - $result = self::compressSentenceStep3($result); - $result = self::compressSentenceStep4($result); - $result = self::compressSentenceStep5($result); - return $result; + // patterns are based on From Back to Basics: CLASSY 2006 page 3: + static $delete_patterns = [ + /* + 2. We remove many adverbs and all conjunctions, + including phrases such as "As a matter of fact," and + "At this point," that occur at the start of a sentence. 
+ */ + "/^At this point,?\b/i", "/^As a matter of fact,?\b/i", + "/^[a-zA-Z]*ly\b/i", "/(^and,?\b)|(^but,?\b)|(^for,?\b)|" . + "(^nor,?\b)|(^or,?\b)|(^so,?\b)|(^yet,?\b)/i", + /* + 3. We remove a small selections of words that occur in the middle + of a sentence, such as ", however," and ", also," (not always + requiring the commas). + */ + "/(;|,)?\s*(nevertheless|today|tomorrow|soon|instead|in practice|" . + "however|as a practical matter|further(more)?|". + "as such|also)\s*(;|,)?\s*/i", + /* + 4. For DUC 2006, we added the removal of ages such as ", 51," or + ", aged 24,". + */ + "/,\s?\d{1,3},/i", "/,\s?aged\s?\d{1,3},/i", + /* + 6. We remove relative clause attributives (clauses beginning with + "who(m)", "which", "when", and "where") wherever possible. + */ + "/(,\s?whom?[^,]*,)|(,\s?which[^,]*,)|" . + "(,\s?when[^,]*,)|(,\s?where[^,]*,)/i" + ]; + return preg_replace($delete_patterns, " ", $sentence_to_compress); } /** * Takes a triplets array with subject, predicate, object fields with @@ -451,60 +548,84 @@ class Tokenizer self::extractTripletByType($sub_pred_obj_triplets, "RAW"); return $processed_triplets; } + /** + * + */ + public static function parseTypeList(&$cur_node, $tagged_phrase, $type) + { + $string = ""; + $previous_string = ""; + $previous_tag = ""; + $start_node = $cur_node; + $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" : + trim($tagged_phrase[$cur_node]['tag']); + $allowed_conjuncts = []; + while ($next_tag && (in_array($next_tag, $type) || + in_array($next_tag, $allowed_conjuncts))) { + $previous_string = $string; + $string .= " ". $tagged_phrase[$cur_node]['token']; + $cur_node++; + $allowed_conjuncts = self::$conjunction_type; + $previous_tag = $next_tag; + $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? 
"" : + trim($tagged_phrase[$cur_node]['tag']); + } + if (in_array($previous_tag, $allowed_conjuncts) && $start_node < + $cur_node) { + $cur_node--; + $string = $previous_string; + } + return $string; + } /** * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a determiner if possible + * parse-from position and builds a parse tree for an adjective if possible * * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["curnode" => + * @param array $tree that consists of ["cur_node" => * current parse position in $tagged_phrase] * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase - * "DT" a subarray with a token node for the determiner that was + * "JJ" a subarray with a token node for the adjective that was * parsed */ - public static function extractDeterminer($tagged_phrase, $tree) + public static function parseAdjective($tagged_phrase, $tree) { - $cur_node = $tree['cur_node']; - if (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "DT" ) { - $tree['DT'] = $tagged_phrase[$cur_node]['token']; - $tree['cur_node']++; - return $tree; + $adjective_string = self::parseTypeList($tree['cur_node'], + $tagged_phrase, self::$adjective_type); + if (!empty($adjective_string)) { + $tree["JJ"] = $adjective_string; } return $tree; } /** * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for an adjective if possible + * parse-from position and builds a parse tree for a determiner if possible * * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["cur_node" => + * @param array $tree that consists of ["curnode" => * current parse position in $tagged_phrase] * @return array has 
fields * "cur_node" index of how far we parsed $tagged_phrase - * "JJ" a subarray with a token node for the adjective that was + * "DT" a subarray with a token node for the determiner that was * parsed */ - public static function extractAdjective($tagged_phrase, $tree) + public static function parseDeterminer($tagged_phrase, $tree) { - $adjective_string = ""; - $cur_node = $tree['cur_node']; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$adjective_phrases)) { - $adjective_string .= " " . $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - if (!empty($adjective_string)) { - $tree["JJ"] = $adjective_string; - } - $tree['cur_node'] = $cur_node; - return $tree; + $determiner_string = ""; + /* In: All the cows low, "All the" is considered a determiner. + That is, we will mush together the predeterminer with the determiner + */ + $determiner_string = self::parseTypeList($tree['cur_node'], + $tagged_phrase, self::$determiner_type); + if (!empty($determiner_string)) { + $tree["DT"] = $determiner_string; + } + return $tree; } /** * Takes a part-of-speech tagged phrase and pre-tree with a @@ -520,21 +641,37 @@ class Tokenizer * "NN" a subarray with a token node for the noun string that was * parsed */ - public static function extractNoun($tagged_phrase, $tree) + public static function parseNoun($tagged_phrase, $tree) { //Combining multiple noun into one - $noun_string = ""; - $cur_node = $tree['cur_node']; - while (isset($tagged_phrase[$cur_node]['tag']) && - (in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$noun_phrases))) { - $noun_string .= " " . 
$tagged_phrase[$cur_node]['token']; - $cur_node++; - } + $noun_string = self::parseTypeList($tree['cur_node'], $tagged_phrase, + self::$noun_type); if (!empty($noun_string)) { $tree["NN"] = $noun_string; } - $tree['cur_node'] = $cur_node; + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a verb if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["curnode" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "VB" a subarray with a token node for the verb string that was + * parsed + */ + public static function parseVerb($tagged_phrase, $tree) + { + $verb_string = self::parseTypeList($tree['cur_node'], $tagged_phrase, + self::$verb_type); + if (!empty($verb_string)) { + $tree["VB"] = $verb_string; + } return $tree; } /** @@ -547,7 +684,7 @@ class Tokenizer * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["cur_node" => * current parse position in $tagged_phrase] - * @param int $index which term in $tagged_phrase to start to try to extract + * @param int $index which term in $tagged_phrase to start to try to parse * a preposition from * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase @@ -558,51 +695,42 @@ class Tokenizer * "JJ_i" with value an adjective subtree * "NN_i" with value an additional noun subtree */ - public static function extractPrepositionalPhrases($tagged_phrase, $tree, + public static function parsePrepositionalPhrases($tagged_phrase, $tree, $index = 1) { - $cur_node = $tree['cur_node']; - // Checking for preposition.I.e, format: prep [det] [adjective] noun + $cur_node = $tree['cur_node']; + // Checking for preposition. 
I.e, format: prep [det] [adjective] noun if (isset($tagged_phrase[$cur_node]['tag']) && trim($tagged_phrase[$cur_node]['tag']) == "IN") { /* can have multiple prep's in a row, for example, it is known in over 20 countries*/ - $preposition_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "IN") { - $preposition_string .= " ". $tagged_phrase[$cur_node]['token']; - $cur_node++; - } + $preposition_string = self::parseTypeList($cur_node, $tagged_phrase, + ["IN"]); if (!empty($preposition_string)) { $tree["IN_$index"] = $preposition_string; } - if (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "DT") { - $tree['DT_$index'] = $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - $adjective_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$adjective_phrases)) { - $adjective_string .= " " . $tagged_phrase[$cur_node]['token']; - $cur_node++; + $determiner_string = self::parseTypeList($cur_node, $tagged_phrase, + self::$determiner_type); + if (!empty($determiner_string)) { + $tree["DT_$index"] = $determiner_string; } + $adjective_string = self::parseTypeList($cur_node, $tagged_phrase, + self::$adjective_type); if (!empty($adjective_string)) { $tree["JJ_$index"] = $adjective_string; } - $prep_noun_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$noun_phrases)) { - $prep_noun_string .= " " . $tagged_phrase[$cur_node]['token']; - $cur_node++; - } + $prep_noun_string = self::parseTypeList($cur_node, $tagged_phrase, + self::$noun_type); if ($prep_noun_string) { $tree["NP_$index"] = $prep_noun_string; } - $tree_next = self::extractPrepositionalPhrases($tagged_phrase, + /* if have more than one phrase in a row: + the drought happened in many countries over many years. 
+ */ + $tree_next = self::parsePrepositionalPhrases($tagged_phrase, ["cur_node" => $cur_node], $index + 1); + unset($tree_next['cur_node']); + $tree['PRP'] = $tree_next; } $tree['cur_node'] = $cur_node; return $tree; @@ -623,70 +751,47 @@ class Tokenizer * "JJ" with value an adjective subtree * "NN" with value a noun tree */ - public static function extractNounPhrase($tagged_phrase, $tree) + public static function parseNounPhrase($tagged_phrase, $tree) { $cur_node = $tree['cur_node']; - $tree_dt = self::extractDeterminer($tagged_phrase, + $tree_dt = self::parseDeterminer($tagged_phrase, ['cur_node' => $cur_node]); - $tree_jj = self::extractAdjective($tagged_phrase, + $tree_jj = self::parseAdjective($tagged_phrase, ['cur_node' => $tree_dt['cur_node']]); - $tree_nn = self::extractNoun($tagged_phrase, + $tree_nn = self::parseNoun($tagged_phrase, ['cur_node' => $tree_jj['cur_node']]); - $tree_pp = self::extractPrepositionalPhrases($tagged_phrase, - ['cur_node' => $tree_nn['cur_node']]); if ($tree_nn['cur_node'] == $cur_node) { $tree['NP'] = ""; - } else { - $cur_node = $tree_pp['cur_node']; - unset($tree_dt['cur_node']); - $tree_new_sub['DT'] = $tree_dt; - unset($tree_jj['cur_node']); - $tree_new_sub['JJ'] = $tree_jj; - unset($tree_nn['cur_node']); - $tree_new_sub['NN'] = $tree_nn; - unset($tree_pp['cur_node']); - $tree_new_sub['PRP'] = $tree_pp; - $tree_new['cur_node'] = $cur_node; - $tree_new['NP'] = $tree_new_sub; - return $tree_new; - } - return $tree; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a verb if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * "VB" a subarray with a token node for the verb 
string that was - * parsed - */ - public static function extractVerb($tagged_phrase, $tree) - { - $cur_node = $tree['cur_node']; - // skip stuff before verb (intensifiers and adverbs) - while (isset($tagged_phrase[$cur_node]['tag']) && - !in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$verb_phrases)) { - $cur_node++; + return $tree; } - $verb_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$verb_phrases)) { - $verb_string .= " " . $tagged_phrase[$cur_node]['token']; + $tree_pp = self::parsePrepositionalPhrases($tagged_phrase, + ['cur_node' => $tree_nn['cur_node']]); + $tree_aux = self::parseAuxClause($tagged_phrase, + ['cur_node' => $tree_pp['cur_node']]); + $cur_node = $tree_aux['cur_node']; + $cc = ""; + if (!empty($tagged_phrase[$cur_node]['tag']) && + in_array($tagged_phrase[$cur_node]['tag'], + self::$conjunction_type)) { + $cc = $tagged_phrase[$cur_node]['token']; $cur_node++; } - if (!empty($verb_string)) { - $tree["VB"] = $verb_string; + $tree_np = self::parseNounPhrase($tagged_phrase, + ['cur_node' => $cur_node]); + if ($tree_np['cur_node'] == $cur_node && $cc) { + $cur_node--; + $tree_np = []; + $cc = ""; + } else { + $cur_node = $tree_np['cur_node']; } - $tree['cur_node'] = $cur_node; - return $tree; + unset($tree_dt['cur_node'], $tree_jj['cur_node'], + $tree_nn['cur_node'], $tree_pp['cur_node'], + $tree_aux['cur_node'], $tree_np['cur_node']); + $sub_tree = ['DT' => $tree_dt, 'JJ' => $tree_jj, 'NN' => $tree_nn, + 'PRP' => $tree_pp, 'AUX' => $tree_aux, 'CC' => $cc, + 'ADD_NP' => $tree_np]; + return ['cur_node' => $cur_node, 'NP' => $sub_tree]; } /** * Takes a part-of-speech tagged phrase and pre-tree with a @@ -703,42 +808,40 @@ class Tokenizer * "VB" with value a verb subtree * "NP" with value an noun phrase subtree */ - public static function extractVerbPhrase($tagged_phrase, $tree) + public static function parseVerbPhrase($tagged_phrase, $tree) { $cur_node = 
$tree['cur_node']; - $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]); - if ($tree_vb['cur_node'] == $cur_node) { + $adverb_string = self::parseTypeList($cur_node, $tagged_phrase, + self::$adverb_type); + $tree_vb = self::parseVerb($tagged_phrase, ['cur_node' => $cur_node]); + if ($cur_node == $tree_vb['cur_node']) { + // if no verb return what started with return $tree; } $cur_node = $tree_vb['cur_node']; - $preposition_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "IN") { - $preposition_string .= " ". $tagged_phrase[$cur_node]['token']; - $cur_node++; + $add_to_adverb_string = self::parseTypeList($cur_node, + $tagged_phrase, self::$adverb_type); + if (trim($add_to_adverb_string) != 'very') { + $adverb_string .= $add_to_adverb_string; + $tree_vb['cur_node'] = $cur_node; + } else { + $tagged_phrase[$tree_vb['cur_node']]['tag'] = 'JJ'; } - if (!empty($preposition_string)) { - $tree_vb["IN"] = $preposition_string; + $tree_np = self::parseNounPhrase($tagged_phrase, + ['cur_node' => $tree_vb['cur_node']]); + $adverb_string .= self::parseTypeList($tree_np['cur_node'], + $tagged_phrase, self::$adverb_type); + if (!empty($adverb_string)) { + $tree_vb["RB"] = $adverb_string; } - $tree_np = self::extractNounPhrase($tagged_phrase, - ['cur_node' => $cur_node]); - $tree_new = []; - $tree_new_sub = []; - if ($tree_np['cur_node'] != $cur_node) { - $cur_node = $tree_np['cur_node']; - unset($tree_vb['cur_node']); - unset($tree_np['cur_node']); - $tree_new_sub['VB'] = $tree_vb; - $tree_new_sub['NP'] = $tree_np['NP']; - $tree_new['cur_node'] = $cur_node; - $tree_new['VP'] = $tree_new_sub; - return $tree_new; + $cur_node = $tree_np['cur_node']; + if (!empty($tree_np['NP'])) { + unset($tree_vb['cur_node'], $tree_np['cur_node']); + return ['VP' => ['VB' => $tree_vb, 'NP' => $tree_np['NP']], + 'cur_node' => $cur_node]; } unset($tree_vb['cur_node']); - $tree_new_sub['VB'] = $tree_vb; - 
$tree_new['cur_node'] = $cur_node; - $tree_new['VP'] = $tree_new_sub; - return $tree_new; + return ['VP' => ['VB' => $tree_vb], 'cur_node' => $cur_node]; } /** * Given a part-of-speeech tagged phrase array generates a parse tree @@ -752,22 +855,73 @@ class Tokenizer * $tree["NP"] contains a subtree for a noun phrase * $tree["VP"] contains a subtree for a verb phrase */ - public static function generatePhraseParseTree($tagged_phrase) + public static function parseWholePhrase($tagged_phrase, $tree) { - $tree = []; + // for example: In the dark of winter, he walked silently. + $tree_start = self::parsePrepositionalPhrases($tagged_phrase, + ["cur_node" => $tree['cur_node']]); + $cur_node = empty($tree_start['cur_node']) ? $tree['cur_node'] : + $tree_start['cur_node']; + unset($tree_start['cur_node']); //cur_node is the index in tagged_phrase we've parse to so far - $tree_np = self::extractNounPhrase($tagged_phrase, ["cur_node" => 0]); - $tree = ["cur_node" => $tree_np['cur_node']]; - $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree); - if ($tree == $tree_vp) { + $tree_np = self::parseNounPhrase($tagged_phrase, + ["cur_node" => $cur_node]); + if ($tree_np['cur_node'] == $cur_node) { return $tree; } - $tree['cur_node'] = $tree_vp['cur_node']; - unset($tree_np['cur_node']); - unset($tree_vp['cur_node']); - $tree['NP'] = $tree_np['NP']; - $tree['VP'] = $tree_vp['VP']; - return $tree; + $tree_vp = self::parseVerbPhrase($tagged_phrase, + ["cur_node" => $tree_np['cur_node']]); + if ($tree_np['cur_node'] == $tree_vp['cur_node']) { + return $tree; + } + $cur_node = $tree_vp['cur_node']; + unset($tree_np['cur_node'], $tree_vp['cur_node']); + if (!empty($tree_start) && !empty($tree_np['NP'])) { + $tree_np['NP']['PRP-1'] = $tree_start; + } + return ['cur_node' => $cur_node, 'NP' => $tree_np['NP'], + 'VP' => $tree_vp['VP']]; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a auxiliary clause + * if 
possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["cur_node" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + */ + public static function parseAuxClause($tagged_phrase, $tree) + { + $cur_node = $tree["cur_node"]; + $token = empty($tagged_phrase[$cur_node]["token"]) ? "" : + trim($tagged_phrase[$cur_node]["token"]); + if (!in_array($token, ["that", "who", "which", "because", "like", + "as"])) { + return $tree; + } + $cur_node++; + $tree_vp = self::parseVerbPhrase($tagged_phrase, + ["cur_node" => $cur_node]); + if ($cur_node != $tree_vp['cur_node']) { + $cur_node = $tree_vp['cur_node']; + unset($tree_vp['cur_node']); + return ['cur_node' => $cur_node, + 'IN' => $token, 'PHRASE' => $tree_vp]; + } + $tree_wp = self::parseWholePhrase($tagged_phrase, + ["cur_node" => $cur_node]); + if ($tree_wp['cur_node'] == $cur_node) { + return $tree; + } + $cur_node = $tree_wp['cur_node']; + unset($tree_wp['cur_node']); + return ['cur_node' => $cur_node, + 'IN' => $token, 'PHRASE' => $tree_wp]; } /** * Takes a parse tree of a phrase and computes subject, predicate, and @@ -809,8 +963,13 @@ class Tokenizer }, \ARRAY_FILTER_USE_KEY ); $triplet_types = ['CONCISE', 'RAW']; foreach ($word_and_phrase_list as $word_and_phrase => $position_list) { + $word_and_phrase = self::compressSentence($word_and_phrase); + // strip parentheticals + $word_and_phrase = preg_replace("/[\{\[\(][^\}\]\)]+[\}\]\)]/u", + "", $word_and_phrase); $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase); - $parse_tree = self::generatePhraseParseTree($tagged_phrase); + $parse_tree = self::parseWholePhrase($tagged_phrase, + ['cur_node' => 0]); $triplets = self::extractTripletsParseTree($parse_tree); $extracted_triplets = self::rearrangeTripletsByType($triplets); foreach ($triplet_types 
as $type) { @@ -954,11 +1113,10 @@ class Tokenizer public static function parseWhoQuestion($tagged_question, $index) { $generated_questions = []; - $question_marker = self::getQuestionMarker(); $tree = ["cur_node" => $index]; $tree['NP'] = "WHO"; $triplets = []; - $tree_vp = self::extractVerbPhrase($tagged_question, $tree); + $tree_vp = self::parseVerbPhrase($tagged_question, $tree); $triplets['predicate'] = self::extractPredicateParseTree( $tree_vp); $triplets['object'] = self::extractObjectParseTree( @@ -970,8 +1128,8 @@ class Tokenizer $generated_questions[$type][] = trim($triplets['object'][$type]) . " " . trim($triplets['predicate'][$type]) . " " . - $question_marker; - $generated_questions[$type][] = $question_marker . + self::$question_token; + $generated_questions[$type][] = self::$question_token . " " . trim($triplets['predicate'][$type]) . " " . trim($triplets['object'][$type]); } @@ -993,10 +1151,9 @@ class Tokenizer { $generated_questions = []; $aux_verb = ""; - $question_marker = self::getQuestionMarker(); while (isset($tagged_question[$index]) && in_array(trim($tagged_question[$index]['tag']), - self::$verb_phrases)) { + self::$verb_type)) { $token = trim($tagged_question[$index]['token']); $aux_verb .= " " . $token; $index++; @@ -1004,9 +1161,9 @@ class Tokenizer $tree = ["cur_node" => $index]; $tree['NP'] = "WHPlus"; $triplets = []; - $tree_np = self::extractNounPhrase($tagged_question, $tree); + $tree_np = self::parseNounPhrase($tagged_question, $tree); $triplets['subject'] = self::extractSubjectParseTree($tree_np); - $tree_vp = self::extractVerbPhrase($tagged_question, $tree_np); + $tree_vp = self::parseVerbPhrase($tagged_question, $tree_np); $triplets['predicate'] = self::extractPredicateParseTree($tree_vp); if (!empty($aux_verb)) { if (!isset($triplets['predicate']['RAW'])) { @@ -1022,8 +1179,8 @@ class Tokenizer $generated_questions[$type][] = trim($triplets['subject'][$type]) . " " . trim($triplets['predicate'][$type]) . - " " . 
$question_marker; - $generated_questions[$type][] = $question_marker. + " " . self::$question_token; + $generated_questions[$type][] = self::$question_token . " " . trim($triplets['predicate'][$type]) . " " . trim($triplets['subject'][$type]); } @@ -1073,7 +1230,6 @@ class Tokenizer && !empty($sub_pred_obj_triplets['predicate'][$type]) && !empty($sub_pred_obj_triplets['object'][$type])) { $question_answer_triplets = []; - $question_marker = self::$question_marker; $sentence = [ trim($sub_pred_obj_triplets['subject'][$type]), trim($sub_pred_obj_triplets['predicate'][$type]), trim($sub_pred_obj_triplets['object'][$type])]; @@ -1088,7 +1244,7 @@ class Tokenizer } for ($i = 0; $i < 3; $i++) { $q_sentence = $sentence; - $q_sentence[$i] = $question_marker; + $q_sentence[$i] = self::$question_token; $q_sentence_string = implode(" ", $q_sentence); $q_sentence_string = self::stemPhrase($q_sentence_string); $question_triplets[] = $q_sentence_string; @@ -1504,80 +1660,4 @@ class Tokenizer } return $tagged_phrase; } - /** - * The function returns the question marker for the locale - * - * @return the question marker - */ - public static function getQuestionMarker() - { - return self::$question_marker; - } - /** - * From Back to Basics: CLASSY 2006 page 3: - * 2. We remove many adverbs and all conjunctions, including phrases such - * as "As a matter of fact," and "At this point," that occur at the start - * of a sentence. - * - * @param string $sentence_to_compress the sentence to compress - * @return the compressed sentence - */ - public static function compressSentenceStep2($sentence_to_compress) - { - $result = $sentence_to_compress; - $result = preg_replace("/^At this point,?/i", "", $result); - $result = preg_replace("/^As a matter of fact,?/i", "", $result); - //adverbs - $result = preg_replace("/^[a-zA-Z]*ly\s?/i", "", $result); - //conjunctions - $result = preg_replace("/(^and,?)|(^but,?)|(^for,?)|(^nor,?)|(^or,?)" . 
- "|(^so,?)|(^yet,?)/i", "", $result); - return $result; - } - /** - * From Back to Basics: CLASSY 2006 page 3: - * 3. We remove a small selections of words that occur in the middle of a - * sentence, such as ", however," and ", also," (not always requiring the - * commas). - * - * @param string $sentence_to_compress the sentence to compress - * @return the compressed sentence - */ - public static function compressSentenceStep3($sentence_to_compress) - { - $result = $sentence_to_compress; - $result = preg_replace("/,?\s?however,?/i", "", $result); - $result = preg_replace("/,?\s?also,?/i", "", $result); - return $result; - } - /** - * From Back to Basics: CLASSY 2006 page 3: - * 4. For DUC 2006, we added the removal of ages such as ", 51," or - * ", aged 24,". - * - * @param string $sentence_to_compress the sentence to compress - * @return the compressed sentence - */ - public static function compressSentenceStep4($sentence_to_compress) - { - $result = $sentence_to_compress; - $result = preg_replace("/,\s?\d{1,3},/i", "", $result); - $result = preg_replace("/,\s?aged\s?\d{1,3},/i", "", $result); - return $result; - } - /** - * From Back to Basics: CLASSY 2006 page 3: - * 6. We remove relative clause attributives (clauses beginning with - * "who(m)", "which", "when", and "where") wherever possible. - * - * @param string $sentence_to_compress the sentence to compress - * @return the compressed sentence - */ - public static function compressSentenceStep5($sentence_to_compress) - { - $result = $sentence_to_compress; - $result = preg_replace("/(,\s?whom?[^,]*,)|(,\s?which[^,]*,)|" . 
- "(,\s?when[^,]*,)|(,\s?where[^,]*,)/i", "", $result); - return $result; - } } diff --git a/src/locale/en_US/resources/all_aux_grams.txt b/src/locale/en_US/resources/all_aux_grams.txt new file mode 100755 index 000000000..2593797db --- /dev/null +++ b/src/locale/en_US/resources/all_aux_grams.txt @@ -0,0 +1,194 @@ +governor general +governor generals +lieutenant governor +lieutenant governors +prime ministers +executive power +executive powers +justin trudeau +pierre trudeau +chief justice +charter of rights and freedoms +privy council +agreed on +agreed to +aral 88 +aye yah ah ah +back up +backed up +battle ax +beat up +beefed up +belly up +blacked in +blow up +blown up +boxed in +break in +break up +bucking up +buckle on +build up +built in +bulked up +bust up +buttoned up +cable tv +call in +carbon 14 +carry in +carry on +catch 22 +catch up +cave in +chevrolet pontiac gm +chin up +classified ad +clean up +cleaned up +close in +close up +cobalt 60 +color tv +cover up +dairy oh +dammed up +dc 8 +dc 10 +derring do +double a +dried up +drive in +drop in +dual road up +dust up +efficient in +either or +fade in +fall in +fenced in +fill in +follow on +follow up +foul up +full on +garbage in +gathering in +go go go +goings on +good by +grown up +gung ho +hands on +hangers on +hard come by +he goes or i go +head on +heigh ho +high up +hook up +huang ti +hunched up +hyped up +jumped up +just say no +kung fu +lash up +lean to +line up +link up +live in +lock up +locking in +made for tv +made up +make up +mark up +mixed up +models on the way up +mouth up +move up +movie to be +near by +nearly 30 +occupation as +odds on +oh the pain of it +on the go +one pound or so +over 40 +over 50 +paid in +paid up +painted in +passer by +passers by +pasted in +pay as you go +pent up +phase in +pick up +plug in +powers that be +public tv +puffed up +pumped up +push up +radio tv +rolled up +runner up +runners up +sales of +satellite tv +seven up +shack up +shake up +shangri la +shape up 
+shoo in +shot up +should be +shut in +snap in +snap on +soon to be +souped up +speed up +speeded up +split up +stand by +stand in +stand up +start up +step up +stepped up +stern to +stored up +stuck up +swearing in +take up +tobacco ad +trade ad +trade in +trade up +triple a +trumped up +trussed up +tune in +turned up +under 35 +under 50 +unheard of +wake up +walk in +walk on +walk to +walk up +warm up +wash up +well to do +wife to be +with it +would be +wrap up +write in diff --git a/src/locale/en_US/resources/all_word_grams.ftr b/src/locale/en_US/resources/all_word_grams.ftr new file mode 100644 index 000000000..02b9b5d60 Binary files /dev/null and b/src/locale/en_US/resources/all_word_grams.ftr differ diff --git a/src/locale/es/resources/Tokenizer.php b/src/locale/es/resources/Tokenizer.php index cdc63905c..1e816e868 100755 --- a/src/locale/es/resources/Tokenizer.php +++ b/src/locale/es/resources/Tokenizer.php @@ -112,12 +112,13 @@ class Tokenizer /** * Removes the stop words from the page (used for Word Cloud generation) * - * @param string $page the page to remove stop words from. 
- * @return string $page with no stop words + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words */ - public static function stopwordsRemover($page) + public static function stopwordsRemover($data) { - $stop_words = ["de", "la", "que", "el","en", "y", "a", "los", + static $stop_words = ["de", "la", "que", "el","en", "y", "a", "los", "del", "se", "las", "por", "un", "para", "con", "no", "una", "su", "al", "lo", "como", "más", "pero", "sus", "le", "ya", "o", "este", "sí", "porque", "esta", "entre", "cuando", "muy", "sin", @@ -171,9 +172,12 @@ class Tokenizer "tuviéramos", "tuvierais", "tuvieran", "tuviese", "tuvieses", "tuviésemos", "tuvieseis", "tuviesen", "teniendo", "tenido", "tenida", "tenidos", "tenidas", "tened"]; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '', - mb_strtolower($page)); - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; } /** * Computes the stem of a French word diff --git a/src/locale/fa/resources/Tokenizer.php b/src/locale/fa/resources/Tokenizer.php index d76065fd8..bef154e7c 100755 --- a/src/locale/fa/resources/Tokenizer.php +++ b/src/locale/fa/resources/Tokenizer.php @@ -67,12 +67,13 @@ class Tokenizer /** * Removes the stop words from the page (used for Word Cloud generation) * - * @param string $page the page to remove stop words from. 
- * @return string $page with no stop words + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words */ - public static function stopwordsRemover($page) + public static function stopwordsRemover($data) { - $stop_words = [ + static $stop_words = [ "در", "به", "از", "كه", "مي", "اين", "است", "را", "با", "هاي", "براي", "آن", "يك", "شود", "شده","خود", "ها", "كرد", "شد", "اي", "تا", "كند", "بر", "بود", "گفت", "نيز", "وي", "هم", "كنند", @@ -123,9 +124,12 @@ class Tokenizer "لطفاً", "ّه", "انکه", "وقتیکه", "همین", "پیش", "مدّتی", "هنگامی", "مان", "تان" ]; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '', - mb_strtolower($page)); - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; } /** * Computes the stem of a Persian word diff --git a/src/locale/fr_FR/resources/Tokenizer.php b/src/locale/fr_FR/resources/Tokenizer.php index cb3460642..667e69756 100755 --- a/src/locale/fr_FR/resources/Tokenizer.php +++ b/src/locale/fr_FR/resources/Tokenizer.php @@ -109,10 +109,11 @@ class Tokenizer /** * Removes the stop words from the page (used for Word Cloud generation) * - * @param string $page the page to remove stop words from. 
- * @return string $page with no stop words + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words */ - public static function stopwordsRemover($page) + public static function stopwordsRemover($data) { $stop_words = ['alors', 'au', 'aucuns', 'aussi', 'autre', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', @@ -132,9 +133,12 @@ class Tokenizer 'tout', 'trop', 'très', 'tu','valeur', 'voie', 'voient', 'vont', 'votre','vous','vu','ça','étaient', 'état', 'étions', 'été', 'être']; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '', - mb_strtolower($page)); - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; } /** * Computes the stem of a French word diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php index c4a53107b..c97d3c3d0 100755 --- a/src/locale/hi/resources/Tokenizer.php +++ b/src/locale/hi/resources/Tokenizer.php @@ -47,36 +47,36 @@ class Tokenizer * List of verb-like parts of speech that might appear in lexicon * @var array */ - public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ", + public static $verb_type = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "RB"]; /** * List of noun-like parts of speech that might appear in lexicon * @var array */ - public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "DT"]; + public static $noun_type = ["NN", "NNS", "NNP", "NNPS", "DT"]; /** * List of adjective-like parts of speech that might appear in lexicon * @var array */ - public static $adjective_phrases = ["JJ", "JJR", "JJS"]; + public static $adjective_type = ["JJ", "JJR", "JJS"]; /** * List of postpositional-like parts of speech that might appear in lexicon * @var array */ - public static $postpositional_phrases = ["IN", "inj", "PREP", "proNN", + public static 
$postpositional_type = ["IN", "inj", "PREP", "proNN", "CONJ", "INT", "particle", "case", "PSP", "direct_DT", "PRP"]; /** * List of questions in Hindi * @var array */ - public static $questions = ["क्या", "कब", "कहा", "क्यों", "कौन", "जिसे", - "जिसका", "कहाँ", "कहां"]; + public static $question_pattern = + "/\b[क्या|कब|कहा|क्यों|कौन|जिसे|जिसका|कहाँ|कहां]\b/ui"; /** * Any unique identifier corresponding to the component of a triplet which * can be answered using a question answer list * @var string */ - public static $question_marker = "qqq"; + public static $question_token = "qqq"; /** * Words we don't want to be stemmed * @var array @@ -144,11 +144,10 @@ class Tokenizer $lexicon_file = C\LOCALE_DIR . "/hi/resources/lexicon.txt.gz"; if (empty($dictionary)) { if (file_exists($lexicon_file)) { - $lines = gzfile($lexicon_file); - foreach ($lines as $line) { - $tags = preg_split('/(\s+|\,)/u', trim($line)); - $dictionary[array_shift($tags)] = array_filter($tags); - } + $lex_data = gzdecode(file_get_contents($lexicon_file)); + preg_match_all("/([^\s\,]+)[\s|\,]+([^\n]+)/u", + $lex_data, $lex_parts); + $dictionary = array_combine($lex_parts[1], $lex_parts[2]); } } $tokens = preg_split("/\s+/u", $text); @@ -162,12 +161,12 @@ class Tokenizer $current = ["token" => $token, "tag" => "UNKNOWN"]; $term = $current["token"]; if (!empty($dictionary[$token])) { - $tag_list = $dictionary[$token]; + $tag_list = explode(" ", $dictionary[$token]); $current['tag'] = $tag_list[0]; } if (is_numeric($token)) { $current["tag"] = "NN"; - } else if (strcmp($token,"है") == 0 || strcmp($token, "हैं") == 0) { + } else if (in_array($token, ["है", "हैं"])) { $current["tag"] = "VB"; } if (empty($current["tag"])) { @@ -176,7 +175,8 @@ class Tokenizer $result[$i] = $current; $i++; } - return self::tagUnknownWords($result); + $result = self::tagUnknownWords($result); + return $result; } /** * This method tags the remaining words in a partially tagged text array. 
@@ -189,7 +189,7 @@ class Tokenizer public static function tagUnknownWords($partially_tagged_text) { $result = $partially_tagged_text; - $verbs = ["VBZ","VBD","VBN"]; + $verbs = ["VBZ", "VBD", "VBN"]; $length = count($result); $previous = $result[0]; for ($i = 1; $i < $length; $i++) @@ -296,6 +296,24 @@ class Tokenizer } return $tagged_phrase; } + /** + * + */ + public static function parseTypeList(&$cur_node, $tagged_phrase, $type) + { + $string = ""; + $start_node = $cur_node; + $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" : + trim($tagged_phrase[$cur_node]['tag']); + $allowed_conjuncts = []; + while ($next_tag && (in_array($next_tag, $type))) { + $string .= " ". $tagged_phrase[$cur_node]['token']; + $cur_node++; + $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" : + trim($tagged_phrase[$cur_node]['tag']); + } + return $string; + } /** * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a noun if possible @@ -310,21 +328,60 @@ class Tokenizer * "NN" a subarray with a token node for the noun string that was * parsed */ - public static function extractNoun($tagged_phrase, $tree) + public static function parseNoun($tagged_phrase, $tree) { //Combining multiple noun into one - $noun_string = ""; - $cur_node = $tree["cur_node"]; - while (isset($tagged_phrase[$cur_node]["tag"]) && - (in_array(trim($tagged_phrase[$cur_node]["tag"]), - self::$noun_phrases))) { - $noun_string .= " " . 
$tagged_phrase[$cur_node]["token"]; - $cur_node++; - } + $noun_string = self::parseTypeList($tree['cur_node'], $tagged_phrase, + self::$noun_type); if (!empty($noun_string)) { $tree["NN"] = $noun_string; } - $tree["cur_node"] = $cur_node; + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a verb if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["curnode" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "VB" a subarray with a token node for the verb string that was + * parsed + */ + public static function parseVerb($tagged_phrase, $tree) + { + $verb_string = self::parseTypeList($tree['cur_node'], $tagged_phrase, + self::$verb_type); + if (!empty($verb_string)) { + $tree["VB"] = $verb_string; + } + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for an adjective if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["cur_node" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "JJ" a subarray with a token node for the adjective that was + * parsed + */ + public static function parseAdjective($tagged_phrase, $tree) + { + $adjective_string = self::parseTypeList($tree['cur_node'], + $tagged_phrase, self::$adjective_type); + if (!empty($adjective_string)) { + $tree["JJ"] = $adjective_string; + } return $tree; } /** @@ -334,59 +391,43 @@ class Tokenizer * * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, - * 
"tag"=> part_of_speech_tag_for_term) + * "tag" => part_of_speech_tag_for_term) * @param array $tree that consists of ["cur_node" => * current parse position in $tagged_phrase] * @param int $index position in array to start from * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase */ - public static function extractPostpositionPhrase($tagged_phrase, $tree, + public static function parsePostpositionPhrase($tagged_phrase, $tree, $index = 1) { $cur_node = $tree["cur_node"]; $tree_pp["cur_node"] = $tree["cur_node"]; if (isset ($tagged_phrase[$cur_node]["tag"]) && in_array($tagged_phrase[$cur_node]["tag"], - self::$postpositional_phrases)) { - $pp_string =""; - while (isset($tagged_phrase[$cur_node]["tag"]) && - in_array($tagged_phrase[$cur_node]["tag"], - self::$postpositional_phrases)) { - $pp_string .= " " . $tagged_phrase[$cur_node]["token"]; - $cur_node++; - } + self::$postpositional_type)) { + $pp_string = self::parseTypeList($cur_node, $tagged_phrase, + self::$postpositional_type); if (!empty($pp_string)) { $tree_pp["IN_$index"] = $pp_string; } - $adjective_string = ""; - while (isset($tagged_phrase[$cur_node]["tag"]) && - in_array($tagged_phrase[$cur_node]["tag"], - self::$adjective_phrases)) { - $adjective_string .= " " . - $tagged_phrase[$cur_node]["token"]; - $cur_node++; - } + $adjective_string = self::parseTypeList($cur_node, $tagged_phrase, + self::$adjective_type); if (!empty($adjective_string)) { $tree_pp["JJ_$index"] = $adjective_string; } - $nn_string = ""; - while (isset($tagged_phrase[$cur_node]["tag"]) && - in_array($tagged_phrase[$cur_node]["tag"], - self::$noun_phrases)) { - $nn_string .= " " . 
$tagged_phrase[$cur_node]["token"]; - $cur_node++; - } + $nn_string = self::parseTypeList($cur_node, $tagged_phrase, + self::$noun_type); if (!empty($nn_string)) { $tree_pp["NN_$index"] = $nn_string; } $tree_pp["cur_node"] = $cur_node; - $tree_next = self::extractPostpositionPhrase($tagged_phrase, + $tree_next = self::parsePostpositionPhrase($tagged_phrase, $tree_pp, $index + 1); - $tree_pp = array_merge ($tree_pp, $tree_next); + $tree_pp = array_merge($tree_pp, $tree_next); } $tree["cur_node"] = $tree_pp["cur_node"]; - unset ($tree_pp["cur_node"]); + unset($tree_pp["cur_node"]); $tree["POST"] = $tree_pp; return $tree; } @@ -404,56 +445,21 @@ class Tokenizer * "JJ" with value an Adjective subtree * "NN" with value of a Noun Subtree */ - public static function extractNounPhrase($tagged_phrase, $tree) + public static function parseNounPhrase($tagged_phrase, $tree) { $cur_node = $tree["cur_node"]; - $tree_jj = self::extractAdjective($tagged_phrase, + $tree_jj = self::parseAdjective($tagged_phrase, ["cur_node" => $tree["cur_node"]]); - $tree_nn = self::extractNoun($tagged_phrase, + $tree_nn = self::parseNoun($tagged_phrase, ["cur_node" => $tree_jj["cur_node"]]); if ($tree_nn["cur_node"] == $cur_node) { $tree["NP"] = ""; - } else { - $cur_node = $tree_nn["cur_node"]; - unset($tree_jj["cur_node"]); - $tree_new_sub["JJ"] = $tree_jj; - unset($tree_nn["cur_node"]); - $tree_new_sub["NN"] = $tree_nn; - $tree_new["cur_node"] = $cur_node; - $tree_new["NP"] = $tree_new_sub; - return $tree_new; - } - return $tree; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a verb if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * 
"VB" a subarray with a token node for the verb string that was - * parsed - */ - public static function extractVerb($tagged_phrase, $tree) - { - $cur_node = $tree["cur_node"]; - $verb_string = ""; - while (isset($tagged_phrase[$cur_node]["tag"]) && - in_array(trim($tagged_phrase[$cur_node]["tag"]), - self::$verb_phrases)) { - $verb_string .= " " . $tagged_phrase[$cur_node]["token"]; - $cur_node++; - } - if (!empty($verb_string)) { - $tree["VB"] = $verb_string; + return $tree; } - $tree["cur_node"] = $cur_node; - return $tree; + $cur_node = $tree_nn["cur_node"]; + unset($tree_jj["cur_node"], $tree_nn["cur_node"]); + return ["cur_node" => $cur_node, "NP" => + ["JJ" => $tree_jj, "NN" => $tree_nn] ]; } /** * Takes a part-of-speech tagged phrase and pre-tree with a @@ -469,73 +475,30 @@ class Tokenizer * "VP" a subarray with possible fields * "VB" with value a verb subtree */ - public static function extractVerbPhrase($tagged_phrase, $tree) + public static function parseVerbPhrase($tagged_phrase, $tree) { $cur_node = $tree["cur_node"]; - $tree_vb = self::extractVerb($tagged_phrase, ["cur_node" => $cur_node]); + $tree_vb = self::parseVerb($tagged_phrase, ["cur_node" => $cur_node]); if ($tree_vb["cur_node"] == $cur_node) { $tree["VP"] = []; return $tree; } $cur_node = $tree_vb["cur_node"]; - $postposition_string = ""; - while (isset($tagged_phrase[$cur_node]["tag"]) && - in_array(trim($tagged_phrase[$cur_node]["tag"]), - self::$postpositional_phrases)) { - $postposition_string .= " ". 
$tagged_phrase[$cur_node]["token"]; - $cur_node++; - } + $postposition_string = self::parseTypeList($cur_node, + $tagged_phrase, self::$postpositional_type); if (!empty($postposition_string)) { $tree_vb["IN"] = $postposition_string; } - $tree_np = self::extractNounPhrase($tagged_phrase, - ["cur_node" => $cur_node]); - $tree_new = []; - $tree_new_sub = []; + $tree_np = self::parseNounPhrase($tagged_phrase, + ["cur_node" => $cur_node]);; if ($tree_np["cur_node"] != $cur_node) { $cur_node = $tree_np["cur_node"]; unset($tree_vb["cur_node"], $tree_np["cur_node"]); - $tree_new_sub["VB"] = $tree_vb; - $tree_new_sub["NP"] = $tree_np["NP"]; - $tree_new["cur_node"] = $cur_node; - $tree_new["VP"] = $tree_new_sub; - return $tree_new; + return ['cur_node' => $cur_node, 'VP' =>['VB' => $tree_vb, + 'NP' => $tree_np["NP"]]]; } unset($tree_vb["cur_node"]); - $tree_new_sub["VB"] = $tree_vb; - $tree_new["cur_node"] = $cur_node; - $tree_new["VP"] = $tree_new_sub; - return $tree_new; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for an adjective if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["cur_node" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * "JJ" a subarray with a token node for the adjective that was - * parsed - */ - public static function extractAdjective($tagged_phrase, $tree) - { - $adjective_string = ""; - $cur_node = $tree["cur_node"]; - while (isset($tagged_phrase[$cur_node]["tag"]) && - in_array(trim($tagged_phrase[$cur_node]["tag"]), - self::$adjective_phrases)) { - $adjective_string .= " " . 
$tagged_phrase[$cur_node]["token"]; - $cur_node++; - } - if (!empty($adjective_string)) { - $tree["JJ"] = $adjective_string; - } - $tree["cur_node"] = $cur_node; - return $tree; + return ['cur_node' => $cur_node, 'VP' => ['VB' => $tree_vb]]; } /** * Given a part-of-speeech tagged phrase array generates a parse tree @@ -550,20 +513,17 @@ class Tokenizer * $tree["POST"] contains a subtree for a object phrase * $tree["VP"] contains a subtree for a predicate phrase */ - public static function generatePhraseParseTree($tagged_phrase) + public static function parseWholePhrase($tagged_phrase, $tree) { - $tree = []; - $tree_np = self::extractNounPhrase($tagged_phrase,["cur_node" => 0]); - $tree = ["cur_node" => $tree_np["cur_node"]]; - $tree_pp = self::extractPostpositionPhrase($tagged_phrase, $tree); - $tree["cur_node"] = $tree_pp["cur_node"]; - $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree); - $tree["cur_node"] = $tree_vp["cur_node"]; + $tree_np = self::parseNounPhrase($tagged_phrase,["cur_node" => 0]); + $tree_pp = self::parsePostpositionPhrase($tagged_phrase, + ["cur_node" => $tree_np["cur_node"]] ); + $tree_vp = self::parseVerbPhrase($tagged_phrase, + ["cur_node" => $tree_pp["cur_node"]] ); + $cur_node = $tree_vp["cur_node"]; unset($tree_np["cur_node"], $tree_pp["cur_node"], $tree_vp["cur_node"]); - $tree["NP"] = $tree_np["NP"]; - $tree["POST"] = $tree_pp["POST"]; - $tree["VP"] = $tree_vp["VP"]; - return $tree; + return ["cur_node" => $cur_node, "NP" => $tree_np["NP"], + "POST" => $tree_pp["POST"], "VP" => $tree_vp["VP"]]; } /** * Scans a word list for phrases. 
For phrases found generate @@ -587,7 +547,8 @@ class Tokenizer $sentence = preg_replace("/\s+/u", " ", $word_and_phrase); $sentence = trim($sentence); $tagged_phrase = self::tagTokenizePartOfSpeech($sentence); - $parse_tree = self::generatePhraseParseTree($tagged_phrase); + $parse_tree = self::parseWholePhrase($tagged_phrase, + ["cur_node" => 0]); $triplets = self::extractTripletsParseTree($parse_tree); $extracted_triplets = self::rearrangeTripletsByType($triplets); foreach ($triplet_types as $type) { @@ -772,7 +733,6 @@ class Tokenizer && !empty($sub_pred_obj_triplets["predicate"][$type]) && !empty($sub_pred_obj_triplets["object"][$type])) { $question_answer_triplets = []; - $question_marker = self::$question_marker; $sentence = [$sub_pred_obj_triplets["subject"][$type], $sub_pred_obj_triplets["object"][$type], $sub_pred_obj_triplets["predicate"][$type]]; @@ -780,7 +740,7 @@ class Tokenizer for ($j = 0; $j < 2; $j++) { for ($i = 0; $i < 3; $i++) { $question = $sentence; - $question[$i] = $question_marker; + $question[$i] = self::$question_token; $question_string = implode(" ", $question); $question_string = trim($question_string); $question_string = preg_replace("/\s+/u", " ", @@ -806,20 +766,19 @@ class Tokenizer public static function parseQuestion($tagged_question, $index) { $generated_questions = []; - $question_marker = trim(self::getQuestionMarker()); $triplets = []; - $tree_np = self::extractNounPhrase($tagged_question, + $tree_np = self::parseNounPhrase($tagged_question, ["cur_node" => 0]); $triplets["subject"] = self::extractSubjectParseTree($tree_np); - $tree_vp = self::extractVerbPhrase($tagged_question, - ["cur_node" => $index+1]); + $tree_vp = self::parseVerbPhrase($tagged_question, + ["cur_node" => $index + 1]); $triplets["predicate"] = self::extractPredicateParseTree($tree_vp); $triplet_types = ["CONCISE", "RAW"]; foreach ($triplet_types as $type) { if (!empty($triplets["subject"][$type]) && !empty($triplets["predicate"][$type])) { $question = trim 
(trim($triplets["subject"][$type]) . - " " . $question_marker . + " " . self::$question_token . " " . trim($triplets["predicate"][$type])); $question = preg_replace("/\s+/u", " ", $question); $generated_questions[$type][] = $question; @@ -836,22 +795,7 @@ class Tokenizer */ public function isQuestion($phrase) { - $phrase = trim($phrase); - for ($i = 0; $i < count(self::$questions); $i++) { - if (mb_strpos($phrase, trim(self::$questions[$i])) !== false) { - return true; - } - } - return false; - } - /** - * The function returns the question marker for the locale - * - * @return the question marker - */ - public static function getQuestionMarker() - { - return self::$question_marker; + return preg_match(self::$question_pattern, $phrase); } /** * Takes questions and returns the triplet from the question diff --git a/src/locale/hi/resources/all_aux_grams.txt b/src/locale/hi/resources/all_aux_grams.txt new file mode 100755 index 000000000..c9b76d32f --- /dev/null +++ b/src/locale/hi/resources/all_aux_grams.txt @@ -0,0 +1 @@ +महामा गाँधी diff --git a/src/locale/hi/resources/all_word_grams.ftr b/src/locale/hi/resources/all_word_grams.ftr new file mode 100644 index 000000000..6d8246a5c Binary files /dev/null and b/src/locale/hi/resources/all_word_grams.ftr differ diff --git a/src/locale/it/resources/Tokenizer.php b/src/locale/it/resources/Tokenizer.php index f6968e4fa..20d1bc1f0 100755 --- a/src/locale/it/resources/Tokenizer.php +++ b/src/locale/it/resources/Tokenizer.php @@ -102,12 +102,13 @@ class Tokenizer /** * Removes the stop words from the page (used for Word Cloud generation) * - * @param string $page the page to remove stop words from. 
- * @return string $page with no stop words + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words */ - public static function stopwordsRemover($page) + public static function stopwordsRemover($data) { - $stop_words = [ + static $stop_words = [ 'http', 'https', "ad", "al", "allo", "ai", "agli", "all", "agl", "alla", "alle", "con", "col", "coi", "da", "dal", "dallo", "dai", @@ -150,9 +151,12 @@ class Tokenizer "steste", "stettero", "stessi", "stesse", "stessimo", "stessero", "stando" ]; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '', - mb_strtolower($page)); - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; } /** * Computes the stem of an Italian word diff --git a/src/locale/ru/resources/Tokenizer.php b/src/locale/ru/resources/Tokenizer.php index 78d3ff7b7..ea75a4af2 100755 --- a/src/locale/ru/resources/Tokenizer.php +++ b/src/locale/ru/resources/Tokenizer.php @@ -68,12 +68,13 @@ class Tokenizer /** * Removes the stop words from the page (used for Word Cloud generation) * - * @param string $page the page to remove stop words from. 
- * @return string $page with no stop words + * @param mixed $data either a string or an array of string to remove + * stop words from + * @return mixed $data with no stop words */ - public static function stopwordsRemover($page) + public static function stopwordsRemover($data) { - $stop_words = ["й", "ч", "чп", "ое", "юфп", + static $stop_words = ["й", "ч", "чп", "ое", "юфп", "по", "об", "с", "у", "уп", "лбл", "б", "фп", "чуе", "поб", "фбл", "езп", "оп", "дб", "фщ", "л", "х", "це", "чщ", "ъб", "вщ", "рп", @@ -106,9 +107,12 @@ class Tokenizer "фблпк", "йн", "впмее", "чуездб", "лпоеюоп", "чуа", "нецдх", 'http', 'https' ]; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '', - strtolower($page)); - return $page; + static $pattern = ""; + if (empty($pattern)) { + $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u'; + } + $data = preg_replace($pattern, '', $data); + return $data; } /** * Computes the stem of a Russian word diff --git a/tests/EnTokenizerTest.php b/tests/EnTokenizerTest.php index b36598eca..4c2aed370 100644 --- a/tests/EnTokenizerTest.php +++ b/tests/EnTokenizerTest.php @@ -103,7 +103,7 @@ class EnTokenizerTest extends UnitTest } /** * Tests the question answering system for English. Sees if correctly - * ectract [s v o] stemmed triplets from sentences, and whether it can + * extracts [s v o] stemmed triplets from sentences, and whether it can * use those to answer questions. 
*/ public function questionAnswerTestCase() diff --git a/tests/IndexDictionaryTest.php b/tests/IndexDictionaryTest.php new file mode 100644 index 000000000..8e6560940 --- /dev/null +++ b/tests/IndexDictionaryTest.php @@ -0,0 +1,128 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2018 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @license https://www.gnu.org/licenses/ GPL3 + * @link https://www.seekquarry.com/ + * @copyright 2009 - 2018 + * @filesource + */ +namespace seekquarry\yioop\tests; + +use seekquarry\yioop\configs as C; +use seekquarry\yioop\library as L; +use seekquarry\yioop\library\CrawlConstants; +use seekquarry\yioop\library\IndexShard; +use seekquarry\yioop\library\IndexDictionary; +use seekquarry\yioop\library\UnitTest; + +/** + * Used to test that the IndexDictionary class can properly add shards + * and retrieve correct posting slice ranges in the shards. + * + * @author Chris Pollett + */ +class IndexDictionaryTest extends UnitTest +{ + /** + * Construct some index shard we can add documents to + */ + public function setUp() + { + $this->test_objects['shard'] = new IndexShard(C\WORK_DIRECTORY. 
+ "/shard.txt", 0); + $this->test_objects['shard2'] = new IndexShard(C\WORK_DIRECTORY. + "/shard2.txt", 1); + $this->test_objects['shard3'] = new IndexShard(C\WORK_DIRECTORY. + "/shard3.txt", 2); + $this->test_objects['dictionary'] = new IndexDictionary( + C\WORK_DIRECTORY . "/dictionary", null); + } + /** + * Deletes any index shard files we may have created + */ + public function tearDown() + { + set_error_handler(null); + @unlink(C\WORK_DIRECTORY . "/shard.txt"); + @unlink(C\WORK_DIRECTORY . "/shard2.txt"); + @unlink(C\WORK_DIRECTORY . "/shard3.txt"); + $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; + $db = new $dbms_manager(); + $db->unlinkRecursive(C\WORK_DIRECTORY . "/dictionary", true); + set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); + } + /** + * Check that appending two index shards works correctly + */ + public function addShardDictionaryTestCase() + { + $docid = "AAAAAAAABBBBBBBBCCCCCCCC"; //set up doc + $offset = 5; + $word_counts = [ + 'MMMMMMMM' => [1, 3, 5], + 'NNNNNNNN' => [2, 4, 6], + 'OOOOOOOO' => [7, 8, 9], + ]; + $meta_ids = ["PPPPPPPP", "QQQQQQQQ"]; + $this->test_objects['shard']->addDocumentWords($docid, + $offset, $word_counts, $meta_ids); + $this->assertEqual($this->test_objects['shard']->len_all_link_docs, 9, + "Len All Docs Correctly Counts Length of First Doc"); + $this->test_objects['shard']->save(); + $shard = new IndexShard(C\WORK_DIRECTORY . 
+ "/shard.txt", 0, C\NUM_DOCS_PER_GENERATION, true); + $word_id = L\crawlHashWord('MMMMMMMM'); + $shard_info = $shard->getWordInfo($word_id); + $this->test_objects['dictionary']->addShardDictionary($shard); + $dict_info = $this->test_objects['dictionary']->getWordInfo($word_id); + array_shift($dict_info[0]); + $first_entry = array_shift($dict_info); + $this->assertEqual($shard_info, $first_entry, + "Shard word entry agrees with dictionary word entry"); + $docid = "AAAAAAAABBBBBBBBEEEEEEEE"; + $offset = 10; + $word_counts = [ + 'BBBBBBBB' => [1], + 'CCCCCCCC' => [2], + 'MMMMMMMM' => [6], + ]; + $meta_ids = ["EEEEEEEE", "FFFFFFFF"]; + $this->test_objects['shard2']->addDocumentWords($docid, + $offset, $word_counts, $meta_ids); + $this->test_objects['shard2']->save(); + $shard = new IndexShard(C\WORK_DIRECTORY . + "/shard2.txt", 1, C\NUM_DOCS_PER_GENERATION, true); + $word_id = L\crawlHashWord('MMMMMMMM'); + $shard_info2 = $shard->getWordInfo($word_id); + $this->test_objects['dictionary']->addShardDictionary($shard); + $dict_info = $this->test_objects['dictionary']->getWordInfo($word_id); + $this->assertEqual(count($dict_info), 2, + "After second shard insert have two entries for all M word"); + array_shift($dict_info[1]); + $second_entry = $dict_info[1]; + $this->assertEqual($shard_info2, $second_entry, + "Second entry in two shard case for M word matches expected"); + } +} diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php index 22d6f758e..a05fc593c 100644 --- a/tests/IndexShardTest.php +++ b/tests/IndexShardTest.php @@ -338,12 +338,12 @@ class IndexShardTest extends UnitTest $this->test_objects['shard']->save(); $this->test_objects['shard2'] = IndexShard::load(C\WORK_DIRECTORY. 
"/shard.txt"); + $word_info = $this->test_objects['shard']->getWordInfo( + L\crawlHashWord('FFFFFFFF')); $this->assertEqual($this->test_objects['shard2']->len_all_docs, 3, "Len All Docs Correctly Counts Length of First Doc"); - $c_data = $this->test_objects['shard2']->getPostingsSliceById( L\crawlHashWord('BBBBBBBB', true), 5); - $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup by word works"); $c_data = $this->test_objects['shard2']->getPostingsSliceById( @@ -386,5 +386,15 @@ class IndexShardTest extends UnitTest L\crawlHashWord('FFFFFFFF', true), 5); $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 2 by word works"); + // Check if save without dictionary preserves postings + $word_info = $this->test_objects['shard']->getWordInfo( + L\crawlHashWord('FFFFFFFF')); + $this->test_objects['shard']->saveWithoutDictionary(); + $shard = new IndexShard(C\WORK_DIRECTORY . + "/shard.txt", 0, C\NUM_DOCS_PER_GENERATION, true); + $c_data = $shard->getPostingsSlice($word_info[0], + $word_info[0], $word_info[1], 5); + $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), + "Save without dictionary test works"); } } diff --git a/tests/PhraseParserTest.php b/tests/PhraseParserTest.php index ea429e151..9775f8250 100644 --- a/tests/PhraseParserTest.php +++ b/tests/PhraseParserTest.php @@ -70,7 +70,6 @@ EOD; $this->assertTrue(in_array("dr", $words), "Abbreviation 1"); $this->assertTrue(in_array("_ty", $words), "Initials 1"); $this->assertTrue(in_array("_jrr", $words), "Initials 2"); - $phrase_string = <<< EOD THE THE ‘Deep Space nine’ ‘Deep Space’ version of GIANT the the @@ -84,9 +83,9 @@ EOD; $this->assertTrue(in_array("the the", $words), "Extract Bigram 1"); $this->assertTrue(in_array("deep space", $words), "Extract Bigram 2"); $this->assertTrue(in_array("deep", $words), "Unigrams still present 1"); - $this->assertTrue(in_array("space", $words), "Unigrams still present 2"); + $this->assertTrue(in_array("space", 
$words), + "Unigrams still present 2"); $this->assertTrue(in_array("2012", $words), "Punctuation removal 1"); - $phrase_string = <<< EOD 百度一下,你就知道 .. 知 道 MP3 图 片 视 频 地 图 输入法 手写 @@ -126,7 +125,7 @@ EOD; $this->assertTrue(in_array("a_and_w", $words), "Ampersand Test 1"); $this->assertTrue(in_array("a_and_tt", $words), "Ampersand Test 2"); $this->assertTrue(in_array("fish_and_chip", $words), "n for and test"); - $this->assertTrue(in_array("chris_a_pollett_d_org", $words), + $this->assertTrue(in_array("chris_at_pollett_d_org", $words), "Email Check 1"); $this->assertTrue(in_array( "http_c__s__s_www_d_yo_d_org_s_index_d_pl_q_a_e_b_and_c_e_d", @@ -257,8 +256,8 @@ EOD; $this->assertTrue(array_diff($segments, $correct_segments) == [], "Segmenter Test 2"); $segments = PhraseParser::segmentSegment("你们好吗?", 'zh-CN'); - $correct_segments = ["你们", "好", "吗", "?"]; - $this->assertTrue((count($segments) == 4), "Segmenter Test 3"); + $correct_segments = ["你们", "好", "吗"]; + $this->assertTrue((count($segments) == 3), "Segmenter Test 3"); $this->assertTrue(array_diff($segments, $correct_segments) == [], "Segmenter Test 4"); }