Adjust bloom filter size calculation, slightly refactor IndexDictionary and IndexShard, improve guessLocaleFromString to handle Hindi, more work on n-word-gram filters, simplify and speed up each summarizer, also refactor sentence extraction to be more accurate, improve English tokenizer's parser for question answering, tweak some test cases, a=chris

Chris Pollett [2018-06-30 21:Jun:th]
Adjust bloom filter size calculation, slightly refactor IndexDictionary and IndexShard, improve guessLocaleFromString to handle Hindi, more work on n-word-gram filters, simplify and speed up each summarizer, also refactor sentence extraction to be more accurate, improve English tokenizer's parser for question answering, tweak some test cases, a=chris
Filename
src/configs/Config.php
src/configs/TokenTool.php
src/controllers/components/CrawlComponent.php
src/executables/ArcTool.php
src/executables/Fetcher.php
src/library/BloomFilterFile.php
src/library/IndexDictionary.php
src/library/IndexShard.php
src/library/LocaleFunctions.php
src/library/NWordGrams.php
src/library/PhraseParser.php
src/library/UrlParser.php
src/library/processors/HtmlProcessor.php
src/library/processors/TextProcessor.php
src/library/summarizers/CentroidSummarizer.php
src/library/summarizers/CentroidWeightedSummarizer.php
src/library/summarizers/GraphBasedSummarizer.php
src/library/summarizers/ScrapeSummarizer.php
src/library/summarizers/Summarizer.php
src/locale/ar/resources/Tokenizer.php
src/locale/de/resources/Tokenizer.php
src/locale/en_US/resources/Tokenizer.php
src/locale/en_US/resources/all_aux_grams.txt
src/locale/en_US/resources/all_word_grams.ftr
src/locale/es/resources/Tokenizer.php
src/locale/fa/resources/Tokenizer.php
src/locale/fr_FR/resources/Tokenizer.php
src/locale/hi/resources/Tokenizer.php
src/locale/hi/resources/all_aux_grams.txt
src/locale/hi/resources/all_word_grams.ftr
src/locale/it/resources/Tokenizer.php
src/locale/ru/resources/Tokenizer.php
tests/EnTokenizerTest.php
tests/IndexDictionaryTest.php
tests/IndexShardTest.php
tests/PhraseParserTest.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 293cacc7f..84ba574a3 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -806,7 +806,7 @@ nsconddefine('THUMB_DIM', 128);
 nsconddefine('THUMB_SIZE', 1000000);
 /** Characters we view as not part of words, not same as POSIX [:punct:]*/
 nsconddefine('PUNCT', "\.|\,|\:|\;|\"|\'|\[|\/|\%|\?|-|" .
-    "\]|\{|\}|\(|\)|\!|\||\&|\`|" .
+    "\]|\{|\}|\(|\)|\!|\||।|\&|\`|" .
     "\’|\‘|©|®|™|℠|…|\/|\>|,|\=|。|)|:|、|" .
     "”|“|《|》|(|「|」|★|【|】|·|\+|\*|;".
         "|!|—|―|?|!|،|؛|؞|؟|٪|٬|٭");
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index 47115cf67..6843c84ee 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -204,7 +204,7 @@ function makeNWordGramsFiles($args)
     }
     if (!isset($args[5]) && $args[3] == "all" &&
         $args[4] == NWordGrams::PAGE_COUNT_WIKIPEDIA) {
-        $args[5] = 100000;
+        $args[5] = 75000;
     } else {
         $args[5] = -1;
     }
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index faecd6d31..ab94ce27f 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1526,9 +1526,8 @@ class CrawlComponent extends Component implements CrawlConstants
                 if ($site[self::TITLE] != "" ) {
                     $lang = L\guessLocaleFromString($site[self::TITLE], $lang);
                 } else {
-                    $lang = L\guessLocaleFromString(
-                        substr($site[self::DESCRIPTION], 0,
-                        C\AD_HOC_TITLE_LENGTH), $lang);
+                    $lang = L\guessLocaleFromString($site[self::DESCRIPTION],
+                        $lang);
                 }
                 $word_lists = PhraseParser::extractPhrasesInLists(
                     $phrase_string, $lang);
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 9e0c88cd7..8db9c043b 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -48,8 +48,7 @@ if (php_sapi_name() != 'cli' ||
     defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
     echo "BAD REQUEST"; exit();
 }
-ini_set("memory_limit","2000M"); /*
-        reindex sometimes takes more than the default 128M, 850 to be safe */
+ini_set("memory_limit","2500M");
 /** This tool does not need logging*/
 $_SERVER["LOG_TO_FILES"] = false;
 /** USE_CACHE false rules out file cache as well*/
@@ -874,8 +873,7 @@ class ArcTool implements CrawlConstants
                     intval(file_get_contents($shard_count_file));
                 echo "Restarting rebuild index from $start_generation\n";
             } else {
-                $start_generation= 0;
-                file_put_contents($shard_count_file, $start_generation);
+                $start_generation = 0;
             }
         }
         $info = $archive_name::getArchiveInfo($archive_path);
@@ -885,11 +883,12 @@ class ArcTool implements CrawlConstants
             file_get_contents("$archive_path/generation.txt"));
         $num_generations = $generation_info['ACTIVE'] + 1;
         $archive = new WebArchiveBundle($archive_path."/summaries");
+        $dictionary_path = $archive_path . "/dictionary";
         $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
         $db = new $dbms_manager();
         $db->unlinkRecursive($archive_path . "/dictionary", false);
-        IndexDictionary::makePrefixLetters($archive_path . "/dictionary");
-        $dictionary = new IndexDictionary($archive_path . "/dictionary");
+        IndexDictionary::makePrefixLetters($dictionary_path);
+        $dictionary = new IndexDictionary($dictionary_path);
         $seen = 0;
         $generation = $start_generation;
         $keypad = "\x00\x00\x00\x00";
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 6b5c16d63..d9d7fb7ea 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2809,13 +2809,14 @@ class Fetcher implements CrawlConstants
                             self::LANG]])) {
                             $phrase_string = $site[self::DESCRIPTION];
                         } else {
-                            $phrase_string = $host_words." .. ".
+                            $phrase_string = $host_words . " .. ".
                                 $site[self::TITLE] . " ..  ".
                                 $path_words . " .. ". $site[self::DESCRIPTION];
                         }
                     } else {
-                        $phrase_string = $host_words." ".$site[self::TITLE] .
-                            " ". $path_words . " ". $site[self::DESCRIPTION];
+                        $phrase_string = $host_words . " " .
+                            $site[self::TITLE] . " ". $path_words . " ".
+                            $site[self::DESCRIPTION];
                     }
                 }
                 if (empty($site[self::LANG])) {
diff --git a/src/library/BloomFilterFile.php b/src/library/BloomFilterFile.php
index 6d9d3c6f9..917d14968 100755
--- a/src/library/BloomFilterFile.php
+++ b/src/library/BloomFilterFile.php
@@ -76,8 +76,12 @@ class BloomFilterFile extends PersistentStructure
         $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
     {
         $log2 = log(2);
+        $log2_sq = $log2 * $log2;
+        /* choose the number of keys so that the probability of a
+           false positive is 1/$num_values.
+         */
         $this->num_keys = ceil(log($num_values)/$log2);
-        $this->filter_size = ceil( ($this->num_keys) * $num_values/$log2 );
+        $this->filter_size = ceil( ($this->num_keys) * $num_values/$log2_sq);
         $mem_before =  memory_get_usage(true);
         $this->filter = pack("x". ceil(0.125 * $this->filter_size));
             // 1/8 =.125 = num bits/bytes, want to make things floats
diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php
index fae5f8348..b2539991b 100644
--- a/src/library/IndexDictionary.php
+++ b/src/library/IndexDictionary.php
@@ -163,22 +163,30 @@ class IndexDictionary implements CrawlConstants
         if (!is_dir($this->dir_name)) {
             mkdir($this->dir_name);
             IndexDictionary::makePrefixLetters($this->dir_name);
+            $this->active_tiers = [];
             $this->max_tier = 0;
         } else {
             $this->max_tier = unserialize(
                 file_get_contents($this->dir_name."/max_tier.txt"));
-            $this->read_tier = $this->max_tier;
-            $tiers = glob($this->dir_name."/0/*A.dic");
-            natsort($tiers);
-            $this->active_tiers = [];
-            foreach ($tiers as $tier) {
-                $path = pathinfo($tier);
-                array_unshift($this->active_tiers,
-                    substr($path["filename"], 0, -1));
-            }
+            $this->calculateActiveTiers();
         }
         $this->parent_archive_bundle = $parent_archive_bundle;
     }
+    /**
+     * Recomputes $this->read_tier and $this->active_tiers from the *A.dic files
+     */
+    public function calculateActiveTiers()
+    {
+        $this->read_tier = $this->max_tier;
+        $tiers = glob($this->dir_name."/0/*A.dic");
+        natsort($tiers);
+        $this->active_tiers = [];
+        foreach ($tiers as $tier) {
+            $path = pathinfo($tier);
+            array_unshift($this->active_tiers,
+                substr($path["filename"], 0, -1));
+        }
+    }
     /**
      * Makes dictionary sub-directories for each of the 256 possible first
      * hash characters that crawHash in raw mode code output.
@@ -205,7 +213,7 @@ class IndexDictionary implements CrawlConstants
     public function addShardDictionary($index_shard, $callback = null)
     {
         $out_slot = "A";
-        if (file_exists($this->dir_name."/0/0A.dic")) {
+        if (file_exists($this->dir_name . "/0/0A.dic")) {
             $out_slot ="B";
         }
         crawlLog("Adding shard data to index dictionary files...");
@@ -273,10 +281,10 @@ class IndexDictionary implements CrawlConstants
                 $callback->join();
             }
             $out_slot = "A";
-            if (file_exists($this->dir_name."/0/".($tier + 1)."A.dic")) {
+            if (file_exists($this->dir_name . "/0/" . ($tier + 1) . "A.dic")) {
                 $out_slot ="B";
             }
-            crawlLog("..Merging index $tier to ".($tier +1).$out_slot);
+            crawlLog("..Merging index $tier to " . ($tier +1) . $out_slot);
             $this->mergeTier($tier, $out_slot);
             $tier++;
             if ($tier > $this->max_tier) {
@@ -285,6 +293,7 @@ class IndexDictionary implements CrawlConstants
                     serialize($this->max_tier));
             }
         }
+        $this->calculateActiveTiers();
         crawlLog("...Done Incremental Merging of Index Dictionary Tiers");
         return true;
     }
@@ -725,7 +734,10 @@ class IndexDictionary implements CrawlConstants
      *      $start_generation
      * @param bool $with_remaining_total
      * @return mixed an array of entries of the form
-     *     generation, first offset, last offset, count
+     *     generation, first offset, last offset, count, matched_key
+     *     If $with_remaining_total is true, then instead returns a pair
+     *     whose second element is the array described above and whose
+     *     first element is the estimated total number of docs
      */
      public function getWordInfo($word_id, $raw = false, $shift = 0,
         $threshold = -1, $start_generation = -1, $num_distinct_generations = -1,
@@ -798,8 +810,8 @@ class IndexDictionary implements CrawlConstants
      *      to return information about
      * @return mixed a pair(total_count, max_found_generation,
      *      an array of entries of the form
-     *      generation, first offset, last offset, count) or false if
-     *      no data
+     *      generation, first offset, last offset, count, matched_key) or
+     *      false if no data
      */
      public function getWordInfoTier($word_id, $raw, $tier, $shift = 0,
         $threshold = -1, $start_generation = -1, $num_distinct_generations = -1)
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index 93f37dd4e..bb87d56cf 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -521,6 +521,8 @@ class IndexShard extends PersistentStructure implements
             $this->mergeWordPostingsToString();
             $this->packWords(null);
             $this->outputPostingLists();
+        } else if ($this->read_only_from_disk && empty($this->num_docs)) {
+            $this->getShardHeader();
         }
         $num_docs_so_far = 0;
         $results = [];
@@ -1247,9 +1249,9 @@ class IndexShard extends PersistentStructure implements
             crawlLog("Saving index shard .. done merge postings to string");
         }
         $this->prepareWordsAndPrefixes($with_logging);
-            if ($with_logging) {
-                crawlLog("Saving index shard .. make prefixes");
-            }
+        if ($with_logging) {
+            crawlLog("Saving index shard .. make prefixes");
+        }
         $header =  pack("N*", $this->prefixes_len,
             $this->words_len,
             $this->word_docs_len,
@@ -1322,6 +1324,7 @@ class IndexShard extends PersistentStructure implements
         if($with_logging) {
             crawlLog("..without dictionary version of shard header written");
         }
+        $this->packWords(null, $with_logging);
         $remaining = $this->word_docs_len;
         $offset = 0;
         $buffer_size = 16 * self::SHARD_BLOCK_SIZE;
@@ -1747,7 +1750,7 @@ class IndexShard extends PersistentStructure implements
      */
     public function getShardHeader()
     {
-        if (isset($this->num_docs) && $this->num_docs > 0) {
+        if (!empty($this->num_docs)) {
             return true; // if $this->num_docs > 0 assume have read in
         }
         $header = substr($this->readBlockShardAtOffset(0, false),
diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php
index 49dd94d30..44412ebde 100755
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@@ -100,41 +100,40 @@ function guessLocale()
  */
 function guessLocaleFromString($phrase_string, $locale_tag = null)
 {
-    $original_phrase_string = $phrase_string;
+    $original_phrase_string = mb_substr($phrase_string, 0,
+        C\AD_HOC_TITLE_LENGTH);
     $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag;
     $sub = C\PUNCT."|[0-9]|\s";
     $phrase_string = preg_replace('/'.$sub.'/u', "", $phrase_string);
     $phrase_string = mb_convert_encoding($phrase_string, "UTF-32", "UTF-8");
     $len = strlen($phrase_string);
-    $guess['zh-CN'] = 0;
-    $guess['ru'] = 0;
-    $guess['he'] = 0;
-    $guess['ar'] = 0;
-    $guess['th'] = 0;
-    $guess['ja'] = 0;
-    $guess['ko'] = 0;
+    $guess = ['ar' => 0, 'he' => 0, 'hi' => 0, 'ko' => 0, 'ja' => 0, 'ru' => 0,
+        'th' => 0, 'zh-CN' => 0];
     $guess[$locale_tag] = 1;
     for ($i = 0; $i < $len; $i += 4) {
         $start = ord($phrase_string[$i+2]);
         $next = ord($phrase_string[$i+3]);
-        if ($start >= 78 && $start <= 159) {
-            $guess['zh-CN'] += 4;
-        } else if ($start == 4 || ($start == 5 && $next < 48)) {
-            $guess['ru']++;
-        } else if ($start == 5 && $next >= 144) {
-            $guess['he'] += 2;
-        } else if ($start >= 6 && $start <= 7) {
+        if ($start >= 6 && $start <= 7) {
             if ($locale_tag == "fa") {
                 $guess[$locale_tag] +=2;
             } else {
                 $guess['ar'] += 2;
             }
-        } else if ($start == 14 && $next < 128) {
-            $guess['th'] += 2;
-        } else if ($start >= 48 && $start <= 49) {
-            $guess['ja'] += 3;
+        } else if ($start == 5 && $next >= 144) {
+            $guess['he'] += 2;
+        } else if (($start == 9 && $next < 128) || ($start == 168 &&
+            $next >= 224)) {
+            $guess['hi'] += 2;
         } else if ($start == 17 || $start >= 172 && $start < 215) {
             $guess['ko'] += 2;
+        } else if ($start >= 48 && $start <= 49) {
+            $guess['ja'] += 3;
+        } else if ($start == 4 || ($start == 5 && $next < 48)) {
+            $guess['ru']++;
+        } else if ($start == 14 && $next < 128) {
+            $guess['th'] += 2;
+        } else if ($start >= 78 && $start <= 159) {
+            $guess['zh-CN'] += 4;
         } else if ($start == 0 && $next < 128) {
             $guess[$locale_tag]++; // assume ascii is from $locale_tag
         }
diff --git a/src/library/NWordGrams.php b/src/library/NWordGrams.php
index 2a54cac26..0348c8e05 100644
--- a/src/library/NWordGrams.php
+++ b/src/library/NWordGrams.php
@@ -200,7 +200,7 @@ class NWordGrams
      * @param int $ngram_type where in Wiki Dump to extract grams from
      * @param int $max_terms maximum number of n-grams to compute and put in
      *      file
-     * @return int $num_ngrams_found count of bigrams in text file.
+     * @return int $num_ngrams_found count of n-grams in text file.
      */
     public static function makeNWordGramsTextFile($wiki_file, $lang,
         $locale, $num_gram = 2, $ngram_type = self::PAGE_COUNT_WIKIPEDIA,
@@ -310,6 +310,10 @@ class NWordGrams
                                 }
                                 $ngram_num_words =
                                     mb_substr_count($ngram, " ") + 1;
+                                if ($lang == 'en' && preg_match(
+                                    '/^(a\s|the\s|of\s|if\s)/', $ngram)) {
+                                    $ngram_num_words--;
+                                }
                                 if (($is_all && $ngram_num_words > 1) ||
                                     (!$is_all &&
                                     $ngram_num_words == $num_gram)) {
@@ -330,7 +334,7 @@ class NWordGrams
                     }
                     if ($is_count_type && count($ngrams) > 4 * $max_terms
                         && $max_terms > 0) {
-                        echo  "..pruning results to $max_terms many\n";
+                        echo  "..pruning results to $max_terms terms.\n";
                         arsort($ngrams);
                         $ngrams = array_slice($ngrams, 0, $max_terms);
                     }
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 097963ce8..adf6437df 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -134,7 +134,7 @@ class PhraseParser
             $string = trim(substr($string, strlen($control_word) + 1));
         } else {
             self::canonicalizePunctuatedTerms($string, $lang);
-            self::underscoreEntities($string, $lang);
+            self::hyphenateEntities($string, $lang);
         }
         $terms = self::stemCharGramSegment($string, $lang);
         $num = count($terms);
@@ -154,9 +154,8 @@ class PhraseParser
             return $terms;
         }
         $tokenizer = self::getTokenizer($lang);
-        if (method_exists($tokenizer, "getQuestionMarker") &&
-            stristr($whole_phrase, $tokenizer::getQuestionMarker())
-            !== false) {
+        if (!empty($tokenizer::$question_token) &&
+            stristr($whole_phrase, $tokenizer::$question_token) !== false) {
             $terms = [$whole_phrase, $terms[0]];
             return $terms;
         }
@@ -238,7 +237,7 @@ class PhraseParser
             'QUESTION_ANSWER_EXTRACT' => 0]];
         if (!isset(self::$programming_language_map[$lang])) {
             self::canonicalizePunctuatedTerms($string, $lang);
-            self::underscoreEntities($string, $lang);
+            self::hyphenateEntities($string, $lang);
             $phrase_list['TIMES']['CANONICALIZE'] =
                 changeInMicrotime($start_time);
         }
@@ -271,7 +270,7 @@ class PhraseParser
         return $phrase_list;
     }
     /**
-     * This functions tries to convert acronyms, e-mail, urls, etc into
+     * This method tries to convert acronyms, e-mail, urls, etc into
      * a format that does not involved punctuation that will be stripped
      * as we extract phrases.
      *
@@ -282,31 +281,21 @@ class PhraseParser
      */
     public static function canonicalizePunctuatedTerms(&$string, $lang = null)
     {
-        $acronym_pattern = "/\b[A-Za-z](\.\s*[A-Za-z])+(\.|\b)/";
+        $acronym_pattern = "/\b\p{L}(\.\s*\p{L})+(\.|\b)/u";
         $string = preg_replace_callback($acronym_pattern,
             function($matches) {
-                $result = "_".mb_strtolower(
-                    mb_ereg_replace("\.\s*", "", $matches[0]));
+                $result = "_" . mb_ereg_replace("\.\s*", "", $matches[0]);
                 return $result;
             }, $string);
         $ap = "(\'|\u{2019}|\u{02BC})";
-        $ampersand_pattern = "/[A-Za-z]+".
-            "(\s*(\s({$ap}n|{$ap}N)\s|\&)\s*[A-Za-z])+/u";
+        $ampersand_pattern = "/\p{L}+".
+            "(\s*(\s({$ap}n|{$ap}N)\s|\&)\s*\p{L})+/u";
         $string = preg_replace_callback($ampersand_pattern,
             function($matches) {
                 $ap = "(\'|\u{2019}|\u{02BC})";
-                $result = mb_strtolower(
-                    mb_ereg_replace("\s*(" . $ap . "n|" . $ap . "N|\&)\s*",
-                    "_and_", $matches[0]));
-                return $result;
-            }, $string);
-        $contraction_pattern = "/\b[A-Za-z]+" .
-            "({$ap}[A-Za-z]+|\s*{$ap}\s*(s|t))\b/u";
-        $string = preg_replace_callback($contraction_pattern,
-            function($matches) {
-                $result = mb_strtolower(
-                    mb_ereg_replace("\s*\'|\u2019|\u02BC\s*",
-                    "_ap_", $matches[0]));
+                $result = mb_ereg_replace(
+                    "\s*(" . $ap . "n|" . $ap . "N|\&)\s*",
+                    "_and_", $matches[0]);
                 return $result;
             }, $string);
         $url_or_email_pattern =
@@ -314,21 +303,16 @@ class PhraseParser
             '([A-Z0-9._%-]+\@[A-Z0-9.-]+\.[A-Z]{2,4})@i';
         $string = preg_replace_callback($url_or_email_pattern,
             function($matches) {
-                $result =  mb_ereg_replace("\.", "_d_",$matches[0]);
-                $result =  mb_ereg_replace("\:", "_c_",$result);
-                $result =  mb_ereg_replace("\/", "_s_",$result);
-                $result =  mb_ereg_replace("\@", "_at_",$result);
-                $result =  mb_ereg_replace("\[", "_bo_",$result);
-                $result =  mb_ereg_replace("\]", "_bc_",$result);
-                $result =  mb_ereg_replace("\(", "_po_",$result);
-                $result =  mb_ereg_replace("\)", "_pc_",$result);
-                $result =  mb_ereg_replace("\?", "_q_",$result);
-                $result =  mb_ereg_replace("\=", "_e_",$result);
-                $result =  mb_ereg_replace("\&", "_and_",$result);
-                $result = mb_strtolower($result);
-                return $result;
+                return preg_replace(['/\./', "/\:/", "/\//", "/\@/",
+                    "/\[/", "/\]/", "/\(/", "/\)/", "/\?/", "/\=/", "/\&/"],
+                    ["_d_", "_c_", "_s_", "_at_", "_bo_", "_bc_", "_po_",
+                    "_pc_", "_q_", "_e_", "_and_"], $matches[0]);
             },
             $string);
+            $tokenizer = self::getTokenizer($lang);
+            if (method_exists($tokenizer, "canonicalizePunctuatedTerms")) {
+                $tokenizer->canonicalizePunctuatedTerms($string);
+            }
     }
     /**
      * @param string& $string a string of words, etc which might involve such
@@ -336,16 +320,16 @@ class PhraseParser
      * @param $lang a language tag to use as part of the canonicalization
      *     process not used right now
      */
-    public static function underscoreEntities(&$string, $lang = null)
+    public static function hyphenateEntities(&$string, $lang = null)
     {
         if (!$lang) {
             return;
         }
-        $string = mb_strtolower($string);
         $parts = preg_split("/\s+/u", $string);
         $parts = array_filter($parts);
         $num_parts = count($parts);
         $current_entity = "";
+        $lower_entity = "";
         $out_string = "";
         $space = "";
         $i = 0;
@@ -355,24 +339,36 @@ class PhraseParser
             $j++;
             $current_entity = trim(implode(" ",
                 array_slice($parts, $i, $j - $i)));
+            $lower_entity = mb_strtolower($current_entity);
             if ($j - $i > 1) {
                 if (NWordGrams::ngramsContains(
-                    $current_entity, $lang, "all")) {
+                    $lower_entity, $lang, "all")) {
                     $last_entity = $current_entity;
+                    $lower_last_entity = $lower_entity;
                     $k = $j;
                 }
                 if (!NWordGrams::ngramsContains(
-                    $current_entity . "*", $lang, "all")) {
-                    $out_string .= $space . str_replace(" ", "_",
-                        trim($last_entity));
+                    $lower_entity . "*", $lang, "all")) {
+                    $last_entity = trim($last_entity);
+                    $lower_last_entity = trim($lower_last_entity);
+                    // extra checks as Bloom filter not 100%
+                    if (strpos(substr($last_entity, 4), " ") > 0 &&
+                        !preg_match('/\-|\(|\)|\[|\]|,|\./', $last_entity) &&
+                        NWordGrams::ngramsContains($lower_last_entity, $lang,
+                        "all")) {
+                        $last_entity = str_replace(" ", "-", $last_entity);
+                    }
+                    $out_string .= $space . $last_entity;
                     $space = " ";
                     $current_entity = "";
                     $last_entity = "";
+                    $lower_last_entity ="";
                     $i = $k;
                     $j = $k - 1;
                 }
             } else {
                 $last_entity = $current_entity;
+                $lower_last_entity = $lower_entity;
                 $k = $j;
             }
         }
@@ -456,6 +452,10 @@ class PhraseParser
     public static function stemCharGramSegment($string, $lang,
         $to_string = false)
     {
+        static $non_hyphens = "";
+        if (empty($non_hyphens)) {
+            $non_hyphens = str_replace("-|", "", C\PUNCT);
+        }
         if (isset(self::$programming_language_map[$lang])) {
             mb_internal_encoding("UTF-8");
             $tokenizer_name = self::$programming_language_map[$lang] .
@@ -467,7 +467,7 @@ class PhraseParser
             if ($lang == "hi") {
                 $string = preg_replace('/(,:)\p{P}/u', "", $string);
             }
-            $string = mb_ereg_replace("\s+|" . C\PUNCT, " ", $string);
+            $string = mb_ereg_replace("\s+|$non_hyphens", " ", $string);
             $terms = self::segmentSegment($string, $lang);
             $terms = self::charGramTerms($terms, $lang);
             $terms = self::stemTerms($terms, $lang);
@@ -812,6 +812,9 @@ class PhraseParser
      * not contain spaces), this function segments them according to the given
      * locales segmenter
      *
+     * Note: this method is not used when trying to extract keywords from urls.
+     * Instead, UrlParser::getWordsInHostUrl($url) is used.
+     *
      * @param string $segment string to split into terms
      * @param string $lang IANA tag to look up segmenter under
      *     from some other language
@@ -819,21 +822,24 @@ class PhraseParser
      */
     public static function segmentSegment($segment, $lang)
     {
-        if ($segment == "") {
+        static $non_hyphens = "";
+        if (empty($non_hyphens)) {
+            $non_hyphens = str_replace("-|", "", C\PUNCT);
+        }
+        if (empty($segment) || empty($lang)) {
             return [];
         }
+        $segment_obj = self::getTokenizer($lang);
         $term_string = "";
-        if ($lang != null) {
-            $segment_obj = self::getTokenizer($lang);
-        } else {
-            $segment_obj = null;
-        }
-        if ($segment_obj != null && method_exists($segment_obj, "segment")) {
+        if (!empty($segment_obj) && method_exists($segment_obj, "segment")
+            && strpos($segment, '-') === false) {
             $term_string .= $segment_obj->segment($segment);
         } else {
             $term_string = $segment;
         }
-        $terms = mb_split("\s+", trim($term_string));
+        $terms = preg_split("/(\s|$non_hyphens)+/u",
+            mb_strtolower(trim($term_string)));
+        $terms = array_values(array_filter($terms));
         return $terms;
     }
     /**
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 88939871e..14d224e1a 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -499,11 +499,11 @@ class UrlParser
             return "";
         }
         array_pop($host_parts); // get rid of tld
-        if (stristr($host_parts[0],"www")) {
+        if (stristr($host_parts[0], "www")) {
             array_shift($host_parts);
         }
         $words = array_merge($words, $host_parts);
-        $word_string = " ".implode(" ", $words). " ";
+        $word_string = " " . implode(" ", $words). " ";
         return $word_string;
     }
     /**
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 1aeb650cb..7f738dafe 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -156,10 +156,11 @@ class HtmlProcessor extends TextProcessor
                     $location = self::relCanonical($dom, $url);
                     if ($location) {
                         $summary[self::LINKS] = [];
-                        $summary[self::LINKS][$location] = "location:".$url;
+                        $summary[self::LINKS][$location] = "location:" . $url;
                         $summary[self::LOCATION] = true;
                         if (!$summary[self::DESCRIPTION]) {
-                            $summary[self::DESCRIPTION].=$url." => ".$location;
+                            $summary[self::DESCRIPTION] .=
+                                $url." => ".$location;
                         }
                         if (!$summary[self::TITLE]) {
                             $summary[self::TITLE] = $url;
diff --git a/src/library/processors/TextProcessor.php b/src/library/processors/TextProcessor.php
index 901e4ff02..0f4be9cca 100755
--- a/src/library/processors/TextProcessor.php
+++ b/src/library/processors/TextProcessor.php
@@ -93,17 +93,18 @@ class TextProcessor extends PageProcessor
     {
         $summary = null;
         if (is_string($page)) {
-            $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ',
+            $remove_styles_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ',
                 $page);
-            $dom = self::dom($dom_page);
+            $dom = self::dom($remove_styles_page);
             $summary[self::TITLE] = "";
-            $summary[self::LANG] = self::calculateLang($dom_page);
+            $summary[self::LANG] = self::calculateLang($remove_styles_page);
             list($summary[self::DESCRIPTION], $summary[self::WORD_CLOUD]) =
-                $this->summarizer->getSummary($dom, $dom_page,
+                $this->summarizer->getSummary($dom, $remove_styles_page,
                     $summary[self::LANG]);
-            $summary[self::LINKS] = self::extractHttpHttpsUrls($dom_page);
-            $summary[self::PAGE] = "<html><body><div><pre>".
-                strip_tags($dom_page)."</pre></div></body></html>";
+            $summary[self::LINKS] = self::extractHttpHttpsUrls(
+                $remove_styles_page);
+            $summary[self::PAGE] = "<html><body><div><pre>" .
+                strip_tags($remove_styles_page) . "</pre></div></body></html>";
         }
         return $summary;
     }
@@ -121,7 +122,9 @@ class TextProcessor extends PageProcessor
     {
         if ($url != null) {
             $lang = UrlParser::getLang($url);
-            if ($lang != null) { return $lang; }
+            if ($lang && !in_array($lang, ["en", "en-US"])) {
+                return $lang;
+            }
         }
         if ($sample_text != null) {
             $lang = L\guessLocaleFromString($sample_text);
diff --git a/src/library/summarizers/CentroidSummarizer.php b/src/library/summarizers/CentroidSummarizer.php
index fc743a7b7..ccba98f85 100644
--- a/src/library/summarizers/CentroidSummarizer.php
+++ b/src/library/summarizers/CentroidSummarizer.php
@@ -44,29 +44,6 @@ use seekquarry\yioop\library\processors\PageProcessor;
  */
 class CentroidSummarizer extends Summarizer
 {
-    /**
-     * Number of bytes in a sentence before it is considered long
-     * We use strlen rather than mbstrlen. This might actually be
-     * a better metric of the potential of a sentence to have info.
-     */
-    const LONG_SENTENCE_LEN = 50;
-    /**
-     * Number of sentences in a document before only consider longer
-     * sentences in centroid
-     */
-    const LONG_SENTENCE_THRESHOLD = 100;
-    /**
-     * Number of distinct terms to use in generating summary
-     */
-    const MAX_DISTINCT_TERMS = 1000;
-    /**
-     * Number of words in word cloud
-     */
-    const WORD_CLOUD_LEN = 5;
-    /**
-     * Number of nonzero centroid components
-     */
-    const CENTROID_COMPONENTS = 50;
     /**
      * whether to output the results to the disk or not
      */
@@ -92,94 +69,20 @@ class CentroidSummarizer extends Summarizer
            alphanumerics.
         */
         $formatted_doc = self::formatDoc($page);
-        $stop_obj = PhraseParser::getTokenizer($lang);
-        /* Splitting into sentences */
         $out_sentences = self::getSentences($page);
-        $n = count($out_sentences);
-        $sentences = [];
-        if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
-            for ($i = 0; $i < $n; $i++ ) {
-                $sentences[$i] = $stop_obj->stopwordsRemover(
-                    self::formatDoc($out_sentences[$i]));
-             }
-        } else {
-            $sentences = $out_sentences;
-        }
+        $stop_obj = PhraseParser::getTokenizer($lang);
+        $sentences = self::removeStopWords($out_sentences, $stop_obj);
         /*  Splitting into terms */
         $terms = [];
         foreach ($sentences as $sentence) {
             $terms = array_merge($terms,
                 PhraseParser::segmentSegment($sentence, $lang));
         }
+        $n = count($out_sentences);
         $terms = array_filter($terms);
-        $terms_counts = array_count_values($terms);
-        arsort($terms_counts);
-        $terms_counts = array_slice($terms_counts, 0,
-            self::MAX_DISTINCT_TERMS);
-        $terms = array_unique(array_keys($terms_counts));
-        $t = count($terms);
-        if ($t == 0) {
-            return ["", ""];
-        }
-        /* Initialize Nk [Number of sentences the term occurs] */
-        $nk = [];
-        $nk = array_fill(0, $t, 0);
-        $nt = [];
-        /* Count TF for each word */
-        for ($i = 0; $i < $n; $i++) {
-            for ($j = 0; $j < $t; $j++) {
-                if (strpos($sentences[$i], $terms[$j]) !== false) {
-                    $nk[$j]++;
-                }
-            }
-        }
-        /* Calculate weights of each term for every sentence */
-        $w = [];
-        $idf = [];
-        $idf_temp = 0;
-        for ($k = 0; $k < $t; $k++) {
-            if ($nk[$k] == 0) {
-                $idf_temp = 0;
-                $tmp = 0;
-            } else {
-                $idf_temp = $n / $nk[$k];
-                $tmp = log($idf_temp);
-            }
-            $idf[$k] = $tmp;
-        }
-        /* Count TF for finding centroid */
-        $wc = [];
-        $max_nt = -1;
-        $b = "\b";
-        if (in_array($lang, ["zh-CN", "ja", "ko"])) {
-            $b = "";
-        }
-        set_error_handler(null);
-        for ($j = 0; $j < $t; $j++) {
-            $quoted = preg_quote($terms[$j], '/');
-            $nt = @preg_match_all("/$b" . $quoted . "$b/", $formatted_doc,
-                $matches); //$matches included for backwards compatibility
-            $wc[$j] = $nt * $idf[$j];
-            if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
-                $wc[$j] = 0;
-            }
-        }
-        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
-        /* Calculate centroid */
-        arsort($wc);
-        $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
-        /* Initializing centroid weight array by 0 */
-        $wc = array_fill(0, $t, 0);
-        /* Word cloud */
-        $i = 0;
-        $word_cloud = [];
-        foreach ($centroid as $key => $value) {
-            $wc[$key] = $value;
-            if ($i < self::WORD_CLOUD_LEN) {
-                $word_cloud[$i] = $terms[$key];
-            }
-            $i++;
-        }
+        list($word_cloud, $wc, $idf) =
+            self::wordCloudAndCountsFromTermsSentences($terms, $sentences,
+            $lang);
         if (strlen($page) < PageProcessor::$max_description_len
             || $n == 1) {
             //if input short only use above to get a word cloud
@@ -212,15 +115,16 @@ class CentroidSummarizer extends Summarizer
         }
         arsort($sim);
         /* Getting how many sentences should be there in summary */
-        $top = self::summarySentenceCount($out_sentences, $sim);
+        $top = self::numSentencesForSummary($out_sentences, $sim);
         $sum_array = [];
         $sum_array = array_keys(array_slice($sim, 0, $top - 1, true));
         sort($sum_array);
         $summary = '';
+        $eos = ($lang == 'hi') ? "।" : "."; //default end of sentence symbol
         foreach ($sum_array as $key) {
             $compressed_sentence =
                 PhraseParser::compressSentence($out_sentences[$key], $lang);
-            $summary .= $compressed_sentence . ". ";
+            $summary .= rtrim($compressed_sentence, $eos) . "$eos ";
         }
         if (self::OUTPUT_TO_FILE) {
             $output_file_contents = "";
@@ -228,7 +132,8 @@ class CentroidSummarizer extends Summarizer
                 $compressed_sentence =
                     PhraseParser::compressSentence($out_sentences[$key],
                     $lang);
-                $output_file_contents .= $compressed_sentence . ".\r\n";
+                $output_file_contents .= rtrim($compressed_sentence,
+                    $eos) . "$eos\r\n";
             }
             file_put_contents(C\WORK_DIRECTORY . self::OUTPUT_FILE_PATH,
                 $output_file_contents);
@@ -245,7 +150,7 @@ class CentroidSummarizer extends Summarizer
      *      similarity score to centroid (sorted from highest to lowest score).
      * @return int number of sentences
      */
-    public static function summarySentenceCount($sentences, $sim)
+    public static function numSentencesForSummary($sentences, $sim)
     {
         $top = null;
         $count = 0;
@@ -258,101 +163,4 @@ class CentroidSummarizer extends Summarizer
         }
         return $top;
     }
-    /**
-     * Breaks any content into sentences by splitting it on spaces or carriage
-     *   returns
-     * @param string $content complete page.
-     * @return array array of sentences from that content.
-     */
-    public static function getSentences($content)
-    {
-        $lines = preg_split(
-            '/(\.|\||\!|\?|!|?|。)\s+|(\n|\r)(\n|\r)+|\s{5}/ui',
-            $content, 0, PREG_SPLIT_NO_EMPTY);
-        $out = [];
-        $sentence = "";
-        $count = 0;
-        $theshold_factor = 1;
-        foreach ($lines as $line) {
-            $sentence .= " " . $line;
-            if (strlen($line) < 2) {
-                continue;
-            }
-            if ($count < self::LONG_SENTENCE_THRESHOLD ||
-                strlen($sentence) > $theshold_factor *
-                    self::LONG_SENTENCE_LEN){
-                $sentence = preg_replace("/\s+/ui", " ", $sentence);
-                $out[] = trim($sentence);
-                $count++;
-                $theshold_factor =
-                    pow(1.5, floor($count/self::LONG_SENTENCE_THRESHOLD));
-            }
-            $sentence = "";
-        }
-        if (trim($sentence) != "") {
-            $sentence = preg_replace("/\s+/ui", " ", $sentence);
-            $out[] = trim($sentence);
-        }
-        return $out;
-    }
-    /**
-     * Formats the sentences to remove all characters except words,
-     *   digits and spaces
-     * @param string $sent complete page.
-     * @return string formatted sentences.
-     */
-    public static function formatSentence($sent)
-    {
-        $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u',
-            ' ', mb_strtolower($sent)));
-        return $sent;
-    }
-    /**
-     * Formats the document to remove carriage returns, hyphens and digits
-     * as we will not be using digits in word cloud.
-     * The formatted document generated by this function is only used to
-     * compute centroid.
-     * @param string $content formatted page.
-     * @return string formatted document.
-     */
-    public static function formatDoc($content)
-    {
-        $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/ui'];
-        $content = preg_replace($substitute, ' ', mb_strtolower($content));
-        return $content;
-    }
-    /**
-     * This function does an additional processing on the page
-     * such as removing all the tags from the page
-     * @param string $page complete page.
-     * @return string processed page.
-     */
-    public static function pageProcessing($page)
-    {
-        $substitutions = ['@<script[^>]*?>.*?</script>@si',
-            '/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
-            '@<style[^>]*?>.*?</style>@si', '/[\^\(\)]/',
-            '/\[(.*?)\]/', '/\t\n/'
-        ];
-        $page = preg_replace($substitutions, ' ', $page);
-        $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page);
-        $changed = false;
-        if ($new_page != $page) {
-            $changed = true;
-            $page = $new_page;
-        }
-        $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|".
-            "p|address|section)\s*\>/", "\n\n", $page);
-        $page = preg_replace("/\<a/", " <a", $page);
-        $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/", " ", $page);
-        $page = preg_replace("/\</", " <", $page);
-        $page = strip_tags($page);
-        if ($changed) {
-            $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page);
-        }
-        $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page);
-        $page = preg_replace("/\n\n\n+/", "\n\n", $page);
-        $page = preg_replace('/\s\s+/', ' ', $page);
-        return $page;
-    }
 }
diff --git a/src/library/summarizers/CentroidWeightedSummarizer.php b/src/library/summarizers/CentroidWeightedSummarizer.php
index 2b5892206..7f384ab88 100644
--- a/src/library/summarizers/CentroidWeightedSummarizer.php
+++ b/src/library/summarizers/CentroidWeightedSummarizer.php
@@ -45,29 +45,6 @@ use seekquarry\yioop\library\processors\PageProcessor;
  */
 class CentroidWeightedSummarizer extends Summarizer
 {
-    /**
-     * Number of bytes in a sentence before it is considered long
-     * We use strlen rather than mbstrlen. This might actually be
-     * a better metric of the potential of a sentence to have info.
-     */
-    const LONG_SENTENCE_LEN = 50;
-    /**
-     * Number of sentences in a document before only consider longer
-     * sentences in centroid
-     */
-    const LONG_SENTENCE_THRESHOLD = 100;
-    /**
-     * Number of distinct terms to use in generating summary
-     */
-    const MAX_DISTINCT_TERMS = 1000;
-    /**
-     * Number of words in word cloud
-     */
-    const WORD_CLOUD_LEN = 5;
-    /**
-     * Number of nonzero centroid components
-     */
-    const CENTROID_COMPONENTS = 50;
     /**
      * whether to output the results to the disk or not
      */
@@ -93,272 +70,31 @@ class CentroidWeightedSummarizer extends Summarizer
         /* Format the document to remove characters other than periods and
            alphanumerics.
         */
-        $page = mb_strtolower($page);
         $formatted_doc = self::formatDoc($page);
         /* Splitting into sentences */
         $out_sentences = self::getSentences($page);
         $stop_obj = PhraseParser::getTokenizer($lang);
         $sentences = self::removeStopWords($out_sentences, $stop_obj);
-        $sentence_array = self::splitSentences($sentences, $lang);
-        $terms = $sentence_array[0];
-        $tf_per_sentence = $sentence_array[1];
-        $tf_per_sentence_normalized = $sentence_array[2];
+        list($terms, $tf_per_sentence, $tf_per_sentence_normalized) =
+            self::computeTermsAndStatistics($sentences, $lang);
         $tf_average_sentence =
             self::getAverageSentence($tf_per_sentence_normalized);
         $tf_dot_product_per_sentence =
             self::getDotProduct($tf_per_sentence_normalized,
             $tf_average_sentence);
-        usort($tf_dot_product_per_sentence, 'self::sortInAscendingOrder');
+        usort($tf_dot_product_per_sentence, function($a, $b) {
+            return $b > $a ? 1 : -1;
+        });
         $summary = self::getSummaryFromProducts($tf_dot_product_per_sentence,
             $out_sentences, $lang);
-        $n = count($out_sentences);
         $terms = array_filter($terms);
-        $terms_counts = array_count_values($terms);
-        arsort($terms_counts);
-        $terms_counts = array_slice($terms_counts, 0,
-            self::MAX_DISTINCT_TERMS);
-        $terms = array_unique(array_keys($terms_counts));
-        $t = count($terms);
-        if ($t == 0) {
-            return ["", ""];
-        }
-        /* Initialize Nk [Number of sentences the term occurs] */
-        $nk = [];
-        $nk = array_fill(0, $t, 0);
-        $nt = [];
-        /* Count TF for each word */
-        for ($i = 0; $i < $n; $i++) {
-            for ($j = 0; $j < $t; $j++) {
-                if (strpos($sentences[$i], $terms[$j]) !== false) {
-                    $nk[$j]++;
-                }
-            }
-        }
-        /* Calculate weights of each term for every sentence */
-        $w = [];
-        $idf = [];
-        $idf_temp = 0;
-        for ($k = 0; $k < $t; $k++) {
-            if ($nk[$k] == 0) {
-                $idf_temp = 0;
-                $tmp = 0;
-            } else {
-                $idf_temp = $n / $nk[$k];
-                $tmp = log($idf_temp);
-            }
-            $idf[$k] = $tmp;
-        }
-        /* Count TF for finding centroid */
-        $wc = [];
-        $max_nt = -1;
-        $b = "\b";
-        if (in_array($lang, ["zh-CN", "ja", "ko"])) {
-            $b = "";
-        }
-        set_error_handler(null);
-        for ($j = 0; $j < $t; $j++) {
-            $quoted = preg_quote($terms[$j]);
-            $nt = @preg_match_all("/$b" . $quoted . "$b/", $formatted_doc,
-                $matches); //$matches included for backwards compatibility
-            $wc[$j] = $nt * $idf[$j];
-            if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
-                $wc[$j] = 0;
-            }
-        }
-        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
-        /* Calculate centroid */
-        arsort($wc);
-        $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
-        /* Initializing centroid weight array by 0 */
-        $wc = array_fill(0, $t, 0);
-        /* Word cloud */
-        $i = 0;
-        $word_cloud = [];
-        foreach ($centroid as $key => $value) {
-            $wc[$key] = $value;
-            if ($i < self::WORD_CLOUD_LEN) {
-                $word_cloud[$i] = $terms[$key];
-            }
-            $i++;
-        }
+        list($word_cloud,) =
+            self::wordCloudAndCountsFromTermsSentences($terms, $sentences,
+            $lang);
         /* Summary of text summarization */
         return [$summary, $word_cloud];
     }
-    /**
-     * Calculates how many sentences to put in the summary to match the
-     * MAX_DESCRIPTION_LEN.
-     *
-     * @param array $sentences sentences in doc in their original order
-     * @param array $sim associative array of sentence-number-in-doc =>
-     *      similarity score to centroid (sorted from highest to lowest score).
-     * @return int number of sentences
-     */
-    public static function summarySentenceCount($sentences, $sim)
-    {
-        $top = null;
-        $count = 0;
-        foreach ($sim as $key => $value)
-        {
-            if ($count < PageProcessor::$max_description_len) {
-                $count += strlen($sentences[$key]);
-                $top++;
-            }
-        }
-        return $top;
-    }
-    /**
-     * Breaks any content into sentences by splitting it on spaces or carriage
-     *   returns
-     * @param string $content complete page.
-     * @return array array of sentences from that content.
-     */
-    public static function getSentences($content)
-    {
-        $lines = preg_split(
-            '/(\.|\||\!|\?|!|?|。)\s+|(\n|\r)(\n|\r)+|\s{5}/',
-            $content, 0, PREG_SPLIT_NO_EMPTY);
-        $out = [];
-        $sentence = "";
-        $count = 0;
-        $theshold_factor = 1;
-        $threshold = self::LONG_SENTENCE_THRESHOLD;
-        foreach ($lines as $line) {
-            $sentence .= " " . $line;
-            if (strlen($line) < 2) {
-                continue;
-            }
-            if ($count < $threshold ||
-                strlen($sentence) > $theshold_factor *
-                    self::LONG_SENTENCE_LEN) {
-                $sentence = preg_replace("/\s+/ui", " ", $sentence);
-                $out[] = trim($sentence);
-                $count++;
-                $theshold_factor =
-                    pow(1.5, floor($count/$threshold));
-            }
-            $sentence = "";
-        }
-        if (trim($sentence) != "") {
-            $sentence = preg_replace("/\s+/ui", " ", $sentence);
-            $out[] = trim($sentence);
-        }
-        return $out;
-    }
-    /**
-     * Formats the sentences to remove all characters except words,
-     *   digits and spaces
-     * @param string $sent complete page.
-     * @return string formatted sentences.
-     */
-    public static function formatSentence($sent)
-    {
-        $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u',
-            ' ', $sent));
-        return $sent;
-    }
-    /**
-     * Formats the document to remove carriage returns, hyphens and digits
-     * as we will not be using digits in word cloud.
-     * The formatted document generated by this function is only used to
-     * compute centroid.
-     * @param string $content formatted page.
-     * @return string formatted document.
-     */
-    public static function formatDoc($content)
-    {
-        $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/'];
-        $content = preg_replace($substitute, ' ', $content);
-        return $content;
-    }
-    /**
-     * This function does an additional processing on the page
-     * such as removing all the tags from the page
-     * @param string $page complete page.
-     * @return string processed page.
-     */
-    public static function pageProcessing($page)
-    {
-        $substitutions = ['@<script[^>]*?>.*?</script>@si',
-            '/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
-            '@<style[^>]*?>.*?</style>@si', '/[\^\(\)]/',
-            '/\[(.*?)\]/', '/\t\n/'
-        ];
-        $page = preg_replace($substitutions, ' ', $page);
-        $page = preg_replace('/\s{2,}/', ' ', $page);
-        $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page);
-        $changed = false;
-        if ($new_page != $page) {
-            $changed = true;
-            $page = $new_page;
-        }
-        $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|".
-            "p|address|section)\s*\>/", "\n\n", $page);
-        $page = preg_replace("/\<a/", " <a", $page);
-        $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/", " ", $page);
-        $page = preg_replace("/\</", " <", $page);
-        $page = strip_tags($page);
-        if ($changed) {
-            $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page);
-        }
-        $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page);
-        $page = preg_replace("/\n\n\n+/", "\n\n", $page);
-        return $page;
-    }
-    /**
-     * Calculates an array with key terms and values their frequencies
-     * based on a supplied sentence
-     *
-     * @param array $terms the list of all terms in the doc
-     * @param array $sentence the sentences in the doc
-     * @return array a two dimensional array where the word is the key and
-     *      the frequency is the value
-     */
-    public static function getTermFrequencies($terms, $sentence)
-    {
-        $t = count($terms);
-        $nk = [];
-        $nk = array_fill(0, $t, 0);
-        $nt = [];
-        for ($j = 0; $j < $t; $j++) {
-            $nk[$j] += preg_match_all("/\b" . preg_quote($terms[$j],'/') .
-                "\b/iu", $sentence, $matches);
-        }
-        $term_frequencies = [];
-        for ($i = 0; $i <  count($nk); $i++ ) {
-            $term_frequencies[$terms[$i]] = $nk[$i];
-        }
-        return $term_frequencies;
-    }
-    /**
-     * Normalize the term frequencies based on the sum of the squares.
-     * @param array $term_frequencies the array with the terms as the key
-     *      and its frequency as the value
-     * @return array array of term frequencies normalized
-     */
-    public static function normalizeTermFrequencies($term_frequencies)
-    {
-        $sum_of_squares = 0;
-        $result_sum = 0;
-        if (count($term_frequencies) == 0) {
-            $result = [];
-        } else {
-            foreach ($term_frequencies as $k => $v) {
-                $sum_of_squares += ($v * $v);
-            }
-            $square_root = sqrt($sum_of_squares);
-            foreach ($term_frequencies as $k => $v) {
-                if ($square_root == 0) {
-                    $result[$k] = 0;
-                } else {
-                    $result[$k] = ($v / $square_root);
-                }
-            }
-            foreach ($result as $k => $v) {
-                $result_sum += $v;
-            }
-        }
-        return $result;
-    }
+
     /**
      * Get the average sentence by adding up the values from each column and
      * dividing it by the rows in the array.
@@ -404,55 +140,25 @@ class CentroidWeightedSummarizer extends Summarizer
             $result = [];
             $count = 0;
             foreach ($term_frequencies_normalized as $k => $v) {
-                $tempResult = 0;
+                $temp_result = 0;
                 foreach ($v as $l => $w) {
-                    if (@array_key_exists($l, $average_sentence)) {
-                        $tempResult = $tempResult +
-                            ($average_sentence[$l] * $w);
+                    if (!empty($average_sentence[$l])) {
+                        $temp_result += ($average_sentence[$l] * $w);
                     }
                 }
-                $result[$count] = $tempResult;
+                $result[$count] = $temp_result;
                 $count++;
             }
             return $result;
     }
     /**
-     * Compare the two values and return if b is greater than a
-     * @param string $a the first value to compare
-     * @param string $b the second value to compare
-     * @return boolean if b is greater than a
-     */
-    public static function sortInAscendingOrder($a, $b)
-    {
-        return $b > $a ? 1 : -1;
-    }
-    /**
-     * Returns a new array of sentences without the stop words
-     * @param array $sentences the array of sentences to process
-     * @param object $stop_obj the class that has the stopworedRemover method
-     * @return array a new array of sentences without the stop words
-     */
-    public static function removeStopWords($sentences, $stop_obj)
-    {
-        $n = count($sentences);
-        $result = [];
-        if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
-            for ($i = 0; $i < $n; $i++ ) {
-                $result[$i] = $stop_obj->stopwordsRemover(
-                    self::formatDoc($sentences[$i]));
-             }
-        } else {
-            $result = $sentences;
-        }
-        return $result;
-    }
-    /**
-     * Split up the sentences and return an array with all of the needed parts
+     * Splits sentences into terms and returns [array of terms,
+     *  array of term frequencies, array of normalized term frequencies]
      * @param array $sentences the array of sentences to process
      * @param string $lang the current locale
      * @return array an array with all of the needed parts
      */
-    public static function splitSentences($sentences, $lang)
+    public static function computeTermsAndStatistics($sentences, $lang)
     {
         $result = [];
         $terms = [];
@@ -468,10 +174,7 @@ class CentroidWeightedSummarizer extends Summarizer
                 self::normalizeTermFrequencies($tf_per_sentence[$tf_index]);
             $tf_index++;
         }
-        $result[0] = $terms;
-        $result[1] = $tf_per_sentence;
-        $result[2] = $tf_per_sentence_normalized;
-        return $result;
+        return [$terms, $tf_per_sentence, $tf_per_sentence_normalized];
     }
     /**
      * Split up the sentences and return an array with all of the needed parts
@@ -489,9 +192,9 @@ class CentroidWeightedSummarizer extends Summarizer
         $result = "";
         $result_length = 0;
         $i = 0;
+        $eos = ($lang == 'hi') ? "।" : "."; //default end of sentence symbol
         foreach ($tf_dot_product_per_sentence as $k => $v) {
-            $sentence = PhraseParser::compressSentence($sentences[$k],
-                $lang);
+            $sentence = PhraseParser::compressSentence($sentences[$k], $lang);
             if ($result_length + strlen($sentence) >
                 PageProcessor::$max_description_len) {
                 break;
@@ -499,15 +202,15 @@ class CentroidWeightedSummarizer extends Summarizer
                 $result_length += strlen($sentence);
                 if ($i == 0) {
                     $i = 1;
-                    $result = $sentence . ". ";
+                    $result = rtrim($sentence, $eos) . "$eos ";
                     if (self::OUTPUT_TO_FILE) {
-                        $output_file_contents = $sentence . ". ";
+                        $output_file_contents = $sentence . "$eos ";
                     }
                 } else {
-                    $result .= " " . $sentence . ". ";
+                    $result .= " " . rtrim($sentence, $eos) . "$eos ";
                     if (self::OUTPUT_TO_FILE) {
                         $output_file_contents = $output_file_contents .
-                            "\r\n" . $sentence . ". ";
+                            "\r\n" . rtrim($sentence, $eos) . "$eos ";
                     }
                 }
             }
diff --git a/src/library/summarizers/GraphBasedSummarizer.php b/src/library/summarizers/GraphBasedSummarizer.php
index 87b27a562..408222d89 100644
--- a/src/library/summarizers/GraphBasedSummarizer.php
+++ b/src/library/summarizers/GraphBasedSummarizer.php
@@ -76,8 +76,8 @@ class GraphBasedSummarizer extends Summarizer
         $page = self::pageProcessing($page);
         $formatted_doc = self::formatDoc($page);
         //not filtering non-ascii characters
-        $sentences = self::getSentences($page . " ", true);
-        $sentences = self::removeStopWords($sentences, $lang);
+        $sentences_with_punctuation = self::getSentences($page);
+        $sentences = self::removeStopWords($sentences_with_punctuation, $lang);
         $sentences = self::removePunctuation($sentences);
         $sentences = PhraseParser::stemTermsK($sentences, $lang, true);
         $terms = self::getTerms($sentences, $lang);
@@ -87,10 +87,9 @@ class GraphBasedSummarizer extends Summarizer
         $adjacency = self::computeAdjacency($term_frequencies_normalized,
             $sentences, $lang, $unmodified_doc);
         $p = self::getSentenceRanks($adjacency);
-        $sentences_with_punctuation = self::getSentences($page . " ", true);
         $summary = self::getFinalSummary($sentences_with_punctuation, $p,
             $lang);
-        return [$summary, []];
+        return [$summary, self::wordCloudFromSummary($summary,  $lang)];
     }
     /**
      * Given as array of sentences and an array of their importance between 0
@@ -255,49 +254,6 @@ class GraphBasedSummarizer extends Summarizer
         }
         return $sentences;
      }
-    /**
-     * Remove the stop words from the array of sentences
-     * @param array $sentences the sentences in the doc
-     * @param string $lang locale tag for stemming
-     * @return array the array of sentences with the stop words removed
-     */
-    public static function removeStopWords($sentences, $lang)
-    {
-        $n = count($sentences);
-        $stop_obj = PhraseParser::getTokenizer($lang);
-        if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
-            for ($i = 0; $i < $n; $i++ ) {
-                $sentences[$i] = $stop_obj->stopwordsRemover(
-                    self::formatDoc($sentences[$i]));
-             }
-        }
-        return $sentences;
-    }
-    /**
-     * Calculate the term frequencies.
-     * @param array $terms the list of all terms in the doc
-     * @param array $sentences the sentences in the doc
-     * @return array a two dimensional array where the word is the key and
-     *      the frequency is the value
-     */
-    public static function getTermFrequencies($terms, $sentences)
-    {
-        $t = count($terms);
-        $n = count($sentences);
-        $nk = [];
-        $nk = array_fill(0, $t, 0);
-        $nt = [];
-       for ($j = 0; $j < $t; $j++) {
-            for ($i = 0; $i < $n; $i++) {
-                $nk[$j] += preg_match_all("/\b" . $terms[$j] . "\b/iu",
-                    $sentences[$i], $matches);
-            }
-        }
-        for ($i = 0; $i <  count($nk); $i++ ) {
-            $term_frequencies[$terms[$i]] = $nk[$i];
-        }
-        return $term_frequencies;
-    }
     /**
      * Get the terms from an array of sentences
      * @param array $sentences the sentences in the doc
@@ -323,62 +279,6 @@ class GraphBasedSummarizer extends Summarizer
         }
         return $terms;
     }
-    /**
-     * Breaks any content into sentences by splitting it on spaces or carriage
-     *   returns
-     * @param string $content complete page.
-     * @param boolean $keep_punctuation whether to keep the punctuation or not.
-     * @return array array of sentences from that content.
-     */
-    public static function getSentences($content, $keep_punctuation)
-    {
-        $result = [];
-        if ($keep_punctuation) {
-            $sentences =
-                preg_split('/(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)/u',
-                $content, 0, PREG_SPLIT_NO_EMPTY);
-            $n = count($sentences);
-            for ($i = 0; $i < $n; $i++ ) {
-                $sentences[$i] = trim($sentences[$i]);
-            }
-            $result = array_filter($sentences);
-        } else {
-            $sentences = preg_split(
-                '/(\.|\||\!|\?|!|?|。)\s+|(\n|\r)(\n|\r)+|\s{5}/u',
-                $content, 0, PREG_SPLIT_NO_EMPTY);
-            $result = array_filter($sentences);
-        }
-        return $result;
-    }
-    /**
-     * Normalize the term frequencies based on the sum of the squares.
-     * @param array $term_frequencies the array with the terms as the key
-     *      and its frequency as the value
-     * @return array array of term frequencies normalized
-     */
-    public static function normalizeTermFrequencies($term_frequencies)
-    {
-        $sum_of_squares = 0;
-        $result_sum = 0;
-        foreach ($term_frequencies as $k => $v) {
-            $sum_of_squares += ($v * $v);
-        }
-        $square_root = sqrt($sum_of_squares);
-        if ($square_root == 0) {
-            $num_terms = count($term_frequencies);
-            if ($num_terms == 0) {
-                return false;
-            }
-            foreach ($term_frequencies as $k => $v) {
-                $result[$k] = 1/$num_terms;
-            }
-            return $result;
-        }
-        foreach ($term_frequencies as $k => $v) {
-            $result[$k] = ($v / $square_root);
-        }
-        return $result;
-    }
     /**
      * Calculate the distortion measure.
      * 1. Check each word in sentence1 to see if it exists in sentence2.
@@ -391,7 +291,7 @@ class GraphBasedSummarizer extends Summarizer
      * sum.
      * 3. Then check the sentence2 to find its not-common words
      * with sentence1, in case the word Y is not in sentence1,
-     * square the score of word Y and add to sum and increase
+     * square the score of word Y and add to sum and increase
      * the number of not-common words by one.
      * 4. At the end, calculate the distortion between sentence1 and
      * sentence2 by dividing sum by the number of not-common
@@ -407,8 +307,8 @@ class GraphBasedSummarizer extends Summarizer
         $term_frequencies, $lang, $doc)
     {
         $result = 0;
-        $first_sentence_split = preg_split('/ +/u', $first_sentence);
-        $second_sentence_split = preg_split('/ +/u', $second_sentence);
+        $first_sentence_split = preg_split('/\s+/u', $first_sentence);
+        $second_sentence_split = preg_split('/\s+/u', $second_sentence);
         $sum = 0;
         $non_common_words = 0;
         $n = count($first_sentence_split);
@@ -462,64 +362,4 @@ class GraphBasedSummarizer extends Summarizer
         }
         return $result;
     }
-    /**
-     * Formats the sentences to remove all characters except words,
-     *   digits and spaces
-     * @param string $sent complete page.
-     * @return string formatted sentences.
-     */
-    public static function formatSentence($sent)
-    {
-        $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u',
-            ' ', mb_strtolower($sent)));
-        return $sent;
-    }
-    /**
-     * Formats the document to remove carriage returns, hyphens and digits
-     * as we will not be using digits in word cloud.
-     * The formatted document generated by this function is only used to
-     * compute centroid.
-     * @param string $content formatted page.
-     * @return string formatted document.
-     */
-    public static function formatDoc($content)
-    {
-        $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/u'];
-        $content = preg_replace($substitute, ' ', mb_strtolower($content));
-        return $content;
-    }
-    /**
-     * This function does an additional processing on the page
-     * such as removing all the tags from the page
-     * @param string $page complete page.
-     * @return string processed page.
-     */
-    public static function pageProcessing($page)
-    {
-        $substitutions = ['@<script[^>]*?>.*?</script>@si',
-            '/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
-            '@<style[^>]*?>.*?</style>@si', '/[\^\(\)]/',
-            '/\[(.*?)\]/', '/\t\n/'
-        ];
-        $page = preg_replace($substitutions, ' ', $page);
-        $page = preg_replace('/\s{2,}/u', ' ', $page);
-        $new_page = preg_replace("/\<br\s*(\/)?\s*\>/u", "\n", $page);
-        $changed = false;
-        if ($new_page != $page) {
-            $changed = true;
-            $page = $new_page;
-        }
-        $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|".
-            "p|address|section)\s*\>/u", "\n\n", $page);
-        $page = preg_replace("/\<a/u", " <a", $page);
-        $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/u", " ", $page);
-        $page = preg_replace("/\</u", " <", $page);
-        $page = strip_tags($page);
-        if ($changed) {
-            $page = preg_replace("/(\r?\n[\t| ]*){2}/u", "\n", $page);
-        }
-        $page = preg_replace("/(\r?\n[\t| ]*)/u", "\n", $page);
-        $page = preg_replace("/\n\n\n+/u", "\n\n", $page);
-        return $page;
-    }
 }
diff --git a/src/library/summarizers/ScrapeSummarizer.php b/src/library/summarizers/ScrapeSummarizer.php
index e15353b3a..e179de9ca 100644
--- a/src/library/summarizers/ScrapeSummarizer.php
+++ b/src/library/summarizers/ScrapeSummarizer.php
@@ -64,7 +64,9 @@ class ScrapeSummarizer extends Summarizer
      */
     public static function getSummary($dom, $page, $lang)
     {
-        return [self::description($dom, $page, $lang), []];
+        $summary = self::description($dom, $page, $lang);
+        $word_cloud = self::wordCloudFromSummary($summary,  $lang);
+        return [$summary, $word_cloud];
     }
     /**
      * Returns descriptive text concerning a webpage based on its document
diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php
index 8d6f02796..10b1b3b6f 100644
--- a/src/library/summarizers/Summarizer.php
+++ b/src/library/summarizers/Summarizer.php
@@ -30,6 +30,9 @@
  */
 namespace seekquarry\yioop\library\summarizers;

+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\PhraseParser;
+
 /** For Yioop global defines used by subclasses*/
 require_once __DIR__."/../../configs/Config.php";
 /**
@@ -40,6 +43,18 @@ require_once __DIR__."/../../configs/Config.php";
  */
 class Summarizer
 {
+    /**
+     * Number of distinct terms to use in generating summary
+     */
+    const MAX_DISTINCT_TERMS = 1000;
+    /**
+     * Number of nonzero centroid components
+     */
+    const CENTROID_COMPONENTS = 50;
+    /**
+     * Number of words in word cloud
+     */
+    const WORD_CLOUD_LEN = 5;
     /**
      * The value to represent the weight for class one tags.
      */
@@ -168,4 +183,235 @@ class Summarizer
         }
         return $result;
     }
+    /**
+     * Breaks any content into sentences by splitting it on spaces or carriage
+     *   returns
+     * @param string $content complete page.
+     * @return array array of sentences from that content.
+     */
+    public static function getSentences($content)
+    {
+        $content = preg_replace([ "/\n+(\.| |\t)+/u",
+            "/((\p{L}|\p{N}|\)|\}|\]){5,}\s?(\.|\|।|\!|\?|!|?|。))\s+/u",
+            "/।/u", "/(\n|\r)(\n|\r)+/", "/।./u"], ["\n", "$1.\n", "।\n\n",
+            "..\n", "।"], $content);
+        $lines = preg_split('/\.\n/', $content, 0, PREG_SPLIT_NO_EMPTY);
+        $lines = preg_replace("/\s+/", " ", $lines);
+        return $lines;
+    }
+    /**
+     * Formats the sentences to remove all characters except words,
+     *   digits and spaces
+     * @param string $sentence complete page.
+     * @return string formatted sentences.
+     */
+    public static function formatSentence($sentence)
+    {
+        $sentence = trim(preg_replace('/[^\p{L}\p{N}\s]+/u',
+            ' ', mb_strtolower($sentence)));
+        return $sentence;
+    }
+    /**
+     * Formats the document to remove carriage returns, hyphens and digits
+     * as we will not be using digits in word cloud.
+     * The formatted document generated by this function is only used to
+     * compute centroid.
+     * @param string $content formatted page.
+     * @return string formatted document.
+     */
+    public static function formatDoc($content)
+    {
+        $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/\.+/'];
+        $content = preg_replace($substitute, ' ', $content);
+        return $content;
+    }
+    /**
+     * This function does an additional processing on the page
+     * such as removing all the tags from the page
+     * @param string $page complete page.
+     * @return string processed page.
+     */
+    public static function pageProcessing($page)
+    {
+        $substitutions = ['@<script[^>]*?>.*?</script>@si',
+            '/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
+            '@<style[^>]*?>.*?</style>@si', '/\t\n/', '/\s{2,}/'
+        ];
+        $page = preg_replace($substitutions, ' ', $page);
+        $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page);
+        $changed = false;
+        if ($new_page != $page) {
+            $changed = true;
+            $page = $new_page;
+        }
+        $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|".
+            "p|address|section)\s*\>/iu", "\n\n", $page);
+        $page = preg_replace("/\<a/iu", " <a", $page);
+        $page = html_entity_decode($page);
+        $page = preg_replace("/\</", " <", $page);
+        $page = strip_tags($page);
+        if ($changed) {
+            $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page);
+        }
+        $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page);
+        $page = preg_replace("/\n\n\n+/", "\n\n", $page);
+        return $page;
+    }
+    /**
+     * Returns a new array of sentences without the stop words
+     * @param array $sentences the array of sentences to process
+     * @param object $stop_obj the class that has the stopwordsRemover method
+     * @return array a new array of sentences without the stop words
+     */
+    public static function removeStopWords($sentences, $stop_obj)
+    {
+        if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
+            $results = $stop_obj->stopwordsRemover($sentences);
+        } else {
+            $results = $sentences;
+        }
+        return $results;
+    }
+    /**
+     * Calculates an array with key terms and values their frequencies
+     * based on a supplied sentence
+     *
+     * @param array $terms the list of all terms in the doc
+     * @param array $sentence the sentences in the doc
+     * @return array a two dimensional array where the word is the key and
+     *      the frequency is the value
+     */
+    public static function getTermFrequencies($terms, $sentence)
+    {
+        $t = count($terms);
+        $nk = array_fill(0, $t, 0);
+        for ($j = 0; $j < $t; $j++) {
+            $nk[$j] += preg_match_all("/\b" . preg_quote($terms[$j], '/') .
+                "\b/iu", $sentence);
+        }
+        return array_combine($terms, $nk);
+    }
+    /**
+     * Normalize the term frequency vector by dividing its entries by its
+     * L_2 norm.
+     * @param array $term_frequencies the array with the terms as the key
+     *      and its frequency as the value
+     * @return array array of term frequencies normalized
+     */
+    public static function normalizeTermFrequencies($term_frequencies)
+    {
+        $sum_of_squares = 0;
+        $result_sum = 0;
+        if (count($term_frequencies) == 0) {
+            return [];
+        }
+        foreach ($term_frequencies as $k => $v) {
+            $sum_of_squares += ($v * $v);
+        }
+        $square_root = sqrt($sum_of_squares);
+        foreach ($term_frequencies as $k => $v) {
+            if ($square_root == 0) {
+                $result[$k] = 0;
+            } else {
+                $result[$k] = ($v / $square_root);
+            }
+        }
+        return $result;
+    }
+    /**
+     * Computes a word cloud: removes stop words from $summary, segments it into terms, and returns the five most frequent terms (or the top five of $term_frequencies if supplied).
+     */
+    public static function wordCloudFromSummary($summary,  $lang,
+        $term_frequencies = null)
+    {
+        if ($term_frequencies == null) {
+            $stop_obj = PhraseParser::getTokenizer($lang);
+            if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
+                $summary = $stop_obj->stopwordsRemover($summary);
+            }
+            $summary = mb_strtolower($summary);
+            $terms = PhraseParser::segmentSegment($summary, $lang);
+            $term_frequencies = self::getTermFrequencies($terms, $summary);
+        }
+        arsort($term_frequencies);
+        $top5 = array_slice($term_frequencies, 0 , 5);
+        return array_keys($top5);
+    }
+    /**
+     * From a list of $terms and $sentences computes idf-weighted term scores, takes the top CENTROID_COMPONENTS as a centroid, and returns [word cloud of up to WORD_CLOUD_LEN terms, centroid weight array, idf array].
+     */
+    public static function wordCloudAndCountsFromTermsSentences($terms,
+        $sentences, $lang)
+    {
+        $n = count($sentences);
+        $terms_counts = array_count_values($terms);
+        arsort($terms_counts);
+        $terms_counts = array_slice($terms_counts, 0,
+            self::MAX_DISTINCT_TERMS);
+        $terms = array_unique(array_keys($terms_counts));
+        $t = count($terms);
+        if ($t == 0) {
+            return ["", ""];
+        }
+        /* Initialize Nk [Number of sentences the term occurs] */
+        $nk = [];
+        $nk = array_fill(0, $t, 0);
+        $nt = [];
+        /* Count TF for each word */
+        for ($i = 0; $i < $n; $i++) {
+            for ($j = 0; $j < $t; $j++) {
+                if (strpos($sentences[$i], $terms[$j]) !== false) {
+                    $nk[$j]++;
+                }
+            }
+        }
+        /* Calculate weights of each term for every sentence */
+        $w = [];
+        $idf = [];
+        $idf_temp = 0;
+        for ($k = 0; $k < $t; $k++) {
+            if ($nk[$k] == 0) {
+                $idf_temp = 0;
+                $tmp = 0;
+            } else {
+                $idf_temp = $n / $nk[$k];
+                $tmp = log($idf_temp);
+            }
+            $idf[$k] = $tmp;
+        }
+        /* Count TF for finding centroid */
+        $wc = [];
+        $max_nt = -1;
+        $b = "\b";
+        if (in_array($lang, ["zh-CN", "ja", "ko"])) {
+            $b = "";
+        }
+        set_error_handler(null);
+        for ($j = 0; $j < $t; $j++) {
+            $quoted = preg_quote($terms[$j]);
+            $nt = @preg_match_all("/$b" . $quoted . "$b/", $formatted_doc,
+                $matches); //$matches included for backwards compatibility
+            $wc[$j] = $nt * $idf[$j];
+            if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
+                $wc[$j] = 0;
+            }
+        }
+        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
+        /* Calculate centroid */
+        arsort($wc);
+        $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
+        /* Initializing centroid weight array by 0 */
+        $wc = array_fill(0, $t, 0);
+        /* Word cloud */
+        $i = 0;
+        $word_cloud = [];
+        foreach ($centroid as $key => $value) {
+            $wc[$key] = $value;
+            if ($i < self::WORD_CLOUD_LEN) {
+                $word_cloud[$i] = $terms[$key];
+            }
+            $i++;
+        }
+        return [$word_cloud, $wc, $idf];
+    }
 }
diff --git a/src/locale/ar/resources/Tokenizer.php b/src/locale/ar/resources/Tokenizer.php
index 7709f64d5..cc3965737 100755
--- a/src/locale/ar/resources/Tokenizer.php
+++ b/src/locale/ar/resources/Tokenizer.php
@@ -61,12 +61,13 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * @param mixed $data either a string or an array of strings to remove
+     *      stop words from
+     * @return mixed $data with no stop words
      */
-    public static function stopwordsRemover($page)
+    public static function stopwordsRemover($data)
     {
-        $stop_words = [
+        static $stop_words = [
             "ا", "أ", "،", "عشر", "عدد", "عدة","عشرة",
             "عدم", "عام", "عاما", "عن", "عند", "عندما",
             "على", "عليه", "عليها", "زيارة", "سنة", "سنوات",
@@ -97,9 +98,12 @@ class Tokenizer
             "منها", "مليار", "لوكالة", "يكون", "يمكن",
             "مليون"
         ];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '',
-            $page);
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u';
+        }
+        $data = preg_replace($pattern, '', $data);
+        return $data;
     }
     /**
      * Computes the stem of an Arabic word
@@ -193,4 +197,4 @@ class Tokenizer
         }
         return $word;
     }
-}
\ No newline at end of file
+}
diff --git a/src/locale/de/resources/Tokenizer.php b/src/locale/de/resources/Tokenizer.php
index 545df2fbe..86d18de0c 100755
--- a/src/locale/de/resources/Tokenizer.php
+++ b/src/locale/de/resources/Tokenizer.php
@@ -112,17 +112,18 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * @param mixed $data either a string or an array of strings to remove
+     *      stop words from
+     * @return mixed $data with no stop words
      */
-    public static function stopwordsRemover($page)
+    public static function stopwordsRemover($data)
     {
-        $stop_words = ['aber', 'alle', 'allem', 'allen', 'aller', 'alles',
-            'als', 'as', 'also', 'am', 'an', 'ander', 'andere', 'anderem',
-            'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr',
-            'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis', 'bist',
-            'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das',
-            'daß', 'derselbe', 'derselben', 'denselben', 'desselben',
+        static $stop_words = ['aber', 'alle', 'allem', 'allen', 'aller',
+            'alles', 'als', 'as', 'also', 'am', 'an', 'ander', 'andere',
+            'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern',
+            'anderr', 'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis',
+            'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die',
+            'das', 'daß', 'derselbe', 'derselben', 'denselben', 'desselben',
             'demselben', 'dieselbe', 'dieselben', 'dasselbe', 'dazu', 'dein',
             'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer',
             'dessen', 'dich', 'dir', 'du', 'dies', 'diese', 'diesem', 'diesen',
@@ -152,9 +153,12 @@ class Tokenizer
             'wollte', 'würde', 'würden', 'zu', 'zum',
             'zur', 'zwar', 'zwischen'
             ];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '',
-            strtolower($page));
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u';
+        }
+        $data = preg_replace($pattern, '', $data);
+        return $data;
     }
     /**
      * Computes the stem of a German word
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index db7ecc92d..003ed54d8 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -73,22 +73,39 @@ class Tokenizer
      * can be answered using a question answer list
      * @string
      */
-    public static $question_marker = "qqq";
+    public static $question_token = "qqq";
     /**
-     * List of verb-like parts of speech that might appear in lexicon file
+     * List of adjective-like parts of speech that might appear in lexicon file
+     * @array
+     */
+    public static $adjective_type = ["JJ", "JJR", "JJS"];
+    /**
+     * List of adverb-like parts of speech that might appear in lexicon file
+     * @array
+     */
+    public static $adverb_type = ["RB", "RBR", "RBS"];
+    /**
+     * List of conjunction-like parts of speech that might appear in lexicon
+     * file
+     * @array
+     */
+    public static $conjunction_type = ["CC"];
+    /**
+     * List of determiner-like parts of speech that might appear in lexicon
+     * file
      * @array
      */
-    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"];
+    public static $determiner_type = ["DT", "PDT"];
     /**
      * List of noun-like parts of speech that might appear in lexicon file
      * @array
      */
-    public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"];
+    public static $noun_type = ["NN", "NNS", "NNP", "NNPS", "PRP"];
     /**
-     * List of adjective-like parts of speech that might appear in lexicon file
+     * List of verb-like parts of speech that might appear in lexicon file
      * @array
      */
-    public static $adjective_phrases = ["JJ", "JJR", "JJS"];
+    public static $verb_type = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"];
     /**
      * storage used in computing the stem
      * @var string
@@ -128,12 +145,13 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * @param mixed $data either a string or an array of strings to remove
+     *      stop words from
+     * @return mixed $data with no stop words
      */
-    public static function stopwordsRemover($page)
+    public static function stopwordsRemover($data)
     {
-        $stop_words = ['a','able','about','above','abst',
+        static $stop_words = ['a','able','about','above','abst',
         'accordance','according','based','accordingly','across','act',
         'actually','added','adj','affected','affecting','affects','after',
         'afterwards','again','against','ah','all','almost','alone','along',
@@ -239,9 +257,62 @@ class Tokenizer
         'without','wont','words','world',
         'would','wouldnt','www','x','y','yes','yet','you','youd','youll',
         'your','youre','yours','yourself','yourselves','youve','z','zero'];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '',
-            mb_strtolower($page));
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/\b(' . implode('|', $stop_words) . ')\b/ui';
+        }
+        $data = preg_replace($pattern, '', $data);
+        return $data;
+    }
+    /**
+     * This methods tries to handle punctuation in terms specific to the
+     * English language such as abbreviations.
+     *
+     * @param string& $string a string of words, etc which might involve such
+     *      terms
+     */
+    public function canonicalizePunctuatedTerms(&$string)
+    {
+        static $substitutions = [
+            //abbreviated titles
+            "/([mM]r|[mM]rs|[mM]s|[dD]r|dD]rs|[iI]n|".
+            "[cC]apt|[cC]pl|sS]t|fF]t|[vV]s)\.(\s*)(\p{Lu}|\Z)/u" => '$1 $3',
+            "/,(\p{Lu})\.(\s*)(\p{Lu}|\Z)/u" => '$1_ $3',
+            "/gimme/i" => "give me",
+            "/gonna/i" => "going to",
+            "/gotta/i" => "got to",
+            "/ma\'am/i" => "madam",
+            "/\'tis/i" => "it is",
+            "/\'twas/i" => "it was",
+            "/y\'all/i" => "you all",
+            "/I\'m/" => "I am",
+            "/I ain\'t/" => "I am not",
+            "/You ain\'t/i" => "you are not",
+            "/(why|who|which|when|what|this|that|there|how|" .
+                "it|everyone|one|he|she)\'s/i" => "$1 is",
+            "/is been/" => "has been",
+            "/is had/" => "has had",
+            // shan't
+            "/shan\'t/" => "shall not",
+            // contractions with not
+            "/\b(\p{L}+)\'d/" => ' $1 would',
+            // contractions with not
+            "/\b(\p{L}+)n\'t/" => ' $1 not',
+            // contractions with will
+            "/\b(\p{L}+)\'ll/" => ' $1 will',
+            // contractions with have
+            "/\b(\p{L}+)\'ve/" => ' $1 have',
+            // contractions with have
+            "/\b(\p{L}+)\'re/" => ' $1 are',
+            "/\b(\p{L}+)\'s/" => ' $1_pos_s'
+        ];
+        static $patterns = [];
+        static $replacements = [];
+        if (empty($patterns)) {
+            $patterns = array_keys($substitutions);
+            $replacements = array_values($substitutions);
+        }
+        $string = preg_replace($patterns, $replacements, $string);
     }
     /**
      * Takes a phrase and tags each term in it with its part of speech.
@@ -278,15 +349,15 @@ class Tokenizer
      */
    public static function tagTokenizePartOfSpeech($text)
     {
+        /* if run as own server dictionary only loaded once for all requests */
         static $dictionary = [];
         $lexicon_file = C\LOCALE_DIR . "/en_US/resources/lexicon.txt.gz";
         if (empty($dictionary)) {
             if (file_exists($lexicon_file)) {
-                $lines = gzfile($lexicon_file);
-                foreach ($lines as $line) {
-                    $tags = preg_split('/(\s+|\,)/u', trim($line));
-                    $dictionary[array_shift($tags)] = array_filter($tags);
-                }
+                $lex_data = gzdecode(file_get_contents($lexicon_file));
+                preg_match_all("/([^\s\,]+)[\s|\,]+([^\n]+)/u",
+                    $lex_data, $lex_parts);
+                $dictionary = array_combine($lex_parts[1], $lex_parts[2]);
             }
         }
         preg_match_all("/[\w\d]+/", $text, $matches);
@@ -307,11 +378,12 @@ class Tokenizer
             // remove trailing full stops
             $token = strtolower($token);
             if (!empty($dictionary[$token])) {
-                $tag_list = $dictionary[$token];
+                $tag_list = explode(" ", $dictionary[$token]);
                 $current['tag'] = $tag_list[0];
             }
             // Converts verbs after 'the' to nouns
-            if ($previous['tag'] == 'DT' && in_array($current['tag'], $verbs)){
+            if ($previous['token'] == 'the' &&
+                in_array($current['tag'], $verbs)){
                 $current['tag'] = 'NN';
             }
             // Convert noun to number if . appears
@@ -352,7 +424,7 @@ class Tokenizer
                 $result[$i - 1]['tag'] = 'JJ';
                 $current['tag'] = 'NN';
             }
-            /* If we get noun, and the second can be a verb,
+            /* If we have a noun, and the second can be a verb,
              * convert to verb; if noun noun and previous could be an
              * adjective convert to adjective
              */
@@ -426,12 +498,37 @@ class Tokenizer
      */
     public static function compressSentence($sentence_to_compress)
     {
-        $result = $sentence_to_compress;
-        $result = self::compressSentenceStep2($result);
-        $result = self::compressSentenceStep3($result);
-        $result = self::compressSentenceStep4($result);
-        $result = self::compressSentenceStep5($result);
-        return $result;
+        // patterns are based on From Back to Basics: CLASSY 2006 page 3:
+        static $delete_patterns = [
+            /*
+              2. We remove many adverbs and all conjunctions,
+              including phrases such as "As a matter of fact," and
+              "At this point," that occur at the start  of a sentence.
+             */
+            "/^At this point,?\b/i", "/^As a matter of fact,?\b/i",
+            "/^[a-zA-Z]*ly\b/i", "/(^and,?\b)|(^but,?\b)|(^for,?\b)|" .
+            "(^nor,?\b)|(^or,?\b)|(^so,?\b)|(^yet,?\b)/i",
+            /*
+              3. We remove a small selections of words that occur in the middle
+              of a sentence, such as ", however," and ", also," (not always
+              requiring the commas).
+             */
+            "/(;|,)?\s*(nevertheless|today|tomorrow|soon|instead|in practice|" .
+            "however|as a practical matter|further(more)?|".
+            "as such|also)\s*(;|,)?\s*/i",
+            /*
+              4. For DUC 2006, we added the removal of ages such as ", 51," or
+              ", aged 24,".
+             */
+            "/,\s?\d{1,3},/i", "/,\s?aged\s?\d{1,3},/i",
+            /*
+              6. We remove relative clause attributives (clauses beginning with
+              "who(m)", "which", "when", and "where") wherever possible.
+            */
+            "/(,\s?whom?[^,]*,)|(,\s?which[^,]*,)|" .
+            "(,\s?when[^,]*,)|(,\s?where[^,]*,)/i"
+        ];
+        return preg_replace($delete_patterns, " ", $sentence_to_compress);
     }
     /**
      * Takes a triplets array with subject, predicate, object fields with
@@ -451,60 +548,84 @@ class Tokenizer
             self::extractTripletByType($sub_pred_obj_triplets, "RAW");
         return $processed_triplets;
     }
+    /**
+     * Starting at $cur_node, consumes a maximal run of tokens in $tagged_phrase whose tags are in $type (allowing conjunctions between them after the first token), advances $cur_node past the run, and returns the concatenated token string (dropping a trailing conjunction).
+     */
+    public static function parseTypeList(&$cur_node, $tagged_phrase, $type)
+    {
+        $string = "";
+        $previous_string = "";
+        $previous_tag = "";
+        $start_node = $cur_node;
+        $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" :
+            trim($tagged_phrase[$cur_node]['tag']);
+        $allowed_conjuncts = [];
+        while ($next_tag && (in_array($next_tag, $type) ||
+            in_array($next_tag, $allowed_conjuncts))) {
+            $previous_string = $string;
+            $string .= " ". $tagged_phrase[$cur_node]['token'];
+            $cur_node++;
+            $allowed_conjuncts = self::$conjunction_type;
+            $previous_tag = $next_tag;
+            $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" :
+                trim($tagged_phrase[$cur_node]['tag']);
+        }
+        if (in_array($previous_tag, $allowed_conjuncts) && $start_node <
+            $cur_node) {
+            $cur_node--;
+            $string = $previous_string;
+        }
+        return $string;
+    }
     /**
      * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a determiner if possible
+     * parse-from position and builds a parse tree for an adjective if possible
      *
      * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["curnode" =>
+     * @param array $tree that consists of ["cur_node" =>
      *      current parse position in $tagged_phrase]
      * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "DT" a subarray with a token node for the determiner that was
+     *      "JJ" a subarray with a token node for the adjective that was
      *      parsed
      */
-    public static function extractDeterminer($tagged_phrase, $tree)
+    public static function parseAdjective($tagged_phrase, $tree)
     {
-        $cur_node = $tree['cur_node'];
-        if (isset($tagged_phrase[$cur_node]['tag']) &&
-            trim($tagged_phrase[$cur_node]['tag']) == "DT" ) {
-            $tree['DT'] = $tagged_phrase[$cur_node]['token'];
-            $tree['cur_node']++;
-            return $tree;
+        $adjective_string = self::parseTypeList($tree['cur_node'],
+            $tagged_phrase, self::$adjective_type);
+        if (!empty($adjective_string)) {
+            $tree["JJ"] = $adjective_string;
         }
         return $tree;
     }
     /**
      * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for an adjective if possible
+     * parse-from position and builds a parse tree for a determiner if possible
      *
      * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["cur_node" =>
+     * @param array $tree that consists of ["curnode" =>
      *      current parse position in $tagged_phrase]
      * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "JJ" a subarray with a token node for the adjective that was
+     *      "DT" a subarray with a token node for the determiner that was
      *      parsed
      */
-    public static function extractAdjective($tagged_phrase, $tree)
+    public static function parseDeterminer($tagged_phrase, $tree)
     {
-        $adjective_string = "";
-        $cur_node = $tree['cur_node'];
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-           in_array(trim($tagged_phrase[$cur_node]['tag']),
-           self::$adjective_phrases)) {
-           $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
-           $cur_node++;
-       }
-       if (!empty($adjective_string)) {
-           $tree["JJ"] = $adjective_string;
-       }
-       $tree['cur_node'] = $cur_node;
-       return $tree;
+        $determiner_string = "";
+        /* In: All the cows low, "All the" is considered a determiner.
+           That is, we will mush together the predeterminer with the determiner
+         */
+        $determiner_string = self::parseTypeList($tree['cur_node'],
+            $tagged_phrase, self::$determiner_type);
+        if (!empty($determiner_string)) {
+           $tree["DT"] = $determiner_string;
+        }
+        return $tree;
     }
     /**
      * Takes a part-of-speech tagged phrase and pre-tree with a
@@ -520,21 +641,37 @@ class Tokenizer
      *      "NN" a subarray with a token node for the noun string that was
      *      parsed
      */
-    public static function extractNoun($tagged_phrase, $tree)
+    public static function parseNoun($tagged_phrase, $tree)
     {
         //Combining multiple noun into one
-        $noun_string = "";
-        $cur_node = $tree['cur_node'];
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            (in_array(trim($tagged_phrase[$cur_node]['tag']),
-            self::$noun_phrases))) {
-            $noun_string .= " " . $tagged_phrase[$cur_node]['token'];
-            $cur_node++;
-        }
+        $noun_string = self::parseTypeList($tree['cur_node'], $tagged_phrase,
+            self::$noun_type);
         if (!empty($noun_string)) {
             $tree["NN"] = $noun_string;
         }
-        $tree['cur_node'] = $cur_node;
+        return $tree;
+    }
+    /**
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a verb if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "VB" a subarray with a token node for the verb string that was
+     *      parsed
+     */
+    public static function parseVerb($tagged_phrase, $tree)
+    {
+        $verb_string = self::parseTypeList($tree['cur_node'], $tagged_phrase,
+            self::$verb_type);
+        if (!empty($verb_string)) {
+            $tree["VB"] = $verb_string;
+        }
         return $tree;
     }
     /**
@@ -547,7 +684,7 @@ class Tokenizer
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["cur_node" =>
      *      current parse position in $tagged_phrase]
-     * @param int $index which term in $tagged_phrase to start to try to extract
+     * @param int $index which term in $tagged_phrase to start to try to parse
      *      a preposition from
      * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
@@ -558,51 +695,42 @@ class Tokenizer
      *      "JJ_i" with value an adjective subtree
      *      "NN_i"  with value an additional noun subtree
      */
-    public static function extractPrepositionalPhrases($tagged_phrase, $tree,
+    public static function parsePrepositionalPhrases($tagged_phrase, $tree,
         $index = 1)
     {
-       $cur_node = $tree['cur_node'];
-        // Checking for preposition.I.e, format: prep [det] [adjective] noun
+        $cur_node = $tree['cur_node'];
+        // Checking for preposition, i.e., format: prep [det] [adjective] noun
         if (isset($tagged_phrase[$cur_node]['tag']) &&
             trim($tagged_phrase[$cur_node]['tag']) == "IN") {
             /* can have multiple prep's in a row, for example,
                it is known in over 20 countries*/
-            $preposition_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                trim($tagged_phrase[$cur_node]['tag']) == "IN") {
-                $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
-            }
+            $preposition_string = self::parseTypeList($cur_node, $tagged_phrase,
+                ["IN"]);
             if (!empty($preposition_string)) {
                 $tree["IN_$index"] = $preposition_string;
             }
-            if (isset($tagged_phrase[$cur_node]['tag']) &&
-                trim($tagged_phrase[$cur_node]['tag']) == "DT") {
-                $tree['DT_$index'] = $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
-            }
-            $adjective_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                in_array(trim($tagged_phrase[$cur_node]['tag']),
-                self::$adjective_phrases)) {
-                $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
+            $determiner_string = self::parseTypeList($cur_node, $tagged_phrase,
+                self::$determiner_type);
+            if (!empty($determiner_string)) {
+                $tree["DT_$index"] = $determiner_string;
             }
+            $adjective_string = self::parseTypeList($cur_node, $tagged_phrase,
+                self::$adjective_type);
             if (!empty($adjective_string)) {
                 $tree["JJ_$index"] = $adjective_string;
             }
-            $prep_noun_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                in_array(trim($tagged_phrase[$cur_node]['tag']),
-                self::$noun_phrases)) {
-                $prep_noun_string .= " " . $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
-            }
+            $prep_noun_string = self::parseTypeList($cur_node, $tagged_phrase,
+                self::$noun_type);
             if ($prep_noun_string) {
                 $tree["NP_$index"] = $prep_noun_string;
             }
-            $tree_next = self::extractPrepositionalPhrases($tagged_phrase,
+            /* if have more than one phrase in a row:
+               the drought happened in many countries over many years.
+             */
+            $tree_next = self::parsePrepositionalPhrases($tagged_phrase,
                 ["cur_node" => $cur_node], $index + 1);
+            unset($tree_next['cur_node']);
+            $tree['PRP'] = $tree_next;
         }
         $tree['cur_node'] = $cur_node;
         return $tree;
@@ -623,70 +751,47 @@ class Tokenizer
      *      "JJ" with value an adjective subtree
      *      "NN" with value a noun tree
      */
-    public static function extractNounPhrase($tagged_phrase, $tree)
+    public static function parseNounPhrase($tagged_phrase, $tree)
     {
         $cur_node = $tree['cur_node'];
-        $tree_dt = self::extractDeterminer($tagged_phrase,
+        $tree_dt = self::parseDeterminer($tagged_phrase,
             ['cur_node' => $cur_node]);
-        $tree_jj = self::extractAdjective($tagged_phrase,
+        $tree_jj = self::parseAdjective($tagged_phrase,
             ['cur_node' => $tree_dt['cur_node']]);
-        $tree_nn = self::extractNoun($tagged_phrase,
+        $tree_nn = self::parseNoun($tagged_phrase,
             ['cur_node' => $tree_jj['cur_node']]);
-        $tree_pp = self::extractPrepositionalPhrases($tagged_phrase,
-            ['cur_node' => $tree_nn['cur_node']]);
         if ($tree_nn['cur_node'] == $cur_node) {
             $tree['NP'] = "";
-        } else {
-            $cur_node = $tree_pp['cur_node'];
-            unset($tree_dt['cur_node']);
-            $tree_new_sub['DT'] = $tree_dt;
-            unset($tree_jj['cur_node']);
-            $tree_new_sub['JJ'] = $tree_jj;
-            unset($tree_nn['cur_node']);
-            $tree_new_sub['NN'] = $tree_nn;
-            unset($tree_pp['cur_node']);
-            $tree_new_sub['PRP'] = $tree_pp;
-            $tree_new['cur_node'] = $cur_node;
-            $tree_new['NP'] = $tree_new_sub;
-            return $tree_new;
-        }
-        return $tree;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a verb if possible
-     *
-     * @param array $tagged_phrase
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "VB" a subarray with a token node for the verb string that was
-     *      parsed
-     */
-    public static function extractVerb($tagged_phrase, $tree)
-    {
-        $cur_node = $tree['cur_node'];
-        // skip stuff before verb (intensifiers and adverbs)
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            !in_array(trim($tagged_phrase[$cur_node]['tag']),
-            self::$verb_phrases)) {
-            $cur_node++;
+            return $tree;
         }
-        $verb_string = "";
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            in_array(trim($tagged_phrase[$cur_node]['tag']),
-            self::$verb_phrases)) {
-            $verb_string .= " " . $tagged_phrase[$cur_node]['token'];
+        $tree_pp = self::parsePrepositionalPhrases($tagged_phrase,
+            ['cur_node' => $tree_nn['cur_node']]);
+        $tree_aux = self::parseAuxClause($tagged_phrase,
+            ['cur_node' => $tree_pp['cur_node']]);
+        $cur_node = $tree_aux['cur_node'];
+        $cc = "";
+        if (!empty($tagged_phrase[$cur_node]['tag']) &&
+            in_array($tagged_phrase[$cur_node]['tag'],
+            self::$conjunction_type)) {
+            $cc = $tagged_phrase[$cur_node]['token'];
             $cur_node++;
         }
-        if (!empty($verb_string)) {
-            $tree["VB"] = $verb_string;
+        $tree_np = self::parseNounPhrase($tagged_phrase,
+            ['cur_node' => $cur_node]);
+        if ($tree_np['cur_node'] == $cur_node && $cc) {
+            $cur_node--;
+            $tree_np = [];
+            $cc = "";
+        } else {
+            $cur_node = $tree_np['cur_node'];
         }
-        $tree['cur_node'] = $cur_node;
-        return $tree;
+        unset($tree_dt['cur_node'], $tree_jj['cur_node'],
+            $tree_nn['cur_node'], $tree_pp['cur_node'],
+            $tree_aux['cur_node'], $tree_np['cur_node']);
+        $sub_tree = ['DT' => $tree_dt, 'JJ' => $tree_jj, 'NN' => $tree_nn,
+            'PRP' => $tree_pp, 'AUX' => $tree_aux, 'CC' => $cc,
+            'ADD_NP' => $tree_np];
+        return ['cur_node' => $cur_node, 'NP' => $sub_tree];
     }
     /**
      * Takes a part-of-speech tagged phrase and pre-tree with a
@@ -703,42 +808,40 @@ class Tokenizer
      *      "VB" with value a verb subtree
      *      "NP" with value an noun phrase subtree
      */
-    public static function extractVerbPhrase($tagged_phrase, $tree)
+    public static function parseVerbPhrase($tagged_phrase, $tree)
     {
         $cur_node = $tree['cur_node'];
-        $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]);
-        if ($tree_vb['cur_node'] == $cur_node) {
+        $adverb_string = self::parseTypeList($cur_node, $tagged_phrase,
+            self::$adverb_type);
+        $tree_vb = self::parseVerb($tagged_phrase, ['cur_node' => $cur_node]);
+        if ($cur_node == $tree_vb['cur_node']) {
+            // if no verb return what started with
             return $tree;
         }
         $cur_node = $tree_vb['cur_node'];
-        $preposition_string = "";
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            trim($tagged_phrase[$cur_node]['tag']) == "IN") {
-            $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
-            $cur_node++;
+        $add_to_adverb_string = self::parseTypeList($cur_node,
+            $tagged_phrase, self::$adverb_type);
+        if (trim($add_to_adverb_string) != 'very') {
+            $adverb_string .= $add_to_adverb_string;
+            $tree_vb['cur_node'] = $cur_node;
+        } else {
+            $tagged_phrase[$tree_vb['cur_node']]['tag'] = 'JJ';
         }
-        if (!empty($preposition_string)) {
-            $tree_vb["IN"] = $preposition_string;
+        $tree_np = self::parseNounPhrase($tagged_phrase,
+            ['cur_node' => $tree_vb['cur_node']]);
+        $adverb_string .= self::parseTypeList($tree_np['cur_node'],
+            $tagged_phrase, self::$adverb_type);
+        if (!empty($adverb_string)) {
+            $tree_vb["RB"] = $adverb_string;
         }
-        $tree_np = self::extractNounPhrase($tagged_phrase,
-            ['cur_node' => $cur_node]);
-        $tree_new = [];
-        $tree_new_sub = [];
-        if ($tree_np['cur_node'] !=  $cur_node) {
-            $cur_node = $tree_np['cur_node'];
-            unset($tree_vb['cur_node']);
-            unset($tree_np['cur_node']);
-            $tree_new_sub['VB'] = $tree_vb;
-            $tree_new_sub['NP'] = $tree_np['NP'];
-            $tree_new['cur_node'] = $cur_node;
-            $tree_new['VP'] = $tree_new_sub;
-            return $tree_new;
+        $cur_node = $tree_np['cur_node'];
+        if (!empty($tree_np['NP'])) {
+            unset($tree_vb['cur_node'], $tree_np['cur_node']);
+            return ['VP' => ['VB' => $tree_vb, 'NP' => $tree_np['NP']],
+                'cur_node' => $cur_node];
         }
         unset($tree_vb['cur_node']);
-        $tree_new_sub['VB'] = $tree_vb;
-        $tree_new['cur_node'] = $cur_node;
-        $tree_new['VP'] = $tree_new_sub;
-        return $tree_new;
+        return ['VP' => ['VB' => $tree_vb], 'cur_node' => $cur_node];
     }
     /**
      * Given a part-of-speeech tagged phrase array generates a parse tree
@@ -752,22 +855,73 @@ class Tokenizer
      *      $tree["NP"] contains a subtree for a noun phrase
      *      $tree["VP"] contains a subtree for a verb phrase
      */
-    public static function generatePhraseParseTree($tagged_phrase)
+    public static function parseWholePhrase($tagged_phrase, $tree)
     {
-        $tree = [];
+        // for example: In the dark of winter, he walked silently.
+        $tree_start = self::parsePrepositionalPhrases($tagged_phrase,
+            ["cur_node" => $tree['cur_node']]);
+        $cur_node = empty($tree_start['cur_node']) ? $tree['cur_node'] :
+            $tree_start['cur_node'];
+        unset($tree_start['cur_node']);
         //cur_node is the index in tagged_phrase we've parse to so far
-        $tree_np = self::extractNounPhrase($tagged_phrase, ["cur_node" => 0]);
-        $tree = ["cur_node" => $tree_np['cur_node']];
-        $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree);
-        if ($tree == $tree_vp) {
+        $tree_np = self::parseNounPhrase($tagged_phrase,
+            ["cur_node" => $cur_node]);
+        if ($tree_np['cur_node'] == $cur_node) {
             return $tree;
         }
-        $tree['cur_node'] = $tree_vp['cur_node'];
-        unset($tree_np['cur_node']);
-        unset($tree_vp['cur_node']);
-        $tree['NP'] = $tree_np['NP'];
-        $tree['VP'] = $tree_vp['VP'];
-        return $tree;
+        $tree_vp = self::parseVerbPhrase($tagged_phrase,
+            ["cur_node" => $tree_np['cur_node']]);
+        if ($tree_np['cur_node'] == $tree_vp['cur_node']) {
+            return $tree;
+        }
+        $cur_node = $tree_vp['cur_node'];
+        unset($tree_np['cur_node'], $tree_vp['cur_node']);
+        if (!empty($tree_start) && !empty($tree_np['NP'])) {
+            $tree_np['NP']['PRP-1'] = $tree_start;
+        }
+        return ['cur_node' => $cur_node, 'NP' => $tree_np['NP'],
+            'VP' => $tree_vp['VP']];
+    }
+    /**
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for an auxiliary clause
+     * if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     */
+    public static function parseAuxClause($tagged_phrase, $tree)
+    {
+        $cur_node = $tree["cur_node"];
+        $token = empty($tagged_phrase[$cur_node]["token"]) ? "" :
+            trim($tagged_phrase[$cur_node]["token"]);
+        if (!in_array($token, ["that", "who", "which", "because", "like",
+            "as"])) {
+            return $tree;
+        }
+        $cur_node++;
+        $tree_vp = self::parseVerbPhrase($tagged_phrase,
+            ["cur_node" => $cur_node]);
+        if ($cur_node != $tree_vp['cur_node']) {
+            $cur_node = $tree_vp['cur_node'];
+            unset($tree_vp['cur_node']);
+            return ['cur_node' => $cur_node,
+                'IN' => $token, 'PHRASE' => $tree_vp];
+        }
+        $tree_wp = self::parseWholePhrase($tagged_phrase,
+            ["cur_node" => $cur_node]);
+        if ($tree_wp['cur_node'] == $cur_node) {
+            return $tree;
+        }
+        $cur_node = $tree_wp['cur_node'];
+        unset($tree_wp['cur_node']);
+        return ['cur_node' => $cur_node,
+            'IN' => $token, 'PHRASE' => $tree_wp];
     }
     /**
      * Takes a parse tree of a phrase and computes subject, predicate, and
@@ -809,8 +963,13 @@ class Tokenizer
             }, \ARRAY_FILTER_USE_KEY );
         $triplet_types = ['CONCISE', 'RAW'];
         foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
+            $word_and_phrase = self::compressSentence($word_and_phrase);
+            // strip parentheticals
+            $word_and_phrase = preg_replace("/[\{\[\(][^\}\]\)]+[\}\]\)]/u",
+                "", $word_and_phrase);
             $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase);
-            $parse_tree = self::generatePhraseParseTree($tagged_phrase);
+            $parse_tree = self::parseWholePhrase($tagged_phrase,
+                ['cur_node' => 0]);
             $triplets = self::extractTripletsParseTree($parse_tree);
             $extracted_triplets = self::rearrangeTripletsByType($triplets);
             foreach ($triplet_types as $type) {
@@ -954,11 +1113,10 @@ class Tokenizer
     public static function parseWhoQuestion($tagged_question, $index)
     {
         $generated_questions = [];
-        $question_marker = self::getQuestionMarker();
         $tree = ["cur_node" => $index];
         $tree['NP'] = "WHO";
         $triplets = [];
-        $tree_vp = self::extractVerbPhrase($tagged_question, $tree);
+        $tree_vp = self::parseVerbPhrase($tagged_question, $tree);
         $triplets['predicate'] = self::extractPredicateParseTree(
             $tree_vp);
         $triplets['object'] = self::extractObjectParseTree(
@@ -970,8 +1128,8 @@ class Tokenizer
                 $generated_questions[$type][] =
                     trim($triplets['object'][$type]) .
                     " " . trim($triplets['predicate'][$type]) . " " .
-                    $question_marker;
-                $generated_questions[$type][] = $question_marker .
+                    self::$question_token;
+                $generated_questions[$type][] = self::$question_token .
                     " " . trim($triplets['predicate'][$type]) .
                     " " . trim($triplets['object'][$type]);
             }
@@ -993,10 +1151,9 @@ class Tokenizer
     {
         $generated_questions = [];
         $aux_verb = "";
-        $question_marker = self::getQuestionMarker();
         while (isset($tagged_question[$index]) &&
             in_array(trim($tagged_question[$index]['tag']),
-            self::$verb_phrases)) {
+            self::$verb_type)) {
             $token = trim($tagged_question[$index]['token']);
             $aux_verb .= " " . $token;
             $index++;
@@ -1004,9 +1161,9 @@ class Tokenizer
         $tree = ["cur_node" => $index];
         $tree['NP'] = "WHPlus";
         $triplets = [];
-        $tree_np = self::extractNounPhrase($tagged_question, $tree);
+        $tree_np = self::parseNounPhrase($tagged_question, $tree);
         $triplets['subject'] = self::extractSubjectParseTree($tree_np);
-        $tree_vp = self::extractVerbPhrase($tagged_question, $tree_np);
+        $tree_vp = self::parseVerbPhrase($tagged_question, $tree_np);
         $triplets['predicate'] = self::extractPredicateParseTree($tree_vp);
         if (!empty($aux_verb)) {
             if (!isset($triplets['predicate']['RAW'])) {
@@ -1022,8 +1179,8 @@ class Tokenizer
                 $generated_questions[$type][] =
                     trim($triplets['subject'][$type]) .
                     " " . trim($triplets['predicate'][$type]) .
-                    " " . $question_marker;
-                $generated_questions[$type][] = $question_marker.
+                    " " . self::$question_token;
+                $generated_questions[$type][] = self::$question_token .
                     " " . trim($triplets['predicate'][$type]) .
                     " " . trim($triplets['subject'][$type]);
             }
@@ -1073,7 +1230,6 @@ class Tokenizer
             && !empty($sub_pred_obj_triplets['predicate'][$type])
             && !empty($sub_pred_obj_triplets['object'][$type])) {
             $question_answer_triplets = [];
-            $question_marker = self::$question_marker;
             $sentence = [ trim($sub_pred_obj_triplets['subject'][$type]),
                 trim($sub_pred_obj_triplets['predicate'][$type]),
                 trim($sub_pred_obj_triplets['object'][$type])];
@@ -1088,7 +1244,7 @@ class Tokenizer
                 }
                 for ($i = 0; $i < 3; $i++) {
                     $q_sentence = $sentence;
-                    $q_sentence[$i] = $question_marker;
+                    $q_sentence[$i] = self::$question_token;
                     $q_sentence_string = implode(" ", $q_sentence);
                     $q_sentence_string = self::stemPhrase($q_sentence_string);
                     $question_triplets[] = $q_sentence_string;
@@ -1504,80 +1660,4 @@ class Tokenizer
         }
         return $tagged_phrase;
     }
-    /**
-     * The function returns the question marker for the locale
-     *
-     * @return the question marker
-     */
-    public static function getQuestionMarker()
-    {
-        return self::$question_marker;
-    }
-    /**
-     * From Back to Basics: CLASSY 2006 page 3:
-     * 2. We remove many adverbs and all conjunctions, including phrases such
-     * as "As a matter of fact," and "At this point," that occur at the start
-     * of a sentence.
-     *
-     * @param string $sentence_to_compress the sentence to compress
-     * @return the compressed sentence
-     */
-    public static function compressSentenceStep2($sentence_to_compress)
-    {
-        $result = $sentence_to_compress;
-        $result = preg_replace("/^At this point,?/i", "", $result);
-        $result = preg_replace("/^As a matter of fact,?/i", "", $result);
-        //adverbs
-        $result = preg_replace("/^[a-zA-Z]*ly\s?/i", "", $result);
-        //conjunctions
-        $result = preg_replace("/(^and,?)|(^but,?)|(^for,?)|(^nor,?)|(^or,?)" .
-            "|(^so,?)|(^yet,?)/i", "", $result);
-        return $result;
-    }
-    /**
-     * From Back to Basics: CLASSY 2006 page 3:
-     * 3. We remove a small selections of words that occur in the middle of a
-     * sentence, such as ", however," and ", also," (not always requiring the
-     * commas).
-     *
-     * @param string $sentence_to_compress the sentence to compress
-     * @return the compressed sentence
-     */
-    public static function compressSentenceStep3($sentence_to_compress)
-    {
-        $result = $sentence_to_compress;
-        $result = preg_replace("/,?\s?however,?/i", "", $result);
-        $result = preg_replace("/,?\s?also,?/i", "", $result);
-        return $result;
-    }
-    /**
-     * From Back to Basics: CLASSY 2006 page 3:
-     * 4. For DUC 2006, we added the removal of ages such as ", 51," or
-     * ", aged 24,".
-     *
-     * @param string $sentence_to_compress the sentence to compress
-     * @return the compressed sentence
-     */
-    public static function compressSentenceStep4($sentence_to_compress)
-    {
-        $result = $sentence_to_compress;
-        $result = preg_replace("/,\s?\d{1,3},/i", "", $result);
-        $result = preg_replace("/,\s?aged\s?\d{1,3},/i", "", $result);
-        return $result;
-    }
-    /**
-     * From Back to Basics: CLASSY 2006 page 3:
-     * 6. We remove relative clause attributives (clauses beginning with
-     * "who(m)", "which", "when", and "where") wherever possible.
-     *
-     * @param string $sentence_to_compress the sentence to compress
-     * @return the compressed sentence
-     */
-    public static function compressSentenceStep5($sentence_to_compress)
-    {
-        $result = $sentence_to_compress;
-        $result = preg_replace("/(,\s?whom?[^,]*,)|(,\s?which[^,]*,)|" .
-            "(,\s?when[^,]*,)|(,\s?where[^,]*,)/i", "", $result);
-        return $result;
-    }
 }
diff --git a/src/locale/en_US/resources/all_aux_grams.txt b/src/locale/en_US/resources/all_aux_grams.txt
new file mode 100755
index 000000000..2593797db
--- /dev/null
+++ b/src/locale/en_US/resources/all_aux_grams.txt
@@ -0,0 +1,194 @@
+governor general
+governor generals
+lieutenant governor
+lieutenant governors
+prime ministers
+executive power
+executive powers
+justin trudeau
+pierre trudeau
+chief justice
+charter of rights and freedoms
+privy council
+agreed on
+agreed to
+aral 88
+aye yah ah ah
+back up
+backed up
+battle ax
+beat up
+beefed up
+belly up
+blacked in
+blow up
+blown up
+boxed in
+break in
+break up
+bucking up
+buckle on
+build up
+built in
+bulked up
+bust up
+buttoned up
+cable tv
+call in
+carbon 14
+carry in
+carry on
+catch 22
+catch up
+cave in
+chevrolet pontiac gm
+chin up
+classified ad
+clean up
+cleaned up
+close in
+close up
+cobalt 60
+color tv
+cover up
+dairy oh
+dammed up
+dc 8
+dc 10
+derring do
+double a
+dried up
+drive in
+drop in
+dual road up
+dust up
+efficient in
+either or
+fade in
+fall in
+fenced in
+fill in
+follow on
+follow up
+foul up
+full on
+garbage in
+gathering in
+go go go
+goings on
+good by
+grown up
+gung ho
+hands on
+hangers on
+hard come by
+he goes or i go
+head on
+heigh ho
+high up
+hook up
+huang ti
+hunched up
+hyped up
+jumped up
+just say no
+kung fu
+lash up
+lean to
+line up
+link up
+live in
+lock up
+locking in
+made for tv
+made up
+make up
+mark up
+mixed up
+models on the way up
+mouth up
+move up
+movie to be
+near by
+nearly 30
+occupation as
+odds on
+oh the pain of it
+on the go
+one pound or so
+over 40
+over 50
+paid in
+paid up
+painted in
+passer by
+passers by
+pasted in
+pay as you go
+pent up
+phase in
+pick up
+plug in
+powers that be
+public tv
+puffed up
+pumped up
+push up
+radio tv
+rolled up
+runner up
+runners up
+sales of
+satellite tv
+seven up
+shack up
+shake up
+shangri la
+shape up
+shoo in
+shot up
+should be
+shut in
+snap in
+snap on
+soon to be
+souped up
+speed up
+speeded up
+split up
+stand by
+stand in
+stand up
+start up
+step up
+stepped up
+stern to
+stored up
+stuck up
+swearing in
+take up
+tobacco ad
+trade ad
+trade in
+trade up
+triple a
+trumped up
+trussed up
+tune in
+turned up
+under 35
+under 50
+unheard of
+wake up
+walk in
+walk on
+walk to
+walk up
+warm up
+wash up
+well to do
+wife to be
+with it
+would be
+wrap up
+write in
diff --git a/src/locale/en_US/resources/all_word_grams.ftr b/src/locale/en_US/resources/all_word_grams.ftr
new file mode 100644
index 000000000..02b9b5d60
Binary files /dev/null and b/src/locale/en_US/resources/all_word_grams.ftr differ
diff --git a/src/locale/es/resources/Tokenizer.php b/src/locale/es/resources/Tokenizer.php
index cdc63905c..1e816e868 100755
--- a/src/locale/es/resources/Tokenizer.php
+++ b/src/locale/es/resources/Tokenizer.php
@@ -112,12 +112,13 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with no stop words
      */
-    public static function stopwordsRemover($page)
+    public static function stopwordsRemover($data)
     {
-        $stop_words = ["de", "la", "que", "el","en", "y", "a", "los",
+        static $stop_words = ["de", "la", "que", "el","en", "y", "a", "los",
             "del", "se", "las", "por", "un", "para", "con", "no", "una",
             "su", "al", "lo", "como", "más", "pero", "sus", "le", "ya", "o",
             "este", "sí", "porque", "esta", "entre", "cuando", "muy", "sin",
@@ -171,9 +172,12 @@ class Tokenizer
             "tuviéramos", "tuvierais", "tuvieran", "tuviese", "tuvieses",
             "tuviésemos", "tuvieseis", "tuviesen", "teniendo", "tenido",
             "tenida", "tenidos", "tenidas", "tened"];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '',
-            mb_strtolower($page));
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u';
+        }
+        $data = preg_replace($pattern, '', $data);
+        return $data;
     }
     /**
      * Computes the stem of a French word
diff --git a/src/locale/fa/resources/Tokenizer.php b/src/locale/fa/resources/Tokenizer.php
index d76065fd8..bef154e7c 100755
--- a/src/locale/fa/resources/Tokenizer.php
+++ b/src/locale/fa/resources/Tokenizer.php
@@ -67,12 +67,13 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with no stop words
      */
-    public static function stopwordsRemover($page)
+    public static function stopwordsRemover($data)
     {
-        $stop_words = [
+        static $stop_words = [
             "در", "به", "از", "كه", "مي", "اين", "است", "را", "با", "هاي",
             "براي", "آن", "يك", "شود", "شده","خود", "ها", "كرد", "شد", "اي",
             "تا", "كند", "بر", "بود", "گفت", "نيز", "وي", "هم", "كنند",
@@ -123,9 +124,12 @@ class Tokenizer
             "لطفاً", "ّه", "انکه",
             "وقتیکه", "همین", "پیش", "مدّتی", "هنگامی", "مان", "تان"
             ];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '',
-            mb_strtolower($page));
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u';
+        }
+        $data = preg_replace($pattern, '', $data);
+        return $data;
     }
     /**
      * Computes the stem of a Persian word
diff --git a/src/locale/fr_FR/resources/Tokenizer.php b/src/locale/fr_FR/resources/Tokenizer.php
index cb3460642..667e69756 100755
--- a/src/locale/fr_FR/resources/Tokenizer.php
+++ b/src/locale/fr_FR/resources/Tokenizer.php
@@ -109,10 +109,11 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with no stop words
      */
-    public static function stopwordsRemover($page)
+    public static function stopwordsRemover($data)
     {
         $stop_words = ['alors', 'au', 'aucuns', 'aussi', 'autre', 'avant',
             'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux',
@@ -132,9 +133,12 @@ class Tokenizer
             'tout', 'trop', 'très', 'tu','valeur', 'voie', 'voient', 'vont',
             'votre','vous','vu','ça','étaient', 'état', 'étions', 'été',
             'être'];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '',
-            mb_strtolower($page));
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u';
+        }
+        $data = preg_replace($pattern, '', $data);
+        return $data;
     }
     /**
      * Computes the stem of a French word
diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php
index c4a53107b..c97d3c3d0 100755
--- a/src/locale/hi/resources/Tokenizer.php
+++ b/src/locale/hi/resources/Tokenizer.php
@@ -47,36 +47,36 @@ class Tokenizer
      * List of verb-like parts of speech that might appear in lexicon
      * @var array
      */
-    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
+    public static $verb_type = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
         "RB"];
     /**
      * List of noun-like parts of speech that might appear in lexicon
      * @var array
      */
-    public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "DT"];
+    public static $noun_type = ["NN", "NNS", "NNP", "NNPS", "DT"];
     /**
      * List of adjective-like parts of speech that might appear in lexicon
      * @var array
      */
-    public static $adjective_phrases = ["JJ", "JJR", "JJS"];
+    public static $adjective_type = ["JJ", "JJR", "JJS"];
     /**
      * List of postpositional-like parts of speech that might appear in lexicon
      * @var array
      */
-    public static $postpositional_phrases = ["IN", "inj", "PREP", "proNN",
+    public static $postpositional_type = ["IN", "inj", "PREP", "proNN",
         "CONJ", "INT", "particle", "case", "PSP", "direct_DT", "PRP"];
     /**
      * List of questions in Hindi
      * @var array
      */
-    public static $questions = ["क्या", "कब", "कहा", "क्यों", "कौन", "जिसे",
-            "जिसका", "कहाँ", "कहां"];
+    public static $question_pattern =
+        "/क्या|कब|कहा|क्यों|कौन|जिसे|जिसका|कहाँ|कहां/u";
     /**
      * Any unique identifier corresponding to the component of a triplet which
      * can be answered using a question answer list
      * @var string
      */
-    public static $question_marker = "qqq";
+    public static $question_token = "qqq";
     /**
      * Words we don't want to be stemmed
      * @var array
@@ -144,11 +144,10 @@ class Tokenizer
         $lexicon_file = C\LOCALE_DIR . "/hi/resources/lexicon.txt.gz";
         if (empty($dictionary)) {
             if (file_exists($lexicon_file)) {
-                $lines = gzfile($lexicon_file);
-                foreach ($lines as $line) {
-                    $tags = preg_split('/(\s+|\,)/u', trim($line));
-                    $dictionary[array_shift($tags)] = array_filter($tags);
-                }
+                $lex_data = gzdecode(file_get_contents($lexicon_file));
+                preg_match_all("/([^\s,]+)[\s,]+([^\n]+)/u",
+                    $lex_data, $lex_parts);
+                $dictionary = array_combine($lex_parts[1], $lex_parts[2]);
             }
         }
         $tokens = preg_split("/\s+/u", $text);
@@ -162,12 +161,12 @@ class Tokenizer
             $current = ["token" => $token, "tag" => "UNKNOWN"];
             $term = $current["token"];
             if (!empty($dictionary[$token])) {
-                $tag_list = $dictionary[$token];
+                $tag_list = explode(" ", $dictionary[$token]);
                 $current['tag'] = $tag_list[0];
             }
             if (is_numeric($token)) {
                 $current["tag"] = "NN";
-            } else if (strcmp($token,"है") == 0 || strcmp($token, "हैं") == 0) {
+            } else if (in_array($token, ["है", "हैं"])) {
                 $current["tag"] = "VB";
             }
             if (empty($current["tag"])) {
@@ -176,7 +175,8 @@ class Tokenizer
             $result[$i] = $current;
             $i++;
         }
-        return self::tagUnknownWords($result);
+        $result = self::tagUnknownWords($result);
+        return $result;
     }
     /**
      * This method tags the remaining words in a partially tagged text array.
@@ -189,7 +189,7 @@ class Tokenizer
     public static function tagUnknownWords($partially_tagged_text)
     {
         $result = $partially_tagged_text;
-        $verbs = ["VBZ","VBD","VBN"];
+        $verbs = ["VBZ", "VBD", "VBN"];
         $length = count($result);
         $previous = $result[0];
         for ($i = 1; $i < $length; $i++)
@@ -296,6 +296,24 @@ class Tokenizer
         }
         return $tagged_phrase;
     }
+    /**
+     * Consumes consecutive tokens at $cur_node whose tags are in $type,
+     * advancing $cur_node past them; returns them as one spaced string
+     */
+    public static function parseTypeList(&$cur_node, $tagged_phrase, $type)
+    {
+        $string = "";
+        $start_node = $cur_node;
+        $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" :
+            trim($tagged_phrase[$cur_node]['tag']);
+        // (conjunctions between list items are not currently handled)
+        while ($next_tag && (in_array($next_tag, $type))) {
+            $string .= " ". $tagged_phrase[$cur_node]['token'];
+            $cur_node++;
+            $next_tag = (empty($tagged_phrase[$cur_node]['tag'])) ? "" :
+                trim($tagged_phrase[$cur_node]['tag']);
+        }
+        return $string;
+    }
     /**
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a noun if possible
@@ -310,21 +328,60 @@ class Tokenizer
      *      "NN" a subarray with a token node for the noun string that was
      *      parsed
      */
-    public static function extractNoun($tagged_phrase, $tree)
+    public static function parseNoun($tagged_phrase, $tree)
     {
         //Combining multiple noun into one
-        $noun_string = "";
-        $cur_node = $tree["cur_node"];
-        while (isset($tagged_phrase[$cur_node]["tag"]) &&
-            (in_array(trim($tagged_phrase[$cur_node]["tag"]),
-            self::$noun_phrases))) {
-            $noun_string .= " " . $tagged_phrase[$cur_node]["token"];
-            $cur_node++;
-        }
+        $noun_string = self::parseTypeList($tree['cur_node'], $tagged_phrase,
+            self::$noun_type);
         if (!empty($noun_string)) {
             $tree["NN"] = $noun_string;
         }
-        $tree["cur_node"] = $cur_node;
+        return $tree;
+    }
+    /**
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a verb if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "VB" a subarray with a token node for the verb string that was
+     *      parsed
+     */
+    public static function parseVerb($tagged_phrase, $tree)
+    {
+        $verb_string = self::parseTypeList($tree['cur_node'], $tagged_phrase,
+            self::$verb_type);
+        if (!empty($verb_string)) {
+            $tree["VB"] = $verb_string;
+        }
+        return $tree;
+    }
+    /**
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for an adjective if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "JJ" a subarray with a token node for the adjective that was
+     *      parsed
+     */
+    public static function parseAdjective($tagged_phrase, $tree)
+    {
+        $adjective_string = self::parseTypeList($tree['cur_node'],
+            $tagged_phrase, self::$adjective_type);
+        if (!empty($adjective_string)) {
+            $tree["JJ"] = $adjective_string;
+        }
         return $tree;
     }
     /**
@@ -334,59 +391,43 @@ class Tokenizer
      *
      * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
+     *     "tag" => part_of_speech_tag_for_term)
      * @param array $tree that consists of ["cur_node" =>
      *      current parse position in $tagged_phrase]
      * @param int $index position in array to start from
      * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      */
-    public static function extractPostpositionPhrase($tagged_phrase, $tree,
+    public static function parsePostpositionPhrase($tagged_phrase, $tree,
         $index = 1)
     {
         $cur_node = $tree["cur_node"];
         $tree_pp["cur_node"] = $tree["cur_node"];
         if (isset ($tagged_phrase[$cur_node]["tag"]) &&
             in_array($tagged_phrase[$cur_node]["tag"],
-            self::$postpositional_phrases)) {
-            $pp_string ="";
-            while (isset($tagged_phrase[$cur_node]["tag"]) &&
-                in_array($tagged_phrase[$cur_node]["tag"],
-                self::$postpositional_phrases)) {
-                $pp_string .= " " . $tagged_phrase[$cur_node]["token"];
-                $cur_node++;
-            }
+            self::$postpositional_type)) {
+            $pp_string = self::parseTypeList($cur_node, $tagged_phrase,
+                self::$postpositional_type);
             if (!empty($pp_string)) {
                 $tree_pp["IN_$index"] = $pp_string;
             }
-            $adjective_string = "";
-            while (isset($tagged_phrase[$cur_node]["tag"]) &&
-                in_array($tagged_phrase[$cur_node]["tag"],
-                    self::$adjective_phrases)) {
-                $adjective_string .= " " .
-                    $tagged_phrase[$cur_node]["token"];
-                $cur_node++;
-            }
+            $adjective_string = self::parseTypeList($cur_node, $tagged_phrase,
+                self::$adjective_type);
             if (!empty($adjective_string)) {
                 $tree_pp["JJ_$index"] = $adjective_string;
             }
-            $nn_string = "";
-            while (isset($tagged_phrase[$cur_node]["tag"]) &&
-                in_array($tagged_phrase[$cur_node]["tag"],
-                    self::$noun_phrases)) {
-                $nn_string .= " " . $tagged_phrase[$cur_node]["token"];
-                $cur_node++;
-            }
+            $nn_string = self::parseTypeList($cur_node, $tagged_phrase,
+                self::$noun_type);
             if (!empty($nn_string)) {
                 $tree_pp["NN_$index"] = $nn_string;
             }
             $tree_pp["cur_node"] = $cur_node;
-            $tree_next = self::extractPostpositionPhrase($tagged_phrase,
+            $tree_next = self::parsePostpositionPhrase($tagged_phrase,
                 $tree_pp, $index + 1);
-            $tree_pp = array_merge ($tree_pp, $tree_next);
+            $tree_pp = array_merge($tree_pp, $tree_next);
         }
         $tree["cur_node"] = $tree_pp["cur_node"];
-        unset ($tree_pp["cur_node"]);
+        unset($tree_pp["cur_node"]);
         $tree["POST"] = $tree_pp;
         return $tree;
     }
@@ -404,56 +445,21 @@ class Tokenizer
      *      "JJ" with value an Adjective subtree
      *      "NN" with value of a Noun Subtree
      */
-    public static function extractNounPhrase($tagged_phrase, $tree)
+    public static function parseNounPhrase($tagged_phrase, $tree)
     {
         $cur_node = $tree["cur_node"];
-        $tree_jj = self::extractAdjective($tagged_phrase,
+        $tree_jj = self::parseAdjective($tagged_phrase,
             ["cur_node" => $tree["cur_node"]]);
-        $tree_nn = self::extractNoun($tagged_phrase,
+        $tree_nn = self::parseNoun($tagged_phrase,
             ["cur_node" => $tree_jj["cur_node"]]);
         if ($tree_nn["cur_node"] == $cur_node) {
             $tree["NP"] = "";
-        } else {
-            $cur_node = $tree_nn["cur_node"];
-            unset($tree_jj["cur_node"]);
-            $tree_new_sub["JJ"] = $tree_jj;
-            unset($tree_nn["cur_node"]);
-            $tree_new_sub["NN"] = $tree_nn;
-            $tree_new["cur_node"] = $cur_node;
-            $tree_new["NP"] = $tree_new_sub;
-            return $tree_new;
-        }
-        return $tree;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a verb if possible
-     *
-     * @param array $tagged_phrase
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "VB" a subarray with a token node for the verb string that was
-     *      parsed
-     */
-    public static function extractVerb($tagged_phrase, $tree)
-    {
-        $cur_node = $tree["cur_node"];
-        $verb_string = "";
-        while (isset($tagged_phrase[$cur_node]["tag"]) &&
-            in_array(trim($tagged_phrase[$cur_node]["tag"]),
-            self::$verb_phrases)) {
-            $verb_string .= " " . $tagged_phrase[$cur_node]["token"];
-            $cur_node++;
-        }
-        if (!empty($verb_string)) {
-            $tree["VB"] = $verb_string;
+            return $tree;
         }
-        $tree["cur_node"] = $cur_node;
-        return $tree;
+        $cur_node = $tree_nn["cur_node"];
+        unset($tree_jj["cur_node"], $tree_nn["cur_node"]);
+        return ["cur_node" => $cur_node, "NP" =>
+            ["JJ" => $tree_jj, "NN" => $tree_nn] ];
     }
     /**
      * Takes a part-of-speech tagged phrase and pre-tree with a
@@ -469,73 +475,30 @@ class Tokenizer
      *      "VP" a subarray with possible fields
      *      "VB" with value a verb subtree
      */
-    public static function extractVerbPhrase($tagged_phrase, $tree)
+    public static function parseVerbPhrase($tagged_phrase, $tree)
     {
         $cur_node = $tree["cur_node"];
-        $tree_vb = self::extractVerb($tagged_phrase, ["cur_node" => $cur_node]);
+        $tree_vb = self::parseVerb($tagged_phrase, ["cur_node" => $cur_node]);
         if ($tree_vb["cur_node"] == $cur_node) {
             $tree["VP"] = [];
             return $tree;
         }
         $cur_node = $tree_vb["cur_node"];
-        $postposition_string = "";
-        while (isset($tagged_phrase[$cur_node]["tag"]) &&
-            in_array(trim($tagged_phrase[$cur_node]["tag"]),
-                self::$postpositional_phrases)) {
-            $postposition_string .= " ". $tagged_phrase[$cur_node]["token"];
-            $cur_node++;
-        }
+        $postposition_string = self::parseTypeList($cur_node,
+            $tagged_phrase, self::$postpositional_type);
         if (!empty($postposition_string)) {
             $tree_vb["IN"] = $postposition_string;
         }
-        $tree_np = self::extractNounPhrase($tagged_phrase,
-            ["cur_node" => $cur_node]);
-        $tree_new = [];
-        $tree_new_sub = [];
+        $tree_np = self::parseNounPhrase($tagged_phrase,
+            ["cur_node" => $cur_node]);
         if ($tree_np["cur_node"] !=  $cur_node) {
             $cur_node = $tree_np["cur_node"];
             unset($tree_vb["cur_node"], $tree_np["cur_node"]);
-            $tree_new_sub["VB"] = $tree_vb;
-            $tree_new_sub["NP"] = $tree_np["NP"];
-            $tree_new["cur_node"] = $cur_node;
-            $tree_new["VP"] = $tree_new_sub;
-            return $tree_new;
+            return ['cur_node' => $cur_node, 'VP' =>['VB' => $tree_vb,
+                'NP' => $tree_np["NP"]]];
         }
         unset($tree_vb["cur_node"]);
-        $tree_new_sub["VB"] = $tree_vb;
-        $tree_new["cur_node"] = $cur_node;
-        $tree_new["VP"] = $tree_new_sub;
-        return $tree_new;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for an adjective if possible
-     *
-     * @param array $tagged_phrase
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["cur_node" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "JJ" a subarray with a token node for the adjective that was
-     *      parsed
-     */
-    public static function extractAdjective($tagged_phrase, $tree)
-    {
-        $adjective_string = "";
-        $cur_node = $tree["cur_node"];
-        while (isset($tagged_phrase[$cur_node]["tag"]) &&
-            in_array(trim($tagged_phrase[$cur_node]["tag"]),
-            self::$adjective_phrases)) {
-            $adjective_string .= " " . $tagged_phrase[$cur_node]["token"];
-            $cur_node++;
-        }
-        if (!empty($adjective_string)) {
-            $tree["JJ"] = $adjective_string;
-        }
-        $tree["cur_node"] = $cur_node;
-        return $tree;
+        return ['cur_node' => $cur_node, 'VP' => ['VB' => $tree_vb]];
     }
     /**
      * Given a part-of-speeech tagged phrase array generates a parse tree
@@ -550,20 +513,17 @@ class Tokenizer
      *      $tree["POST"] contains a subtree for a object phrase
      *      $tree["VP"] contains a subtree for a predicate phrase
      */
-    public static function generatePhraseParseTree($tagged_phrase)
+    public static function parseWholePhrase($tagged_phrase, $tree)
     {
-        $tree = [];
-        $tree_np = self::extractNounPhrase($tagged_phrase,["cur_node" => 0]);
-        $tree = ["cur_node" => $tree_np["cur_node"]];
-        $tree_pp = self::extractPostpositionPhrase($tagged_phrase, $tree);
-        $tree["cur_node"] = $tree_pp["cur_node"];
-        $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree);
-        $tree["cur_node"] = $tree_vp["cur_node"];
+        $tree_np = self::parseNounPhrase($tagged_phrase,["cur_node" => 0]);
+        $tree_pp = self::parsePostpositionPhrase($tagged_phrase,
+            ["cur_node" => $tree_np["cur_node"]] );
+        $tree_vp = self::parseVerbPhrase($tagged_phrase,
+            ["cur_node" => $tree_pp["cur_node"]] );
+        $cur_node = $tree_vp["cur_node"];
         unset($tree_np["cur_node"], $tree_pp["cur_node"], $tree_vp["cur_node"]);
-        $tree["NP"] = $tree_np["NP"];
-        $tree["POST"] = $tree_pp["POST"];
-        $tree["VP"] = $tree_vp["VP"];
-        return $tree;
+        return ["cur_node" => $cur_node, "NP" => $tree_np["NP"],
+            "POST" => $tree_pp["POST"], "VP" => $tree_vp["VP"]];
     }
     /**
      * Scans a word list for phrases. For phrases found generate
@@ -587,7 +547,8 @@ class Tokenizer
             $sentence = preg_replace("/\s+/u", " ", $word_and_phrase);
             $sentence = trim($sentence);
             $tagged_phrase = self::tagTokenizePartOfSpeech($sentence);
-            $parse_tree = self::generatePhraseParseTree($tagged_phrase);
+            $parse_tree = self::parseWholePhrase($tagged_phrase,
+                ["cur_node" => 0]);
             $triplets = self::extractTripletsParseTree($parse_tree);
             $extracted_triplets = self::rearrangeTripletsByType($triplets);
             foreach ($triplet_types as $type) {
@@ -772,7 +733,6 @@ class Tokenizer
             && !empty($sub_pred_obj_triplets["predicate"][$type])
             && !empty($sub_pred_obj_triplets["object"][$type])) {
             $question_answer_triplets = [];
-            $question_marker = self::$question_marker;
             $sentence = [$sub_pred_obj_triplets["subject"][$type],
                     $sub_pred_obj_triplets["object"][$type],
                     $sub_pred_obj_triplets["predicate"][$type]];
@@ -780,7 +740,7 @@ class Tokenizer
             for ($j = 0; $j < 2; $j++) {
                 for ($i = 0; $i < 3; $i++) {
                     $question = $sentence;
-                    $question[$i] = $question_marker;
+                    $question[$i] = self::$question_token;
                     $question_string = implode(" ", $question);
                     $question_string = trim($question_string);
                     $question_string = preg_replace("/\s+/u", " ",
@@ -806,20 +766,19 @@ class Tokenizer
     public static function parseQuestion($tagged_question, $index)
     {
         $generated_questions = [];
-        $question_marker = trim(self::getQuestionMarker());
         $triplets = [];
-        $tree_np = self::extractNounPhrase($tagged_question,
+        $tree_np = self::parseNounPhrase($tagged_question,
             ["cur_node" => 0]);
         $triplets["subject"] = self::extractSubjectParseTree($tree_np);
-        $tree_vp = self::extractVerbPhrase($tagged_question,
-            ["cur_node" => $index+1]);
+        $tree_vp = self::parseVerbPhrase($tagged_question,
+            ["cur_node" => $index + 1]);
         $triplets["predicate"] = self::extractPredicateParseTree($tree_vp);
         $triplet_types = ["CONCISE", "RAW"];
         foreach ($triplet_types as $type) {
             if (!empty($triplets["subject"][$type])
                 && !empty($triplets["predicate"][$type])) {
                 $question = trim (trim($triplets["subject"][$type]) .
-                    " " . $question_marker .
+                    " " . self::$question_token .
                     " " . trim($triplets["predicate"][$type]));
                 $question = preg_replace("/\s+/u", " ", $question);
                 $generated_questions[$type][] = $question;
@@ -836,22 +795,7 @@ class Tokenizer
      */
     public function isQuestion($phrase)
     {
-        $phrase = trim($phrase);
-        for ($i = 0; $i < count(self::$questions); $i++) {
-            if (mb_strpos($phrase, trim(self::$questions[$i])) !== false) {
-                return true;
-            }
-        }
-        return false;
-    }
-    /**
-     * The function returns the question marker for the locale
-     *
-     * @return the question marker
-     */
-    public static function getQuestionMarker()
-    {
-        return self::$question_marker;
+        return preg_match(self::$question_pattern, $phrase);
     }
     /**
      * Takes questions and returns the triplet from the question
diff --git a/src/locale/hi/resources/all_aux_grams.txt b/src/locale/hi/resources/all_aux_grams.txt
new file mode 100755
index 000000000..c9b76d32f
--- /dev/null
+++ b/src/locale/hi/resources/all_aux_grams.txt
@@ -0,0 +1 @@
+महामा गाँधी
diff --git a/src/locale/hi/resources/all_word_grams.ftr b/src/locale/hi/resources/all_word_grams.ftr
new file mode 100644
index 000000000..6d8246a5c
Binary files /dev/null and b/src/locale/hi/resources/all_word_grams.ftr differ
diff --git a/src/locale/it/resources/Tokenizer.php b/src/locale/it/resources/Tokenizer.php
index f6968e4fa..20d1bc1f0 100755
--- a/src/locale/it/resources/Tokenizer.php
+++ b/src/locale/it/resources/Tokenizer.php
@@ -102,12 +102,13 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with no stop words
      */
-    public static function stopwordsRemover($page)
+    public static function stopwordsRemover($data)
     {
-        $stop_words = [
+        static $stop_words = [
             'http', 'https',
             "ad", "al", "allo", "ai", "agli", "all", "agl", "alla",
             "alle", "con", "col", "coi", "da", "dal", "dallo", "dai",
@@ -150,9 +151,12 @@ class Tokenizer
             "steste", "stettero", "stessi", "stesse", "stessimo", "stessero",
             "stando"
         ];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '',
-            mb_strtolower($page));
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u';
+        }
+        $data = preg_replace($pattern, '', $data);
+        return $data;
     }
     /**
      * Computes the stem of an Italian word
diff --git a/src/locale/ru/resources/Tokenizer.php b/src/locale/ru/resources/Tokenizer.php
index 78d3ff7b7..ea75a4af2 100755
--- a/src/locale/ru/resources/Tokenizer.php
+++ b/src/locale/ru/resources/Tokenizer.php
@@ -68,12 +68,13 @@ class Tokenizer
     /**
      * Removes the stop words from the page (used for Word Cloud generation)
      *
-     * @param string $page the page to remove stop words from.
-     * @return string $page with no stop words
+     * @param mixed $data either a string or an array of string to remove
+     *      stop words from
+     * @return mixed $data with no stop words
      */
-    public static function stopwordsRemover($page)
+    public static function stopwordsRemover($data)
     {
-        $stop_words = ["й", "ч", "чп", "ое", "юфп",
+        static $stop_words = ["й", "ч", "чп", "ое", "юфп",
             "по", "об", "с", "у", "уп", "лбл", "б", "фп",
             "чуе", "поб", "фбл", "езп", "оп", "дб", "фщ",
             "л", "х", "це", "чщ", "ъб", "вщ", "рп",
@@ -106,9 +107,12 @@ class Tokenizer
             "фблпк", "йн", "впмее", "чуездб",
             "лпоеюоп", "чуа", "нецдх", 'http', 'https'
         ];
-        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '',
-            strtolower($page));
-        return $page;
+        static $pattern = "";
+        if (empty($pattern)) {
+            $pattern = '/\b(' . implode('|', $stop_words) . ')\b/u';
+        }
+        $data = preg_replace($pattern, '', $data);
+        return $data;
     }
     /**
      * Computes the stem of a Russian word
diff --git a/tests/EnTokenizerTest.php b/tests/EnTokenizerTest.php
index b36598eca..4c2aed370 100644
--- a/tests/EnTokenizerTest.php
+++ b/tests/EnTokenizerTest.php
@@ -103,7 +103,7 @@ class EnTokenizerTest extends UnitTest
     }
     /**
      * Tests the question answering system for English. Sees if correctly
-     * ectract [s v o] stemmed triplets from sentences, and whether it can
+     * extracts [s v o] stemmed triplets from sentences, and whether it can
      * use those to answer questions.
      */
     public function questionAnswerTestCase()
diff --git a/tests/IndexDictionaryTest.php b/tests/IndexDictionaryTest.php
new file mode 100644
index 000000000..8e6560940
--- /dev/null
+++ b/tests/IndexDictionaryTest.php
@@ -0,0 +1,128 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2018  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2018
+ * @filesource
+ */
+namespace seekquarry\yioop\tests;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\IndexShard;
+use seekquarry\yioop\library\IndexDictionary;
+use seekquarry\yioop\library\UnitTest;
+
+/**
+ * Used to test that the IndexDictionary class can properly add shards
+ * and retrieve correct posting slice ranges in the shards.
+ *
+ * @author Chris Pollett
+ */
+class IndexDictionaryTest extends UnitTest
+{
+    /**
+     * Construct some index shard we can add documents to
+     */
+    public function setUp()
+    {
+        $this->test_objects['shard'] = new IndexShard(C\WORK_DIRECTORY.
+            "/shard.txt", 0);
+        $this->test_objects['shard2'] = new IndexShard(C\WORK_DIRECTORY.
+            "/shard2.txt", 1);
+        $this->test_objects['shard3'] = new IndexShard(C\WORK_DIRECTORY.
+            "/shard3.txt", 2);
+        $this->test_objects['dictionary'] = new IndexDictionary(
+            C\WORK_DIRECTORY . "/dictionary", null);
+    }
+    /**
+     * Deletes any index shard files we may have created
+     */
+    public function tearDown()
+    {
+        set_error_handler(null);
+        @unlink(C\WORK_DIRECTORY . "/shard.txt");
+        @unlink(C\WORK_DIRECTORY . "/shard2.txt");
+        @unlink(C\WORK_DIRECTORY . "/shard3.txt");
+        $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
+        $db = new $dbms_manager();
+        $db->unlinkRecursive(C\WORK_DIRECTORY . "/dictionary", true);
+        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
+    }
+    /**
+     * Check that appending two index shards works correctly
+     */
+    public function addShardDictionaryTestCase()
+    {
+        $docid = "AAAAAAAABBBBBBBBCCCCCCCC"; //set up doc
+        $offset = 5;
+        $word_counts = [
+            'MMMMMMMM' => [1, 3, 5],
+            'NNNNNNNN' => [2, 4, 6],
+            'OOOOOOOO' => [7, 8, 9],
+        ];
+        $meta_ids = ["PPPPPPPP", "QQQQQQQQ"];
+        $this->test_objects['shard']->addDocumentWords($docid,
+            $offset, $word_counts, $meta_ids);
+        $this->assertEqual($this->test_objects['shard']->len_all_link_docs, 9,
+            "Len All Docs Correctly Counts Length of First Doc");
+        $this->test_objects['shard']->save();
+        $shard = new IndexShard(C\WORK_DIRECTORY .
+            "/shard.txt", 0, C\NUM_DOCS_PER_GENERATION, true);
+        $word_id = L\crawlHashWord('MMMMMMMM');
+        $shard_info = $shard->getWordInfo($word_id);
+        $this->test_objects['dictionary']->addShardDictionary($shard);
+        $dict_info = $this->test_objects['dictionary']->getWordInfo($word_id);
+        array_shift($dict_info[0]);
+        $first_entry = array_shift($dict_info);
+        $this->assertEqual($shard_info, $first_entry,
+            "Shard word entry agrees with dictionary word entry");
+        $docid = "AAAAAAAABBBBBBBBEEEEEEEE";
+        $offset = 10;
+        $word_counts = [
+            'BBBBBBBB' => [1],
+            'CCCCCCCC' => [2],
+            'MMMMMMMM' => [6],
+        ];
+        $meta_ids = ["EEEEEEEE", "FFFFFFFF"];
+        $this->test_objects['shard2']->addDocumentWords($docid,
+            $offset, $word_counts, $meta_ids);
+        $this->test_objects['shard2']->save();
+        $shard = new IndexShard(C\WORK_DIRECTORY .
+            "/shard2.txt", 1, C\NUM_DOCS_PER_GENERATION, true);
+        $word_id = L\crawlHashWord('MMMMMMMM');
+        $shard_info2 = $shard->getWordInfo($word_id);
+        $this->test_objects['dictionary']->addShardDictionary($shard);
+        $dict_info = $this->test_objects['dictionary']->getWordInfo($word_id);
+        $this->assertEqual(count($dict_info), 2,
+            "After second shard insert have two entries for all M word");
+        array_shift($dict_info[1]);
+        $second_entry = $dict_info[1];
+        $this->assertEqual($shard_info2, $second_entry,
+            "Second entry in two shard case for M word matches expected");
+    }
+}
diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php
index 22d6f758e..a05fc593c 100644
--- a/tests/IndexShardTest.php
+++ b/tests/IndexShardTest.php
@@ -338,12 +338,12 @@ class IndexShardTest extends UnitTest
         $this->test_objects['shard']->save();
         $this->test_objects['shard2'] = IndexShard::load(C\WORK_DIRECTORY.
             "/shard.txt");
+        $word_info = $this->test_objects['shard']->getWordInfo(
+            L\crawlHashWord('FFFFFFFF'));
         $this->assertEqual($this->test_objects['shard2']->len_all_docs, 3,
             "Len All Docs Correctly Counts Length of First Doc");
-
         $c_data = $this->test_objects['shard2']->getPostingsSliceById(
             L\crawlHashWord('BBBBBBBB', true), 5);
-
         $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
             "Doc lookup by word works");
         $c_data = $this->test_objects['shard2']->getPostingsSliceById(
@@ -386,5 +386,15 @@ class IndexShardTest extends UnitTest
             L\crawlHashWord('FFFFFFFF', true), 5);
         $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
             "String Load Doc lookup 2 by word works");
+        // Check if save without dictionary preserves postings
+        $word_info = $this->test_objects['shard']->getWordInfo(
+            L\crawlHashWord('FFFFFFFF'));
+        $this->test_objects['shard']->saveWithoutDictionary();
+        $shard = new IndexShard(C\WORK_DIRECTORY .
+            "/shard.txt", 0, C\NUM_DOCS_PER_GENERATION, true);
+        $c_data = $shard->getPostingsSlice($word_info[0],
+            $word_info[0], $word_info[1], 5);
+        $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
+            "Save without dictionary test works");
     }
 }
diff --git a/tests/PhraseParserTest.php b/tests/PhraseParserTest.php
index ea429e151..9775f8250 100644
--- a/tests/PhraseParserTest.php
+++ b/tests/PhraseParserTest.php
@@ -70,7 +70,6 @@ EOD;
         $this->assertTrue(in_array("dr", $words), "Abbreviation 1");
         $this->assertTrue(in_array("_ty", $words), "Initials 1");
         $this->assertTrue(in_array("_jrr", $words), "Initials 2");
-
         $phrase_string = <<< EOD
 THE THE
 ‘Deep Space nine’ ‘Deep Space’ version of GIANT the the
@@ -84,9 +83,9 @@ EOD;
         $this->assertTrue(in_array("the the", $words), "Extract Bigram 1");
         $this->assertTrue(in_array("deep space", $words), "Extract Bigram 2");
         $this->assertTrue(in_array("deep", $words), "Unigrams still present 1");
-        $this->assertTrue(in_array("space", $words), "Unigrams still present 2");
+        $this->assertTrue(in_array("space", $words),
+            "Unigrams still present 2");
         $this->assertTrue(in_array("2012", $words), "Punctuation removal 1");
-
         $phrase_string = <<< EOD
  百度一下,你就知道
  .. 知 道 MP3 图 片 视 频 地 图 输入法 手写
@@ -126,7 +125,7 @@ EOD;
         $this->assertTrue(in_array("a_and_w", $words), "Ampersand Test 1");
         $this->assertTrue(in_array("a_and_tt", $words), "Ampersand Test 2");
         $this->assertTrue(in_array("fish_and_chip", $words), "n for and test");
-        $this->assertTrue(in_array("chris_a_pollett_d_org", $words),
+        $this->assertTrue(in_array("chris_at_pollett_d_org", $words),
             "Email Check 1");
         $this->assertTrue(in_array(
             "http_c__s__s_www_d_yo_d_org_s_index_d_pl_q_a_e_b_and_c_e_d",
@@ -257,8 +256,8 @@ EOD;
         $this->assertTrue(array_diff($segments, $correct_segments) == [],
             "Segmenter Test 2");
         $segments = PhraseParser::segmentSegment("你们好吗?", 'zh-CN');
-        $correct_segments = ["你们", "好", "吗", "?"];
-        $this->assertTrue((count($segments) == 4), "Segmenter Test 3");
+        $correct_segments = ["你们", "好", "吗"];
+        $this->assertTrue((count($segments) == 3), "Segmenter Test 3");
         $this->assertTrue(array_diff($segments, $correct_segments) == [],
             "Segmenter Test 4");
     }
ViewGit