viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Modify rebuild index so that it adds to dictionary as goes rather than at end, a=chris

Chris Pollett [2018-06-22 18:Jun:nd]
Modify rebuild index so that it adds to dictionary as goes rather than at end, a=chris
Filename
src/configs/TokenTool.php
src/executables/ArcTool.php
src/library/NWordGrams.php
src/library/PhraseParser.php
src/library/ScraperManager.php
src/library/processors/HtmlProcessor.php
src/library/processors/PageProcessor.php
src/library/summarizers/CentroidWeightedSummarizer.php
src/locale/en_US/resources/Tokenizer.php
tests/BloomFilterFileTest.php
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index 5f0c88dbf..47115cf67 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -117,8 +117,8 @@ http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists
 A little script-fu can generally take such a list and put it into the
 format of one word/term per line which is needed by TokenTool.php

-For filter file, Raw page count dumps can be found at
-http://dumps.wikimedia.org/other/pagecounts-raw/
+For filter file, page count dumps can be found at
+https://dumps.wikimedia.org/other/pagecounts-ez/merged/
 These probably give the best n-gram or all gram results, usually
 in a matter of minutes; nevertheless, this tool does support trying to extract
 similar data from Wikipedia dumps. This can take hours.
@@ -191,26 +191,26 @@ function makeNWordGramsFiles($args)
 {
     if (!isset($args[1])) {
         $args[1] = "en";
-        $args[2] = "en-US";
+        $args[2] = "en_US";
     }
     if (!isset($args[2])) {
         $args[2] = $args[1];
     }
     if (!isset($args[3])) {
-        $args[3] = 2; // bigrams
+        $args[3] = "all"; // 2 or more (all-grams)
     }
     if (!isset($argv[4])) {
         $args[4] = NWordGrams::PAGE_COUNT_WIKIPEDIA;
     }
     if (!isset($args[5]) && $args[3] == "all" &&
-        $args[2] == NWordGrams::PAGE_COUNT_WIKIPEDIA) {
-        $args[5] = 400000;
+        $args[4] == NWordGrams::PAGE_COUNT_WIKIPEDIA) {
+        $args[5] = 100000;
     } else {
         $args[5] = -1;
     }
-    $wiki_file_path = PREP_DIR."/";
-    if (!file_exists($wiki_file_path.$args[0])) {
-        echo $args[0]." does not exist in $wiki_file_path";
+    $wiki_file_path = PREP_DIR . "/";
+    if (!file_exists($wiki_file_path . $args[0])) {
+        echo $args[0] . " does not exist in $wiki_file_path";
         exit();
     }
     /*
@@ -220,10 +220,9 @@ function makeNWordGramsFiles($args)
     list($num_ngrams, $max_gram_len) =
         NWordGrams::makeNWordGramsTextFile($args[0], $args[1], $args[2],
         $args[3], $args[4], $args[5]);
-
     /*
      *This call creates a bloom filter file from n word grams text file based
-     *on the language specified.The lang passed as parameter is prefixed
+     *on the language specified. The lang passed as parameter is prefixed
      *to the filter file name. The count of n word grams in text file is passed
      *as a parameter to set the limit of n word grams in the filter file.
      */
@@ -243,7 +242,7 @@ function makeNWordGramsFiles($args)
 function makeSuggestTrie($dict_file, $locale, $end_marker)
 {
     $locale = str_replace("-", "_", $locale);
-    $out_file = LOCALE_DIR."/$locale/resources/suggest_trie.txt.gz";
+    $out_file = LOCALE_DIR . "/$locale/resources/suggest_trie.txt.gz";

     // Read and load dictionary and stop word files
     $words = fileWithTrim($dict_file);
@@ -276,7 +275,7 @@ function makeSuggestTrie($dict_file, $locale, $end_marker)
 function fileWithTrim($file_name)
 {
     if (!file_exists($file_name)) {
-        $file_name = PREP_DIR."/$file_name";
+        $file_name = PREP_DIR . "/$file_name";
         if (!file_exists($file_name)) {
             echo "$file_name Not Found\n\n";
             return [];
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index d4c18e2a5..9e0c88cd7 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -139,7 +139,10 @@ class ArcTool implements CrawlConstants
                 $this->outputPostingInfo($path, $argv[3], $argv[4], $num);
                 break;
             case "rebuild":
-                $this->rebuildIndexArchive($path);
+                if (!isset($argv[3])) {
+                    $argv[3] = 0;
+                }
+                $this->rebuildIndexArchive($path, $argv[3]);
                 break;
             case "count":
                 if (!isset($argv[3])) {
@@ -548,7 +551,9 @@ class ArcTool implements CrawlConstants
      * @param int $max_tier tier up to which the dictionary tiers should be
      *     merge (typically a value greater than the max_tier of the
      *     dictionary)
-     * @param int
+     * @param mixed $start_shard which shard to start
+     *  shard from. If 'continue' then keeps going from where last attempt at
+     *  a rebuild was.
      */
     public function reindexIndexArchive($path, $max_tier = -1, $start_shard = 0)
     {
@@ -566,7 +571,7 @@ class ArcTool implements CrawlConstants
                 $start_shard = 0;
             }
         }
-        $shards = glob($path."/posting_doc_shards/index*");
+        $shards = glob($path . "/posting_doc_shards/index*");
         $num_shards = count($shards);
         echo "Total number of shards to reindex is: $num_shards";
         if (is_array($shards)) {
@@ -851,27 +856,47 @@ class ArcTool implements CrawlConstants
      * Then a reindex is done.
      *
      * @param string $archive_path file path to a IndexArchiveBundle
+     * @param mixed $start_generation which web archive generation to start
+     *  rebuild from. If 'continue' then keeps going from where last attempt at
+     *  a rebuild was.
      */
-    public function rebuildIndexArchive($archive_path)
+    public function rebuildIndexArchive($archive_path, $start_generation = 0)
     {
         $archive_type = $this->getArchiveKind($archive_path);
         $archive_name = C\NS_LIB . $archive_type ;
         if ($archive_type != "IndexArchiveBundle") {
             $this->badFormatMessageAndExit($archive_path);
         }
+        $shard_count_file = $archive_path . "/reindex_count.txt";
+        if (trim($start_generation) === "continue") {
+            if (file_exists($shard_count_file)) {
+                $start_generation =
+                    intval(file_get_contents($shard_count_file));
+                echo "Restarting rebuild index from $start_generation\n";
+            } else {
+                $start_generation= 0;
+                file_put_contents($shard_count_file, $start_generation);
+            }
+        }
         $info = $archive_name::getArchiveInfo($archive_path);
         $tmp = unserialize($info["DESCRIPTION"]);
         $video_sources = $tmp[self::VIDEO_SOURCES];
         $generation_info = unserialize(
             file_get_contents("$archive_path/generation.txt"));
-        $num_generations = $generation_info['ACTIVE']+1;
+        $num_generations = $generation_info['ACTIVE'] + 1;
         $archive = new WebArchiveBundle($archive_path."/summaries");
+        $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
+        $db = new $dbms_manager();
+        $db->unlinkRecursive($archive_path . "/dictionary", false);
+        IndexDictionary::makePrefixLetters($archive_path . "/dictionary");
+        $dictionary = new IndexDictionary($archive_path . "/dictionary");
         $seen = 0;
-        $generation = 0;
+        $generation = $start_generation;
         $keypad = "\x00\x00\x00\x00";
         while($generation < $num_generations) {
             $partition = $archive->getPartition($generation, false);
-            $shard_name = $archive_path."/posting_doc_shards/index$generation";
+            $shard_name = $archive_path .
+                "/posting_doc_shards/index$generation";
             L\crawlLog("Processing partition $generation");
             if (file_exists($shard_name)) {
                 L\crawlLog("..Unlinking old shard $generation");
@@ -976,9 +1001,17 @@ class ArcTool implements CrawlConstants
                 $seen_partition += $num_to_get;
             }
             $shard->save(false, true);
+            $shard = new IndexShard($shard_name, $generation,
+                C\NUM_DOCS_PER_GENERATION, true);
+            if ($dictionary->addShardDictionary($shard)) {
+                $shard->saveWithoutDictionary(true);
+                file_put_contents($shard_count_file, $generation + 1);
+            } else {
+                echo "Problem adding shard $i";
+                exit();
+            }
             $generation++;
         }
-        $this->reindexIndexArchive($archive_path);
     }
     /**
      * Used to create an archive_bundle_iterator for a non-yioop archive
diff --git a/src/library/NWordGrams.php b/src/library/NWordGrams.php
index 7fb269f54..2a54cac26 100644
--- a/src/library/NWordGrams.php
+++ b/src/library/NWordGrams.php
@@ -61,6 +61,10 @@ class NWordGrams
      * text file name containing bigrams.
      */
     const TEXT_SUFFIX = "_word_grams.txt";
+    /**
+     * Auxiliary suffix for file of ngrams to add to filter
+     */
+    const AUX_SUFFIX = "_aux_grams.txt";
     const WIKI_DUMP_REDIRECT = 0;
     const WIKI_DUMP_TITLE = 1;
     const PAGE_COUNT_WIKIPEDIA = 2;
@@ -77,17 +81,27 @@ class NWordGrams
     public static function ngramsContains($phrase, $lang, $filter_prefix = 2)
     {
         $lang = str_replace("-", "_", $lang);
-        if (self::$ngrams == null || !isset(self::$ngrams[$filter_prefix])) {
+        if ($lang == 'en' || $lang == 'en_GB') {
+            $lang = 'en_US';
+        }
+        if (empty(self::$ngrams)) {
+            self::$ngrams = [];
+        }
+        if (empty(self::$ngrams[$lang])) {
+            self::$ngrams[$lang] = [];
+        }
+        if (empty(self::$ngrams[$lang][$filter_prefix])) {
             $filter_path = C\LOCALE_DIR . "/$lang/resources/" .
                 "{$filter_prefix}" . self::FILTER_SUFFIX;
             if (file_exists($filter_path)) {
-                self::$ngrams[$filter_prefix] =
+                self::$ngrams[$lang][$filter_prefix] =
                     BloomFilterFile::load($filter_path);
-            } else  {
+            } else {
                 return false;
             }
         }
-        return self::$ngrams[$filter_prefix]->contains(mb_strtolower($phrase));
+        return self::$ngrams[$lang][$filter_prefix]->contains(
+            mb_strtolower($phrase));
     }
     /**
      * Creates a bloom filter file from a n word gram text file. The
@@ -114,19 +128,26 @@ class NWordGrams
             unlink($filter_path); //build again from scratch
         }
         $ngrams = new BloomFilterFile($filter_path, $num_ngrams_found);
-
-        $inputFilePath = C\LOCALE_DIR . "/$lang/resources/" .
+        $input_file_path = C\LOCALE_DIR . "/$lang/resources/" .
             "{$num_gram}" .  self::TEXT_SUFFIX;
-        $fp = fopen($inputFilePath, 'r') or die("Can't open ngrams text file");
+        $fp = fopen($input_file_path, 'r') or
+            die("Can't open ngrams text file");
         while ( ($ngram = fgets($fp)) !== false) {
-          $words = PhraseParser::stemTerms(trim($ngram), $lang);
-          if (strlen($words[0]) == 1) { // get rid of n grams like "a dog"
-              continue;
-          }
-          $ngram_stemmed = implode(" ", $words);
-          $ngrams->add(mb_strtolower($ngram_stemmed));
+          $ngram = trim(mb_strtolower($ngram));
+          $ngrams->add($ngram);
         }
         fclose($fp);
+        $input_file_path = C\LOCALE_DIR . "/$lang/resources/" .
+            "{$num_gram}" .  self::AUX_SUFFIX;
+        if (file_exists($input_file_path)) {
+            $fp = fopen($input_file_path, 'r') or
+                die("Can't open ngrams text file");
+            while ( ($ngram = fgets($fp)) !== false) {
+              $ngram = trim(mb_strtolower($ngram));
+              $ngrams->add($ngram);
+            }
+            fclose($fp);
+        }
         $ngrams->max_gram_len = $max_gram_len;
         $ngrams->save();
     }
@@ -199,12 +220,12 @@ class NWordGrams
                 $replace_array = ['#redirect [[',']]'];
                 break;
             case self::PAGE_COUNT_WIKIPEDIA:
-                $pattern = '/^'.$lang.'\s[^\p{P}]+';
-                $pattern_end='/u';
+                $pattern = '/^' . $lang . "(\.[a-z])?";
+                $pattern_end = '\s\d*/u';
                 $is_count_type = true;
                 break;
             case self::PAGE_COUNT_WIKTIONARY:
-                $pattern = '/^'.$lang.'.d\s[^\p{P}]+';
+                $pattern = '/^'.$lang.'.d\s[\p{L}|\p{Z}]+';
                 $pattern_end='/u';
                 $is_count_type = true;
                 break;
@@ -227,10 +248,10 @@ class NWordGrams
         $replace_types = [self::WIKI_DUMP_TITLE, self::WIKI_DUMP_REDIRECT];

         if (is_dir(C\PREP_DIR."/$wiki_file") ) {
-            $folder_files = glob(C\PREP_DIR."/$wiki_file/*.{gz,bz}",
+            $folder_files = glob(C\PREP_DIR . "/$wiki_file/*.{gz,bz}",
                 GLOB_BRACE);
         } else {
-            $folder_files = [C\PREP_DIR."/$wiki_file"];
+            $folder_files = [C\PREP_DIR . "/$wiki_file"];
         }
         $ngrams = [];
         foreach ($folder_files as $wiki_file_path) {
@@ -280,14 +301,18 @@ class NWordGrams
                             $line_parts = explode(" ", $matches[0]);
                             if (isset($line_parts[1]) &&
                                 isset($line_parts[2])) {
-                                $ngram=mb_ereg_replace("_", " ",$line_parts[1]);
-                                $char_grams =
+                                $ngram = mb_ereg_replace("_", " ",
+                                    $line_parts[1]);
+                                if ($char_grams =
                                     PhraseParser::getCharGramsTerm(
-                                        [$ngram],$locale);
-                                $ngram = implode(" ", $char_grams);
-                                $ngram_num_words=mb_substr_count($ngram, " ")+1;
+                                    [$ngram], $locale)) {
+                                    $ngram = implode(" ", $char_grams);
+                                }
+                                $ngram_num_words =
+                                    mb_substr_count($ngram, " ") + 1;
                                 if (($is_all && $ngram_num_words > 1) ||
-                                    (!$is_all &&$ngram_num_words == $num_gram)){
+                                    (!$is_all &&
+                                    $ngram_num_words == $num_gram)) {
                                     $ngrams[$ngram] = $line_parts[2];
                                 }
                             }
@@ -295,14 +320,20 @@ class NWordGrams
                             $ngram = mb_ereg_replace(
                                 $replace_array, "", $matches[0]);
                             $ngram = mb_ereg_replace("_", " ", $ngram);
-
                             $ngrams[] = $ngram;
                         }
                         if ($is_all && isset($ngram)) {
                             $ngram_num_words = mb_substr_count($ngram, " ") + 1;
-                            $max_gram_len = max($max_gram_len,$ngram_num_words);
+                            $max_gram_len =
+                                max($max_gram_len, $ngram_num_words);
                         }
                     }
+                    if ($is_count_type && count($ngrams) > 4 * $max_terms
+                        && $max_terms > 0) {
+                        echo  "..pruning results to $max_terms many\n";
+                        arsort($ngrams);
+                        $ngrams = array_slice($ngrams, 0, $max_terms);
+                    }
                 }
             }
         }
@@ -319,13 +350,13 @@ class NWordGrams
         // in is_all case add prefix*'s for (n >= 3)-grams
         if ($is_all) {
             for ($i = 0; $i < $num_ngrams_found; $i++) {
-                $ngram_in_word =  mb_substr_count($ngrams[$i], " ")+1;
+                $ngram_in_word =  mb_substr_count($ngrams[$i], " ") + 1;
                 if ($ngram_in_word >= 3) {
                     $ngram_parts = explode(" ", $ngrams[$i]);
                     $ngram = $ngram_parts[0];
                     for ($j = 1; $j < $ngram_in_word - 1;  $j++ ) {
-                        $ngram .= " ".$ngram_parts[$j];
-                        $ngrams[] = $ngram."*";
+                        $ngram .= " " . $ngram_parts[$j];
+                        $ngrams[] = $ngram . "*";
                     }
                 }
             }
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 14eceb76a..097963ce8 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -134,6 +134,7 @@ class PhraseParser
             $string = trim(substr($string, strlen($control_word) + 1));
         } else {
             self::canonicalizePunctuatedTerms($string, $lang);
+            self::underscoreEntities($string, $lang);
         }
         $terms = self::stemCharGramSegment($string, $lang);
         $num = count($terms);
@@ -237,6 +238,7 @@ class PhraseParser
             'QUESTION_ANSWER_EXTRACT' => 0]];
         if (!isset(self::$programming_language_map[$lang])) {
             self::canonicalizePunctuatedTerms($string, $lang);
+            self::underscoreEntities($string, $lang);
             $phrase_list['TIMES']['CANONICALIZE'] =
                 changeInMicrotime($start_time);
         }
@@ -287,14 +289,26 @@ class PhraseParser
                     mb_ereg_replace("\.\s*", "", $matches[0]));
                 return $result;
             }, $string);
-        $ampersand_pattern = "/[A-Za-z]+(\s*(\s(\'n|\'N)\s|\&)\s*[A-Za-z])+/";
+        $ap = "(\'|\u{2019}|\u{02BC})";
+        $ampersand_pattern = "/[A-Za-z]+".
+            "(\s*(\s({$ap}n|{$ap}N)\s|\&)\s*[A-Za-z])+/u";
         $string = preg_replace_callback($ampersand_pattern,
             function($matches) {
+                $ap = "(\'|\u{2019}|\u{02BC})";
                 $result = mb_strtolower(
-                    mb_ereg_replace("\s*(\'n|\'N|\&)\s*","_and_",$matches[0]));
+                    mb_ereg_replace("\s*(" . $ap . "n|" . $ap . "N|\&)\s*",
+                    "_and_", $matches[0]));
                 return $result;
-            },
-            $string);
+            }, $string);
+        $contraction_pattern = "/\b[A-Za-z]+" .
+            "({$ap}[A-Za-z]+|\s*{$ap}\s*(s|t))\b/u";
+        $string = preg_replace_callback($contraction_pattern,
+            function($matches) {
+                $result = mb_strtolower(
+                    mb_ereg_replace("\s*\'|\u2019|\u02BC\s*",
+                    "_ap_", $matches[0]));
+                return $result;
+            }, $string);
         $url_or_email_pattern =
             '@((gopher|http|https)://([^ \t\r\n\v\f\'\"\;\,<>])*)|'.
             '([A-Z0-9._%-]+\@[A-Z0-9.-]+\.[A-Z]{2,4})@i';
@@ -303,19 +317,67 @@ class PhraseParser
                 $result =  mb_ereg_replace("\.", "_d_",$matches[0]);
                 $result =  mb_ereg_replace("\:", "_c_",$result);
                 $result =  mb_ereg_replace("\/", "_s_",$result);
-                $result =  mb_ereg_replace("\@", "_a_",$result);
+                $result =  mb_ereg_replace("\@", "_at_",$result);
                 $result =  mb_ereg_replace("\[", "_bo_",$result);
                 $result =  mb_ereg_replace("\]", "_bc_",$result);
                 $result =  mb_ereg_replace("\(", "_po_",$result);
                 $result =  mb_ereg_replace("\)", "_pc_",$result);
                 $result =  mb_ereg_replace("\?", "_q_",$result);
                 $result =  mb_ereg_replace("\=", "_e_",$result);
-                $result =  mb_ereg_replace("\&", "_a_",$result);
+                $result =  mb_ereg_replace("\&", "_and_",$result);
                 $result = mb_strtolower($result);
                 return $result;
             },
             $string);
     }
+    /**
+     * @param string& $string a string of words, etc which might involve such
+     *      terms
+     * @param $lang a language tag to use as part of the canonicalization
+     *     process not used right now
+     */
+    public static function underscoreEntities(&$string, $lang = null)
+    {
+        if (!$lang) {
+            return;
+        }
+        $string = mb_strtolower($string);
+        $parts = preg_split("/\s+/u", $string);
+        $parts = array_filter($parts);
+        $num_parts = count($parts);
+        $current_entity = "";
+        $out_string = "";
+        $space = "";
+        $i = 0;
+        $j = -1;
+        $k = 0;
+        while ($j < $num_parts) {
+            $j++;
+            $current_entity = trim(implode(" ",
+                array_slice($parts, $i, $j - $i)));
+            if ($j - $i > 1) {
+                if (NWordGrams::ngramsContains(
+                    $current_entity, $lang, "all")) {
+                    $last_entity = $current_entity;
+                    $k = $j;
+                }
+                if (!NWordGrams::ngramsContains(
+                    $current_entity . "*", $lang, "all")) {
+                    $out_string .= $space . str_replace(" ", "_",
+                        trim($last_entity));
+                    $space = " ";
+                    $current_entity = "";
+                    $last_entity = "";
+                    $i = $k;
+                    $j = $k - 1;
+                }
+            } else {
+                $last_entity = $current_entity;
+                $k = $j;
+            }
+        }
+        $string = $out_string . " " . $current_entity;
+    }
     /**
      * Splits string according to punctuation and white space then
      * extracts (stems/char grams) of terms and n word grams from the string
diff --git a/src/library/ScraperManager.php b/src/library/ScraperManager.php
index 59a4b8722..f3ffdb7d4 100644
--- a/src/library/ScraperManager.php
+++ b/src/library/ScraperManager.php
@@ -86,20 +86,24 @@ class ScraperManager
         $scrape_rules = preg_split('/###/u',
             $scrape_rules_string, 0, PREG_SPLIT_NO_EMPTY);
         if (count($scrape_rules) > 0) {
-            $temp_page = self::getContentByXquery($page,
+            $dom = self::getContentByXquery($page,
                 $scrape_rules[0]);
             unset($scrape_rules[0]);
-            if (!empty($temp_page)) {
+            if (!empty($dom)) {
                 foreach ($scrape_rules as $tag_to_remove) {
-                    $new_temp_page =
-                        self::removeContentByXquery($temp_page, $tag_to_remove);
-                    if (!empty($new_temp_page)) {
-                        $temp_page = $new_temp_page;
+                    self::removeContentByXquery($dom,
+                        $tag_to_remove);
+                    if (empty($dom)) {
+                        break;
                     }
                 }
+                if (!empty($dom)) {
+                    $new_temp_page = $dom->saveHTML();
+                }
+                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
             }
         }
-        return empty($temp_page) ? $page : $temp_page;
+        return empty($new_temp_page) ? $page : $new_temp_page;
     }
     /**
      * If $signature begins with '/', checks to see if applying
@@ -134,52 +138,47 @@ class ScraperManager
      * @param string $page a document to apply the xpath query against
      * @param string $query the xpath query to run
      *
-     * @return string the content found as a string, otherwise an empty string
+     * @return \DOMDocument dom of a simplified web page containing nodes
+     *      matching xpath query within an html body tag.
      */
     public static function getContentByXquery($page, $query)
     {
-        $result = "";
+        $out_dom = null;
         $dom = new \DOMDocument();
         set_error_handler(null);
         if (@$dom->loadHTML($page)) {
             $xpath = new \DOMXPath($dom);
             $xpath_result = $xpath->query($query);
             if (!empty($xpath_result) && $xpath_result->length > 0) {
-                $result = $dom->saveHTML($xpath_result->item(0));
+                $out_dom = new \DOMDocument();
+                $out_dom->loadHTML("<html><body></body></html>");
+                $node = $out_dom->importNode($xpath_result->item(0), true);
+                $out_dom->documentElement->childNodes->item(0)->appendChild(
+                    $node);
             }
         }
         set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
-        return $result;
+        return $out_dom;
     }
     /**
-     * Removes from the contents of a document the results of
+     * Removes from the contents of a DOMDocument the results of
      * an xpath query
-     * @param string $page a document to apply the xpath query against
+     * @param \DOMDocument $dom a document to apply the xpath query against
      * @param string $query the xpath query to run
-     *
-     * @return string the content less the xpath results as an HTML document
      */
-    public static function removeContentByXquery($page, $query)
+    public static function removeContentByXquery($dom, $query)
     {
-        $result = $page;
-        $dom = new \DOMDocument();
-        set_error_handler(null);
-        if (@$dom->loadHTML($page)) {
-            $xpath = new \DOMXPath($dom);
-            $xpath_result = $xpath->query($query);
-            if ($xpath_result->length > 0) {
-                $len = $xpath_result->length;
-                for ($i = 0; $i < $len; $i++) {
-                    $node = $xpath_result->item($i);
-                    $parent = $node->parentNode;
-                    if ($parent) {
-                        $parent->removeChild($node);
-                    }
+        $xpath = new \DOMXPath($dom);
+        $xpath_result = $xpath->query($query);
+        if ($xpath_result->length > 0) {
+            $len = $xpath_result->length;
+            for ($i = 0; $i < $len; $i++) {
+                $node = $xpath_result->item($i);
+                $parent = $node->parentNode;
+                if ($parent) {
+                    $parent->removeChild($node);
                 }
-                $result = $dom->saveHTML();
             }
         }
-        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
-        return $result;
     }
 }
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 59c1be909..1aeb650cb 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -145,9 +145,9 @@ class HtmlProcessor extends TextProcessor
                 }
                 $location = self::location($dom, $url);
                 if ($location) {
-                    $summary[self::LINKS][$location] = "location:".$url;
+                    $summary[self::LINKS][$location] = "location:" . $url;
                     $summary[self::LOCATION] = true;
-                    $summary[self::DESCRIPTION] .= $url." => ".$location;
+                    $summary[self::DESCRIPTION] .= $url . " => " . $location;
                     if (!$summary[self::TITLE]) {
                         $summary[self::TITLE] = $url;
                     }
diff --git a/src/library/processors/PageProcessor.php b/src/library/processors/PageProcessor.php
index 445d2e508..f4458d3b9 100644
--- a/src/library/processors/PageProcessor.php
+++ b/src/library/processors/PageProcessor.php
@@ -182,7 +182,6 @@ abstract class PageProcessor implements CrawlConstants
      *     the information in $page
      */
     public abstract function process($page, $url);
-
     /**
      * Get processors for different file types. constructing
      * them will populate the self::$indexed_file_types,
diff --git a/src/library/summarizers/CentroidWeightedSummarizer.php b/src/library/summarizers/CentroidWeightedSummarizer.php
index 6b3b91728..2b5892206 100644
--- a/src/library/summarizers/CentroidWeightedSummarizer.php
+++ b/src/library/summarizers/CentroidWeightedSummarizer.php
@@ -93,10 +93,11 @@ class CentroidWeightedSummarizer extends Summarizer
         /* Format the document to remove characters other than periods and
            alphanumerics.
         */
+        $page = mb_strtolower($page);
         $formatted_doc = self::formatDoc($page);
-        $stop_obj = PhraseParser::getTokenizer($lang);
         /* Splitting into sentences */
         $out_sentences = self::getSentences($page);
+        $stop_obj = PhraseParser::getTokenizer($lang);
         $sentences = self::removeStopWords($out_sentences, $stop_obj);
         $sentence_array = self::splitSentences($sentences, $lang);
         $terms = $sentence_array[0];
@@ -220,19 +221,20 @@ class CentroidWeightedSummarizer extends Summarizer
         $sentence = "";
         $count = 0;
         $theshold_factor = 1;
+        $threshold = self::LONG_SENTENCE_THRESHOLD;
         foreach ($lines as $line) {
             $sentence .= " " . $line;
             if (strlen($line) < 2) {
                 continue;
             }
-            if ($count < self::LONG_SENTENCE_THRESHOLD ||
+            if ($count < $threshold ||
                 strlen($sentence) > $theshold_factor *
-                    self::LONG_SENTENCE_LEN){
+                    self::LONG_SENTENCE_LEN) {
                 $sentence = preg_replace("/\s+/ui", " ", $sentence);
                 $out[] = trim($sentence);
                 $count++;
                 $theshold_factor =
-                    pow(1.5, floor($count/self::LONG_SENTENCE_THRESHOLD));
+                    pow(1.5, floor($count/$threshold));
             }
             $sentence = "";
         }
@@ -251,7 +253,7 @@ class CentroidWeightedSummarizer extends Summarizer
     public static function formatSentence($sent)
     {
         $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u',
-            ' ', mb_strtolower($sent)));
+            ' ', $sent));
         return $sent;
     }
     /**
@@ -265,7 +267,7 @@ class CentroidWeightedSummarizer extends Summarizer
     public static function formatDoc($content)
     {
         $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/'];
-        $content = preg_replace($substitute, ' ', mb_strtolower($content));
+        $content = preg_replace($substitute, ' ', $content);
         return $content;
     }
     /**
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index 2b714bbb1..db7ecc92d 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -78,8 +78,7 @@ class Tokenizer
      * List of verb-like parts of speech that might appear in lexicon file
      * @array
      */
-    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP",
-        "VBZ"];
+    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"];
     /**
      * List of noun-like parts of speech that might appear in lexicon file
      * @array
diff --git a/tests/BloomFilterFileTest.php b/tests/BloomFilterFileTest.php
index 17f084fbd..5be7adfdc 100755
--- a/tests/BloomFilterFileTest.php
+++ b/tests/BloomFilterFileTest.php
@@ -80,8 +80,16 @@ class BloomFilterFileTest extends UnitTest
     public function inTestCase()
     {
         $this->test_objects['FILE1']->add(77);
+        $this->test_objects['FILE1']->add("prime minister");
+        $this->test_objects['FILE1']->add("prime minister*");
         $this->assertTrue(
             $this->test_objects['FILE1']->contains(77), "File 1 contains 77");
+        $this->assertTrue(
+            $this->test_objects['FILE1']->contains("prime minister"),
+            "File 1 contains prime minister");
+        $this->assertTrue(
+            $this->test_objects['FILE1']->contains("prime minister*"),
+            "File 1 contains prime minister*");
         $this->assertFalse(
             $this->test_objects['FILE1']->contains(66), "File 1 contains 66");
     }
ViewGit