viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php index 5f0c88dbf..47115cf67 100644 --- a/src/configs/TokenTool.php +++ b/src/configs/TokenTool.php @@ -117,8 +117,8 @@ http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists A little script-fu can generally take such a list and put it into the format of one word/term per line which is needed by TokenTool.php -For filter file, Raw page count dumps can be found at -http://dumps.wikimedia.org/other/pagecounts-raw/ +For filter file, page count dumps can be found at +https://dumps.wikimedia.org/other/pagecounts-ez/merged/ These probably give the best n-gram or all gram results, usually in a matter of minutes; nevertheless, this tool does support trying to extract similar data from Wikipedia dumps. This can take hours. @@ -191,26 +191,26 @@ function makeNWordGramsFiles($args) { if (!isset($args[1])) { $args[1] = "en"; - $args[2] = "en-US"; + $args[2] = "en_US"; } if (!isset($args[2])) { $args[2] = $args[1]; } if (!isset($args[3])) { - $args[3] = 2; // bigrams + $args[3] = "all"; // 2 or more (all-grams) } if (!isset($argv[4])) { $args[4] = NWordGrams::PAGE_COUNT_WIKIPEDIA; } if (!isset($args[5]) && $args[3] == "all" && - $args[2] == NWordGrams::PAGE_COUNT_WIKIPEDIA) { - $args[5] = 400000; + $args[4] == NWordGrams::PAGE_COUNT_WIKIPEDIA) { + $args[5] = 100000; } else { $args[5] = -1; } - $wiki_file_path = PREP_DIR."/"; - if (!file_exists($wiki_file_path.$args[0])) { - echo $args[0]." does not exist in $wiki_file_path"; + $wiki_file_path = PREP_DIR . "/"; + if (!file_exists($wiki_file_path . $args[0])) { + echo $args[0] . 
" does not exist in $wiki_file_path"; exit(); } /* @@ -220,10 +220,9 @@ function makeNWordGramsFiles($args) list($num_ngrams, $max_gram_len) = NWordGrams::makeNWordGramsTextFile($args[0], $args[1], $args[2], $args[3], $args[4], $args[5]); - /* *This call creates a bloom filter file from n word grams text file based - *on the language specified.The lang passed as parameter is prefixed + *on the language specified. The lang passed as parameter is prefixed *to the filter file name. The count of n word grams in text file is passed *as a parameter to set the limit of n word grams in the filter file. */ @@ -243,7 +242,7 @@ function makeNWordGramsFiles($args) function makeSuggestTrie($dict_file, $locale, $end_marker) { $locale = str_replace("-", "_", $locale); - $out_file = LOCALE_DIR."/$locale/resources/suggest_trie.txt.gz"; + $out_file = LOCALE_DIR . "/$locale/resources/suggest_trie.txt.gz"; // Read and load dictionary and stop word files $words = fileWithTrim($dict_file); @@ -276,7 +275,7 @@ function makeSuggestTrie($dict_file, $locale, $end_marker) function fileWithTrim($file_name) { if (!file_exists($file_name)) { - $file_name = PREP_DIR."/$file_name"; + $file_name = PREP_DIR . 
"/$file_name"; if (!file_exists($file_name)) { echo "$file_name Not Found\n\n"; return []; diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index d4c18e2a5..9e0c88cd7 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -139,7 +139,10 @@ class ArcTool implements CrawlConstants $this->outputPostingInfo($path, $argv[3], $argv[4], $num); break; case "rebuild": - $this->rebuildIndexArchive($path); + if (!isset($argv[3])) { + $argv[3] = 0; + } + $this->rebuildIndexArchive($path, $argv[3]); break; case "count": if (!isset($argv[3])) { @@ -548,7 +551,9 @@ class ArcTool implements CrawlConstants * @param int $max_tier tier up to which the dictionary tiers should be * merge (typically a value greater than the max_tier of the * dictionary) - * @param int + * @param mixed $start_shard which shard to start + * shard from. If 'continue' then keeps going from where last attempt at + * a rebuild was. */ public function reindexIndexArchive($path, $max_tier = -1, $start_shard = 0) { @@ -566,7 +571,7 @@ class ArcTool implements CrawlConstants $start_shard = 0; } } - $shards = glob($path."/posting_doc_shards/index*"); + $shards = glob($path . "/posting_doc_shards/index*"); $num_shards = count($shards); echo "Total number of shards to reindex is: $num_shards"; if (is_array($shards)) { @@ -851,27 +856,47 @@ class ArcTool implements CrawlConstants * Then a reindex is done. * * @param string $archive_path file path to a IndexArchiveBundle + * @param mixed $start_generation which web archive generation to start + rebuild from. If 'continue' then keeps going from where last attempt at + a rebuild was. */ - public function rebuildIndexArchive($archive_path) + public function rebuildIndexArchive($archive_path, $start_generation = 0) { $archive_type = $this->getArchiveKind($archive_path); $archive_name = C\NS_LIB . 
$archive_type ; if ($archive_type != "IndexArchiveBundle") { $this->badFormatMessageAndExit($archive_path); } + $shard_count_file = $archive_path . "/reindex_count.txt"; + if (trim($start_generation) === "continue") { + if (file_exists($shard_count_file)) { + $start_generation = + intval(file_get_contents($shard_count_file)); + echo "Restarting rebuild index from $start_generation\n"; + } else { + $start_generation= 0; + file_put_contents($shard_count_file, $start_generation); + } + } $info = $archive_name::getArchiveInfo($archive_path); $tmp = unserialize($info["DESCRIPTION"]); $video_sources = $tmp[self::VIDEO_SOURCES]; $generation_info = unserialize( file_get_contents("$archive_path/generation.txt")); - $num_generations = $generation_info['ACTIVE']+1; + $num_generations = $generation_info['ACTIVE'] + 1; $archive = new WebArchiveBundle($archive_path."/summaries"); + $dbms_manager = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager"; + $db = new $dbms_manager(); + $db->unlinkRecursive($archive_path . "/dictionary", false); + IndexDictionary::makePrefixLetters($archive_path . "/dictionary"); + $dictionary = new IndexDictionary($archive_path . "/dictionary"); $seen = 0; - $generation = 0; + $generation = $start_generation; $keypad = "\x00\x00\x00\x00"; while($generation < $num_generations) { $partition = $archive->getPartition($generation, false); - $shard_name = $archive_path."/posting_doc_shards/index$generation"; + $shard_name = $archive_path . 
+ "/posting_doc_shards/index$generation"; L\crawlLog("Processing partition $generation"); if (file_exists($shard_name)) { L\crawlLog("..Unlinking old shard $generation"); @@ -976,9 +1001,17 @@ class ArcTool implements CrawlConstants $seen_partition += $num_to_get; } $shard->save(false, true); + $shard = new IndexShard($shard_name, $generation, + C\NUM_DOCS_PER_GENERATION, true); + if ($dictionary->addShardDictionary($shard)) { + $shard->saveWithoutDictionary(true); + file_put_contents($shard_count_file, $generation + 1); + } else { + echo "Problem adding shard $i"; + exit(); + } $generation++; } - $this->reindexIndexArchive($archive_path); } /** * Used to create an archive_bundle_iterator for a non-yioop archive diff --git a/src/library/NWordGrams.php b/src/library/NWordGrams.php index 7fb269f54..2a54cac26 100644 --- a/src/library/NWordGrams.php +++ b/src/library/NWordGrams.php @@ -61,6 +61,10 @@ class NWordGrams * text file name containing bigrams. */ const TEXT_SUFFIX = "_word_grams.txt"; + /** + * Auxiliary suffix file ngrams to add to filter + */ + const AUX_SUFFIX = "_aux_grams.txt"; const WIKI_DUMP_REDIRECT = 0; const WIKI_DUMP_TITLE = 1; const PAGE_COUNT_WIKIPEDIA = 2; @@ -77,17 +81,27 @@ class NWordGrams public static function ngramsContains($phrase, $lang, $filter_prefix = 2) { $lang = str_replace("-", "_", $lang); - if (self::$ngrams == null || !isset(self::$ngrams[$filter_prefix])) { + if ($lang == 'en' || $lang == 'en_GB') { + $lang = 'en_US'; + } + if (empty(self::$ngrams)) { + self::$ngrams = []; + } + if (empty(self::$ngrams[$lang])) { + self::$ngrams[$lang] = []; + } + if (empty(self::$ngrams[$lang][$filter_prefix])) { $filter_path = C\LOCALE_DIR . "/$lang/resources/" . "{$filter_prefix}" . 
self::FILTER_SUFFIX; if (file_exists($filter_path)) { - self::$ngrams[$filter_prefix] = + self::$ngrams[$lang][$filter_prefix] = BloomFilterFile::load($filter_path); - } else { + } else { return false; } } - return self::$ngrams[$filter_prefix]->contains(mb_strtolower($phrase)); + return self::$ngrams[$lang][$filter_prefix]->contains( + mb_strtolower($phrase)); } /** * Creates a bloom filter file from a n word gram text file. The @@ -114,19 +128,26 @@ class NWordGrams unlink($filter_path); //build again from scratch } $ngrams = new BloomFilterFile($filter_path, $num_ngrams_found); - - $inputFilePath = C\LOCALE_DIR . "/$lang/resources/" . + $input_file_path = C\LOCALE_DIR . "/$lang/resources/" . "{$num_gram}" . self::TEXT_SUFFIX; - $fp = fopen($inputFilePath, 'r') or die("Can't open ngrams text file"); + $fp = fopen($input_file_path, 'r') or + die("Can't open ngrams text file"); while ( ($ngram = fgets($fp)) !== false) { - $words = PhraseParser::stemTerms(trim($ngram), $lang); - if (strlen($words[0]) == 1) { // get rid of n grams like "a dog" - continue; - } - $ngram_stemmed = implode(" ", $words); - $ngrams->add(mb_strtolower($ngram_stemmed)); + $ngram = trim(mb_strtolower($ngram)); + $ngrams->add($ngram); } fclose($fp); + $input_file_path = C\LOCALE_DIR . "/$lang/resources/" . + "{$num_gram}" . self::AUX_SUFFIX; + if (file_exists($input_file_path)) { + $fp = fopen($input_file_path, 'r') or + die("Can't open ngrams text file"); + while ( ($ngram = fgets($fp)) !== false) { + $ngram = trim(mb_strtolower($ngram)); + $ngrams->add($ngram); + } + fclose($fp); + } $ngrams->max_gram_len = $max_gram_len; $ngrams->save(); } @@ -199,12 +220,12 @@ class NWordGrams $replace_array = ['#redirect [[',']]']; break; case self::PAGE_COUNT_WIKIPEDIA: - $pattern = '/^'.$lang.'\s[^\p{P}]+'; - $pattern_end='/u'; + $pattern = '/^' . $lang . 
"(\.[a-z])?"; + $pattern_end = '\s\d*/u'; $is_count_type = true; break; case self::PAGE_COUNT_WIKTIONARY: - $pattern = '/^'.$lang.'.d\s[^\p{P}]+'; + $pattern = '/^'.$lang.'.d\s[\p{L}|\p{Z}]+'; $pattern_end='/u'; $is_count_type = true; break; @@ -227,10 +248,10 @@ class NWordGrams $replace_types = [self::WIKI_DUMP_TITLE, self::WIKI_DUMP_REDIRECT]; if (is_dir(C\PREP_DIR."/$wiki_file") ) { - $folder_files = glob(C\PREP_DIR."/$wiki_file/*.{gz,bz}", + $folder_files = glob(C\PREP_DIR . "/$wiki_file/*.{gz,bz}", GLOB_BRACE); } else { - $folder_files = [C\PREP_DIR."/$wiki_file"]; + $folder_files = [C\PREP_DIR . "/$wiki_file"]; } $ngrams = []; foreach ($folder_files as $wiki_file_path) { @@ -280,14 +301,18 @@ class NWordGrams $line_parts = explode(" ", $matches[0]); if (isset($line_parts[1]) && isset($line_parts[2])) { - $ngram=mb_ereg_replace("_", " ",$line_parts[1]); - $char_grams = + $ngram = mb_ereg_replace("_", " ", + $line_parts[1]); + if ($char_grams = PhraseParser::getCharGramsTerm( - [$ngram],$locale); - $ngram = implode(" ", $char_grams); - $ngram_num_words=mb_substr_count($ngram, " ")+1; + [$ngram], $locale)) { + $ngram = implode(" ", $char_grams); + } + $ngram_num_words = + mb_substr_count($ngram, " ") + 1; if (($is_all && $ngram_num_words > 1) || - (!$is_all &&$ngram_num_words == $num_gram)){ + (!$is_all && + $ngram_num_words == $num_gram)) { $ngrams[$ngram] = $line_parts[2]; } } @@ -295,14 +320,20 @@ class NWordGrams $ngram = mb_ereg_replace( $replace_array, "", $matches[0]); $ngram = mb_ereg_replace("_", " ", $ngram); - $ngrams[] = $ngram; } if ($is_all && isset($ngram)) { $ngram_num_words = mb_substr_count($ngram, " ") + 1; - $max_gram_len = max($max_gram_len,$ngram_num_words); + $max_gram_len = + max($max_gram_len, $ngram_num_words); } } + if ($is_count_type && count($ngrams) > 4 * $max_terms + && $max_terms > 0) { + echo "..pruning results to $max_terms many\n"; + arsort($ngrams); + $ngrams = array_slice($ngrams, 0, $max_terms); + } } } } @@ -319,13 +350,13 
@@ class NWordGrams // in is_all case add prefix*'s for (n >= 3)-grams if ($is_all) { for ($i = 0; $i < $num_ngrams_found; $i++) { - $ngram_in_word = mb_substr_count($ngrams[$i], " ")+1; + $ngram_in_word = mb_substr_count($ngrams[$i], " ") + 1; if ($ngram_in_word >= 3) { $ngram_parts = explode(" ", $ngrams[$i]); $ngram = $ngram_parts[0]; for ($j = 1; $j < $ngram_in_word - 1; $j++ ) { - $ngram .= " ".$ngram_parts[$j]; - $ngrams[] = $ngram."*"; + $ngram .= " " . $ngram_parts[$j]; + $ngrams[] = $ngram . "*"; } } } diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 14eceb76a..097963ce8 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -134,6 +134,7 @@ class PhraseParser $string = trim(substr($string, strlen($control_word) + 1)); } else { self::canonicalizePunctuatedTerms($string, $lang); + self::underscoreEntities($string, $lang); } $terms = self::stemCharGramSegment($string, $lang); $num = count($terms); @@ -237,6 +238,7 @@ class PhraseParser 'QUESTION_ANSWER_EXTRACT' => 0]]; if (!isset(self::$programming_language_map[$lang])) { self::canonicalizePunctuatedTerms($string, $lang); + self::underscoreEntities($string, $lang); $phrase_list['TIMES']['CANONICALIZE'] = changeInMicrotime($start_time); } @@ -287,14 +289,26 @@ class PhraseParser mb_ereg_replace("\.\s*", "", $matches[0])); return $result; }, $string); - $ampersand_pattern = "/[A-Za-z]+(\s*(\s(\'n|\'N)\s|\&)\s*[A-Za-z])+/"; + $ap = "(\'|\u{2019}|\u{02BC})"; + $ampersand_pattern = "/[A-Za-z]+". + "(\s*(\s({$ap}n|{$ap}N)\s|\&)\s*[A-Za-z])+/u"; $string = preg_replace_callback($ampersand_pattern, function($matches) { + $ap = "(\'|\u{2019}|\u{02BC})"; $result = mb_strtolower( - mb_ereg_replace("\s*(\'n|\'N|\&)\s*","_and_",$matches[0])); + mb_ereg_replace("\s*(" . $ap . "n|" . $ap . "N|\&)\s*", + "_and_", $matches[0])); return $result; - }, - $string); + }, $string); + $contraction_pattern = "/\b[A-Za-z]+" . 
+ "({$ap}[A-Za-z]+|\s*{$ap}\s*(s|t))\b/u"; + $string = preg_replace_callback($contraction_pattern, + function($matches) { + $result = mb_strtolower( + mb_ereg_replace("\s*\'|\u2019|\u02BC\s*", + "_ap_", $matches[0])); + return $result; + }, $string); $url_or_email_pattern = '@((gopher|http|https)://([^ \t\r\n\v\f\'\"\;\,<>])*)|'. '([A-Z0-9._%-]+\@[A-Z0-9.-]+\.[A-Z]{2,4})@i'; @@ -303,19 +317,67 @@ class PhraseParser $result = mb_ereg_replace("\.", "_d_",$matches[0]); $result = mb_ereg_replace("\:", "_c_",$result); $result = mb_ereg_replace("\/", "_s_",$result); - $result = mb_ereg_replace("\@", "_a_",$result); + $result = mb_ereg_replace("\@", "_at_",$result); $result = mb_ereg_replace("\[", "_bo_",$result); $result = mb_ereg_replace("\]", "_bc_",$result); $result = mb_ereg_replace("\(", "_po_",$result); $result = mb_ereg_replace("\)", "_pc_",$result); $result = mb_ereg_replace("\?", "_q_",$result); $result = mb_ereg_replace("\=", "_e_",$result); - $result = mb_ereg_replace("\&", "_a_",$result); + $result = mb_ereg_replace("\&", "_and_",$result); $result = mb_strtolower($result); return $result; }, $string); } + /** + * @param string& $string a string of words, etc which might involve such + * terms + * @param $lang a language tag to use as part of the canonicalization + * process not used right now + */ + public static function underscoreEntities(&$string, $lang = null) + { + if (!$lang) { + return; + } + $string = mb_strtolower($string); + $parts = preg_split("/\s+/u", $string); + $parts = array_filter($parts); + $num_parts = count($parts); + $current_entity = ""; + $out_string = ""; + $space = ""; + $i = 0; + $j = -1; + $k = 0; + while ($j < $num_parts) { + $j++; + $current_entity = trim(implode(" ", + array_slice($parts, $i, $j - $i))); + if ($j - $i > 1) { + if (NWordGrams::ngramsContains( + $current_entity, $lang, "all")) { + $last_entity = $current_entity; + $k = $j; + } + if (!NWordGrams::ngramsContains( + $current_entity . 
"*", $lang, "all")) { + $out_string .= $space . str_replace(" ", "_", + trim($last_entity)); + $space = " "; + $current_entity = ""; + $last_entity = ""; + $i = $k; + $j = $k - 1; + } + } else { + $last_entity = $current_entity; + $k = $j; + } + } + $string = $out_string . " " . $current_entity; + } /** * Splits string according to punctuation and white space then * extracts (stems/char grams) of terms and n word grams from the string diff --git a/src/library/ScraperManager.php b/src/library/ScraperManager.php index 59a4b8722..f3ffdb7d4 100644 --- a/src/library/ScraperManager.php +++ b/src/library/ScraperManager.php @@ -86,20 +86,24 @@ class ScraperManager $scrape_rules = preg_split('/###/u', $scrape_rules_string, 0, PREG_SPLIT_NO_EMPTY); if (count($scrape_rules) > 0) { - $temp_page = self::getContentByXquery($page, + $dom = self::getContentByXquery($page, $scrape_rules[0]); unset($scrape_rules[0]); - if (!empty($temp_page)) { + if (!empty($dom)) { foreach ($scrape_rules as $tag_to_remove) { - $new_temp_page = - self::removeContentByXquery($temp_page, $tag_to_remove); - if (!empty($new_temp_page)) { - $temp_page = $new_temp_page; + self::removeContentByXquery($dom, + $tag_to_remove); + if (empty($dom)) { + break; } } + if (!empty($dom)) { + $new_temp_page = $dom->saveHTML(); + } + set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); } } - return empty($temp_page) ? $page : $temp_page; + return empty($new_temp_page) ? $page : $new_temp_page; } /** * If $signature begins with '/', checks to see if applying @@ -134,52 +138,47 @@ class ScraperManager * @param string $page a document to apply the xpath query against * @param string $query the xpath query to run * - * @return string the content found as a string, otherwise an empty string + * @return \DOMDocument dom of a simplified web page containing nodes + * matching xpath query within an html body tag. 
*/ public static function getContentByXquery($page, $query) { - $result = ""; + $out_dom = null; $dom = new \DOMDocument(); set_error_handler(null); if (@$dom->loadHTML($page)) { $xpath = new \DOMXPath($dom); $xpath_result = $xpath->query($query); if (!empty($xpath_result) && $xpath_result->length > 0) { - $result = $dom->saveHTML($xpath_result->item(0)); + $out_dom = new \DOMDocument(); + $out_dom->loadHTML("<html><body></body></html>"); + $node = $out_dom->importNode($xpath_result->item(0), true); + $out_dom->documentElement->childNodes->item(0)->appendChild( + $node); } } set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); - return $result; + return $out_dom; } /** - * Removes from the contents of a document the results of + * Removes from the contents of a DOMDocument the results of * an xpath query - * @param string $page a document to apply the xpath query against + * @param \DOMDocument $dom a document to apply the xpath query against * @param string $query the xpath query to run - * - * @return string the content less the xpath results as an HTML document */ - public static function removeContentByXquery($page, $query) + public static function removeContentByXquery($dom, $query) { - $result = $page; - $dom = new \DOMDocument(); - set_error_handler(null); - if (@$dom->loadHTML($page)) { - $xpath = new \DOMXPath($dom); - $xpath_result = $xpath->query($query); - if ($xpath_result->length > 0) { - $len = $xpath_result->length; - for ($i = 0; $i < $len; $i++) { - $node = $xpath_result->item($i); - $parent = $node->parentNode; - if ($parent) { - $parent->removeChild($node); - } + $xpath = new \DOMXPath($dom); + $xpath_result = $xpath->query($query); + if ($xpath_result->length > 0) { + $len = $xpath_result->length; + for ($i = 0; $i < $len; $i++) { + $node = $xpath_result->item($i); + $parent = $node->parentNode; + if ($parent) { + $parent->removeChild($node); } - $result = $dom->saveHTML(); } } - set_error_handler(C\NS_CONFIGS . 
"yioop_error_handler"); - return $result; } } diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php index 59c1be909..1aeb650cb 100755 --- a/src/library/processors/HtmlProcessor.php +++ b/src/library/processors/HtmlProcessor.php @@ -145,9 +145,9 @@ class HtmlProcessor extends TextProcessor } $location = self::location($dom, $url); if ($location) { - $summary[self::LINKS][$location] = "location:".$url; + $summary[self::LINKS][$location] = "location:" . $url; $summary[self::LOCATION] = true; - $summary[self::DESCRIPTION] .= $url." => ".$location; + $summary[self::DESCRIPTION] .= $url . " => " . $location; if (!$summary[self::TITLE]) { $summary[self::TITLE] = $url; } diff --git a/src/library/processors/PageProcessor.php b/src/library/processors/PageProcessor.php index 445d2e508..f4458d3b9 100644 --- a/src/library/processors/PageProcessor.php +++ b/src/library/processors/PageProcessor.php @@ -182,7 +182,6 @@ abstract class PageProcessor implements CrawlConstants * the information in $page */ public abstract function process($page, $url); - /** * Get processors for different file types. constructing * them will populate the self::$indexed_file_types, diff --git a/src/library/summarizers/CentroidWeightedSummarizer.php b/src/library/summarizers/CentroidWeightedSummarizer.php index 6b3b91728..2b5892206 100644 --- a/src/library/summarizers/CentroidWeightedSummarizer.php +++ b/src/library/summarizers/CentroidWeightedSummarizer.php @@ -93,10 +93,11 @@ class CentroidWeightedSummarizer extends Summarizer /* Format the document to remove characters other than periods and alphanumerics. 
*/ + $page = mb_strtolower($page); $formatted_doc = self::formatDoc($page); - $stop_obj = PhraseParser::getTokenizer($lang); /* Splitting into sentences */ $out_sentences = self::getSentences($page); + $stop_obj = PhraseParser::getTokenizer($lang); $sentences = self::removeStopWords($out_sentences, $stop_obj); $sentence_array = self::splitSentences($sentences, $lang); $terms = $sentence_array[0]; @@ -220,19 +221,20 @@ class CentroidWeightedSummarizer extends Summarizer $sentence = ""; $count = 0; $theshold_factor = 1; + $threshold = self::LONG_SENTENCE_THRESHOLD; foreach ($lines as $line) { $sentence .= " " . $line; if (strlen($line) < 2) { continue; } - if ($count < self::LONG_SENTENCE_THRESHOLD || + if ($count < $threshold || strlen($sentence) > $theshold_factor * - self::LONG_SENTENCE_LEN){ + self::LONG_SENTENCE_LEN) { $sentence = preg_replace("/\s+/ui", " ", $sentence); $out[] = trim($sentence); $count++; $theshold_factor = - pow(1.5, floor($count/self::LONG_SENTENCE_THRESHOLD)); + pow(1.5, floor($count/$threshold)); } $sentence = ""; } @@ -251,7 +253,7 @@ class CentroidWeightedSummarizer extends Summarizer public static function formatSentence($sent) { $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u', - ' ', mb_strtolower($sent))); + ' ', $sent)); return $sent; } /** @@ -265,7 +267,7 @@ class CentroidWeightedSummarizer extends Summarizer public static function formatDoc($content) { $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/']; - $content = preg_replace($substitute, ' ', mb_strtolower($content)); + $content = preg_replace($substitute, ' ', $content); return $content; } /** diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index 2b714bbb1..db7ecc92d 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -78,8 +78,7 @@ class Tokenizer * List of verb-like parts of speech that might appear in lexicon file * @array */ - public static $verb_phrases = ["VB", 
"VBD", "VBG", "VBN", "VBP", - "VBZ"]; + public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]; /** * List of noun-like parts of speech that might appear in lexicon file * @array diff --git a/tests/BloomFilterFileTest.php b/tests/BloomFilterFileTest.php index 17f084fbd..5be7adfdc 100755 --- a/tests/BloomFilterFileTest.php +++ b/tests/BloomFilterFileTest.php @@ -80,8 +80,16 @@ class BloomFilterFileTest extends UnitTest public function inTestCase() { $this->test_objects['FILE1']->add(77); + $this->test_objects['FILE1']->add("prime minister"); + $this->test_objects['FILE1']->add("prime minister*"); $this->assertTrue( $this->test_objects['FILE1']->contains(77), "File 1 contains 77"); + $this->assertTrue( + $this->test_objects['FILE1']->contains("prime minister"), + "File 1 contains prime minister"); + $this->assertTrue( + $this->test_objects['FILE1']->contains("prime minister*"), + "File 1 contains prime minister*"); $this->assertFalse( $this->test_objects['FILE1']->contains(66), "File 1 contains 66"); }