viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
controllers/components/crawl_component.php | |
lib/centroid.php | |
lib/processors/html_processor.php | |
diff --git a/controllers/components/crawl_component.php b/controllers/components/crawl_component.php index f6333402e..d590b3635 100644 --- a/controllers/components/crawl_component.php +++ b/controllers/components/crawl_component.php @@ -1170,7 +1170,8 @@ class CrawlComponent extends Component implements CrawlConstants } } $page_processor = new $processor_name($plugin_processors, - $seed_info["general"]["max_description_len"]); + $seed_info["general"]["max_description_len"], + $seed_info["general"]["summarizer_option"]); restore_error_handler(); $doc_info = $page_processor->handle($_REQUEST['TESTPAGE'], $site[self::URL]); diff --git a/lib/centroid.php b/lib/centroid.php index 0d26d1bd1..b02a452ed 100644 --- a/lib/centroid.php +++ b/lib/centroid.php @@ -53,6 +53,20 @@ foreach(glob(LOCALE_DIR."/*/resources/tokenizer.php") class CentroidSummarizer { + + /** + * Number of bytes in a sentence before it is considered long + * We use strlen rather than mbstrlen. This might actually be + * a better metric of the potential of a sentence to have info. + */ + const LONG_SENTENCE_LEN = 30; + + /** + * Number of sentences in a document before only consider longer + * sentences in cenroid + */ + const LONG_SENTENCE_THRESHOLD = 200; + /** * Generates a centroid with which every sentence is ranked with cosine * ranking method and also generates a word cloud. @@ -216,6 +230,7 @@ class CentroidSummarizer PREG_SPLIT_NO_EMPTY); $out = array(); $sentence = ""; + $count = 0; foreach($lines as $line) { $sentence .= " " . 
$line; if(strlen($line) < 2) { @@ -223,14 +238,18 @@ class CentroidSummarizer } $end = substr($line, -2); if($end[0] != " " && $end[1] != " ") { - $out[] = $sentence; + if($count < self::LONG_SENTENCE_THRESHOLD || + strlen($sentence) > self::LONG_SENTENCE_LEN) { + $out[] = $sentence; + $count++; + } $sentence = ""; } } if($sentence != "") { $out[] = $sentence; } - return $content; + return $out; } /** * Formats the sentences to remove all characters except words, diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index 62f35aa85..d398b96d3 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -110,7 +110,7 @@ class HtmlProcessor extends TextProcessor $summary[self::TITLE], $url); if($this->summarizer_option == self::CENTROID_SUMMARIZER) { $summary_cloud = CentroidSummarizer::getCentroidSummary( - $dom_page, $lang); + $dom_page, $summary[self::LANG]); $summary[self::DESCRIPTION] = $summary_cloud[0]; $summary[self::WORD_CLOUD] = $summary_cloud[1]; } else {