viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Adds a threshold on sentence length for summaries on longer documents, a=chris

Chris Pollett [2014-05-10 18h]
Adds a threshold on sentence length for summaries on longer documents, a=chris
Filename
controllers/components/crawl_component.php
lib/centroid.php
lib/processors/html_processor.php
diff --git a/controllers/components/crawl_component.php b/controllers/components/crawl_component.php
index f6333402e..d590b3635 100644
--- a/controllers/components/crawl_component.php
+++ b/controllers/components/crawl_component.php
@@ -1170,7 +1170,8 @@ class CrawlComponent extends Component implements CrawlConstants
                 }
             }
             $page_processor = new $processor_name($plugin_processors,
-                $seed_info["general"]["max_description_len"]);
+                $seed_info["general"]["max_description_len"],
+                $seed_info["general"]["summarizer_option"]);
             restore_error_handler();
             $doc_info = $page_processor->handle($_REQUEST['TESTPAGE'],
                 $site[self::URL]);
diff --git a/lib/centroid.php b/lib/centroid.php
index 0d26d1bd1..b02a452ed 100644
--- a/lib/centroid.php
+++ b/lib/centroid.php
@@ -53,6 +53,20 @@ foreach(glob(LOCALE_DIR."/*/resources/tokenizer.php")

 class CentroidSummarizer
 {
+
+    /**
+     * Number of bytes in a sentence before it is considered long.
+     * We use strlen rather than mb_strlen; byte length might actually be
+     * a better metric of a sentence's potential to carry information.
+     */
+    const LONG_SENTENCE_LEN = 30;
+
+    /**
+     * Number of sentences in a document beyond which only longer
+     * sentences are considered for the centroid
+     */
+    const LONG_SENTENCE_THRESHOLD = 200;
+
     /**
      *  Generates a centroid with which every sentence is ranked with cosine
      *  ranking method and also generates a word cloud.
@@ -216,6 +230,7 @@ class CentroidSummarizer
             PREG_SPLIT_NO_EMPTY);
         $out = array();
         $sentence = "";
+        $count = 0;
         foreach($lines as $line) {
             $sentence .= " " . $line;
             if(strlen($line) < 2) {
@@ -223,14 +238,18 @@ class CentroidSummarizer
             }
             $end = substr($line, -2);
             if($end[0] != " " && $end[1] != " ") {
-                $out[] = $sentence;
+                if($count < self::LONG_SENTENCE_THRESHOLD ||
+                    strlen($sentence) > self::LONG_SENTENCE_LEN) {
+                    $out[] = $sentence;
+                    $count++;
+                }
                 $sentence = "";
             }
         }
         if($sentence != "") {
             $out[] = $sentence;
         }
-        return $content;
+        return $out;
     }
     /**
      *  Formats the sentences to remove all characters except words,
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 62f35aa85..d398b96d3 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -110,7 +110,7 @@ class HtmlProcessor extends TextProcessor
                     $summary[self::TITLE], $url);
                 if($this->summarizer_option == self::CENTROID_SUMMARIZER) {
                     $summary_cloud = CentroidSummarizer::getCentroidSummary(
-                        $dom_page, $lang);
+                        $dom_page, $summary[self::LANG]);
                     $summary[self::DESCRIPTION] = $summary_cloud[0];
                     $summary[self::WORD_CLOUD] = $summary_cloud[1];
                 } else {
ViewGit