Modify getSentences so it does sentence splitting slightly more accurately, a=chris

Chris Pollett [2014-05-09 23:May:th]
Modify getSentences so it does sentence splitting slightly more accurately, a=chris
Filename
lib/centroid.php
diff --git a/lib/centroid.php b/lib/centroid.php
index 27e834553..0d26d1bd1 100644
--- a/lib/centroid.php
+++ b/lib/centroid.php
@@ -70,7 +70,7 @@ class CentroidSummarizer
         */
         $formatted_doc = self::formatDoc($doc);
         $stop_obj = PhraseParser::getTokenizer($lang);
-        if($stop_obj != NULL) {
+        if($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
             $doc_stop = $stop_obj->stopwordsRemover($doc);
         } else {
             $doc_stop = $doc;
@@ -212,8 +212,31 @@ class CentroidSummarizer
      */
     static function getSentences($content)
     {
-        $content = preg_split("/\.\s|[\n\r]+/", $content, -1,
+        // Split after '.', '!', '?' or the ideographic full stop when followed
+        // by whitespace, or on two or more consecutive CR/LF characters.
+        $lines = preg_split("/[\.\!\?。]\s+|[\n\r][\n\r]+/u", $content, -1,
             PREG_SPLIT_NO_EMPTY);
+        $out = array();
+        $sentence = "";
+        foreach($lines as $line) {
+            $sentence .= " " . $line;
+            // Too short to inspect its last two bytes; keep accumulating.
+            if(strlen($line) < 2) {
+                continue;
+            }
+            $end = substr($line, -2);
+            // A space among the last two bytes presumably means the split
+            // fell after a one-letter token (e.g., an initial) and not at a
+            // real sentence end, so the next fragment is merged in.
+            if($end[0] != " " && $end[1] != " ") {
+                $out[] = $sentence;
+                $sentence = "";
+            }
+        }
+        if($sentence != "") {
+            $out[] = $sentence;
+        }
-        return $content;
+        // Return the assembled sentences rather than the unsplit input.
+        return $out;
     }
     /**
ViewGit