Added centroid method

Mangesh Dahale [2014-05-07 03:May:th]

Added centroid method

Added stopwordsremover in tokenizer

Signed-off-by: Chris Pollett <chris@pollett.org>

Filename
bin/fetcher.php
configs/config.php
controllers/components/crawl_component.php
css/search.css
lib/centroid.php
lib/crawl_constants.php
lib/processors/html_processor.php
lib/processors/text_processor.php
locale/en-US/resources/tokenizer.php
locale/zh-CN/resources/tokenizer.php
models/crawl_model.php
views/elements/crawloptions_element.php
views/search_view.php

diff --git a/bin/fetcher.php b/bin/fetcher.php
index f3b701d15..d71e8e429 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -313,7 +313,12 @@ class Fetcher implements CrawlConstants
      * @var string
      */
     var $crawl_order;
-
+    /**
+     * Stores the name of the summarizer used for crawling.
+     * Possible values are Basic and Centroid
+     * @var string
+     */
+    var $summarizer_option;
     /**
      * Indicates the kind of crawl being performed: self::WEB_CRAWL indicates
      * a new crawl of the web; self::ARCHIVE_CRAWL indicates a crawl of an
@@ -547,6 +552,7 @@ class Fetcher implements CrawlConstants

         //we will get the correct crawl order from a queue_server
         $this->crawl_order = self::PAGE_IMPORTANCE;
+        $this->summarizer_option = self::CENTROID_SUMMARIZER;
     }

     /**
@@ -1320,6 +1326,7 @@ class Fetcher implements CrawlConstants
             $info[self::CURRENT_SERVER] = $this->current_server;
         }
         $update_fields = array(self::CRAWL_TYPE => "crawl_type",
+            self::SUMMARIZER_OPTION => "summarizer_option",
             self::CRAWL_INDEX => "crawl_index", self::CRAWL_ORDER =>
             'crawl_order', self::CACHE_PAGES => 'cache_pages',
             self::INDEXED_FILE_TYPES => 'indexed_file_types',
@@ -1774,6 +1781,11 @@ class Fetcher implements CrawlConstants
                     $site[self::HASH] = FetchUrl::computePageHash(
                         $site[self::PAGE]);
                 }
+                if(isset($doc_info[self::WORD_CLOUD])) {
+                        $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
+                    } else {
+                        $site[self::WORD_CLOUD] = NULL;
+                    }
                 if(isset($doc_info[self::CRAWL_DELAY])) {
                     $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
                 }
@@ -1844,6 +1856,10 @@ class Fetcher implements CrawlConstants
                     $summarized_site_pages[$i][self::LINKS] =
                         $site[self::DOC_INFO][self::LINKS];
                 }
+                if(isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
+                    $summarized_site_pages[$i][self::WORD_CLOUD] =
+                        $site[self::DOC_INFO][self::WORD_CLOUD];
+                }
                 if(isset($site[self::DOC_INFO][self::THUMB])) {
                     $summarized_site_pages[$i][self::THUMB] =
                         $site[self::DOC_INFO][self::THUMB];
diff --git a/configs/config.php b/configs/config.php
index 222c0d5d0..307879a4f 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -106,7 +106,7 @@ if(MAINTENANCE_MODE && $_SERVER["SERVER_ADDR"] != $_SERVER["REMOTE_ADDR"]) {
 if(!defined('WORK_DIRECTORY')) {
 /*+++ The next block of code is machine edited, change at
 your own risk, please use configure web page instead +++*/
-define('WORK_DIRECTORY', '');
+define('WORK_DIRECTORY', '');
 /*++++++*/
 // end machine edited code
 }
diff --git a/controllers/components/crawl_component.php b/controllers/components/crawl_component.php
index 31fffbfc1..21a085d4e 100644
--- a/controllers/components/crawl_component.php
+++ b/controllers/components/crawl_component.php
@@ -463,6 +463,19 @@ class CrawlComponent extends Component implements CrawlConstants
                 tl('crawl_component_breadth_first'),
             self::PAGE_IMPORTANCE =>
                 tl('crawl_component_page_importance'));
+        $data['available_summarizers'] = array(
+            self::BASIC_SUMMARIZER =>
+                tl('crawl_component_basic'),
+            self::CENTROID_SUMMARIZER =>
+                tl('crawl_component_centroid'));
+        if(!$no_further_changes && isset($_REQUEST['summarizer_option'])
+            &&  in_array($_REQUEST['summarizer_option'],
+            array_keys($data['available_summarizers']))) {
+            $seed_info['general']['summarizer_option'] =
+                $_REQUEST['summarizer_option'];
+            $update_flag = true;
+        }
+        $data['summarizer_option'] = $seed_info['general']['summarizer_option'];

         if(!$no_further_changes && isset($_REQUEST['crawl_order'])
             &&  in_array($_REQUEST['crawl_order'],
diff --git a/css/search.css b/css/search.css
index 14b6b0f1b..054753581 100755
--- a/css/search.css
+++ b/css/search.css
@@ -61,7 +61,52 @@ p
 {
     color: green;
 }
+/*
+  Label shown in front of the word cloud of a search result
+ */
+span.tab
+{
+    margin-left:1em;
+    font-weight: bold;
+    color : gray;
+}
+
+/*
+  Links of a word cloud. wordcloud1 is given to the first word of the cloud
+  and wordcloud5 to the fifth; each later position gets a smaller font and
+  a lighter green
+ */
+.wordcloud .wordcloud1
+{
+    font-size:16pt;
+    color: #008800;
+    text-decoration: none;
+}
+
+.wordcloud .wordcloud2
+{
+    font-size:15pt;
+    color: #009900;
+    text-decoration: none;
+}
+
+.wordcloud .wordcloud3
+{
+    font-size:14pt;
+    color: #00AA00;
+    text-decoration: none;
+}

+.wordcloud .wordcloud4
+{
+    font-size:13pt;
+    color: #00BB00;
+    text-decoration: none;
+}
+
+.wordcloud .wordcloud5
+{
+    font-size:12pt;
+    color: #00CC00;
+    text-decoration: none;
+}
+
+/*
+  Underline a word cloud entry while the pointer is over it
+ */
+.wordcloud:hover
+{
+    text-decoration: underline;
+}

 .hidden
 {
@@ -1115,6 +1160,7 @@ ul.in-list li
 /*
   Styles for search and search result pages
  */
+
 .html-ltr .serp
 {
     left: 2.2in;
diff --git a/lib/centroid.php b/lib/centroid.php
new file mode 100644
index 000000000..874d0b823
--- /dev/null
+++ b/lib/centroid.php
@@ -0,0 +1,295 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2014  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Mangesh Dahale mangeshadahale@gmail.com
+ * @package seek_quarry
+ * @subpackage library
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2014
+ * @filesource
+ */
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+/**
+ * Reads in constants used as enums used for storing web sites
+ */
+require_once BASE_DIR."/lib/crawl_constants.php";
+/**
+ * Contains the max_description_length for the summary
+ */
+require_once BASE_DIR."/lib/processors/page_processor.php";
+/**
+ * Contains function getTokenizer to get the object of the language specified.
+ */
+require_once BASE_DIR."/lib/phrase_parser.php";
+/**
+ * Load in locale specific tokenizing code
+ */
+foreach(glob(LOCALE_DIR."/*/resources/tokenizer.php")
+    as $filename) {
+    require_once $filename;
+}
+
+class CentroidSummarizer
+{
+    /**
+     *  Generates a centroid with which every sentence is ranked with cosine
+     *  ranking method and also generates a word cloud.
+     *
+     *  Sentences of the page play the role of documents: every distinct term
+     *  left after stop word removal gets a tf-idf weight in each sentence.
+     *  The five terms with the largest page wide tf-idf weight make up the
+     *  centroid and are returned as the word cloud. Sentences are ranked by
+     *  their similarity to the centroid and the best ranked ones are output
+     *  in their original order as the summary.
+     *
+     *  @param string $doc complete raw page to generate the summary from.
+     *  @param string $lang language of the page, used to look up the
+     *      tokenizer whose stop words are removed before terms are extracted.
+     *
+     *  @return array array of summary and word cloud: element 0 is the summary
+     *      string, element 1 the array of word cloud terms
+     */
+    static function getCentroidSummary($doc, $lang)
+    {
+        $doc = self::pageProcessing($doc);
+        /* Format the document to remove characters other than periods and
+           letters; this copy is only used to compute the centroid.
+        */
+        $formatted_doc = self::formatDoc($doc);
+        $stop_obj = PhraseParser::getTokenizer($lang);
+        /* A tokenizer is not guaranteed to define stopwordsRemover, so check
+           for it rather than risk a fatal undefined method error.
+        */
+        if($stop_obj != NULL &&
+            method_exists($stop_obj, "stopwordsRemover")) {
+            $doc_stop = $stop_obj->stopwordsRemover($doc);
+        } else {
+            $doc_stop = $doc;
+        }
+
+        /* Splitting into sentences */
+        $sentences = self::getSentences($doc);
+        $n = count($sentences);
+
+        /* Splitting into terms */
+        $doc_st = self::formatSentence($doc_stop);
+        $terms = array_unique(preg_split("/[\s,]+/u", $doc_st, -1,
+            PREG_SPLIT_NO_EMPTY));
+        sort($terms);
+        $t = count($terms);
+
+        /* $tf[$i][$j] is the term frequency weight of term $j in sentence
+           $i and $nk[$j] the number of sentences term $j occurs in. Terms
+           are lower case, so each sentence is lower cased before counting;
+           occurrences are counted as substrings.
+        */
+        $tf = array();
+        $nk = ($t > 0) ? array_fill(0, $t, 0) : array();
+        for($i = 0; $i < $n; $i++) {
+            $lower_sentence = mb_strtolower($sentences[$i]);
+            $tf[$i] = array();
+            for($j = 0; $j < $t; $j++) {
+                $nt = substr_count($lower_sentence, $terms[$j]);
+                if($nt > 0) {
+                    $tf[$i][$j] = 1 + log($nt);
+                    $nk[$j]++;
+                } else {
+                    $tf[$i][$j] = 0;
+                }
+            }
+        }
+
+        /* Inverse document frequency of each term and the weight of each
+           term in each sentence. A term found in no sentence gets idf 0:
+           log(0) is -INF, which made every similarity computed below NAN,
+           and dividing by a zero count is a fatal error in PHP 8.
+        */
+        $w = array();
+        $idf = array();
+        for($k = 0; $k < $t; $k++) {
+            $idf[$k] = ($nk[$k] > 0) ? log($n / $nk[$k]) : 0;
+            $usable = ($idf[$k] >= 0 && $idf[$k] < 10);
+            for($i = 0; $i < $n; $i++) {
+                $w[$i][$k] = $usable ? $tf[$i][$k] * $idf[$k] : 0;
+            }
+        }
+
+        /* Page wide tf-idf weight of each term; here a term is counted as a
+           whole word in the formatted document. The u modifier keeps \b
+           correct for terms with non-ASCII letters.
+        */
+        $doc_centroid = preg_replace('/[\.]+/', ' ', $formatted_doc);
+        $wc = array();
+        for($j = 0; $j < $t; $j++) {
+            $nt = (int)preg_match_all('/\b'.preg_quote($terms[$j], '/').
+                '\b/u', $doc_centroid, $matches);
+            $wc[$j] = ($nt > 0) ? (1 + log($nt)) * $idf[$j] : 0;
+        }
+
+        /* The centroid is made of the five terms of largest weight */
+        arsort($wc);
+        $centroid = array_slice($wc, 0, 5, true);
+
+        /* Keep the weights of the centroid terms only, all other terms
+           weigh 0. The centroid terms are also the word cloud.
+        */
+        $wc = ($t > 0) ? array_fill(0, $t, 0) : array();
+        $word_cloud = array();
+        foreach($centroid as $key => $value) {
+            $wc[$key] = $value;
+            $word_cloud[] = $terms[$key];
+        }
+
+        /* Calculate similarity measure between centroid and each sentence */
+        $sim = array();
+        for($i = 0; $i < $n; $i++) {
+            $a = $b1 = $c1 = 0;
+            for($k = 0; $k < $t; $k++) {
+                $wik = $w[$i][$k];
+                $wck = $wc[$k];
+                $a += ($wik * $wck * $idf[$k]);
+                $b1 += ($wik * $wik);
+                $c1 += ($wck * $wck);
+            }
+            $d = sqrt($b1) * sqrt($c1);
+            $sim[$i] = ($d > 0) ? $a / $d : 0;
+        }
+        arsort($sim);
+
+        /* summarySentenceCount counts sentences until the description length
+           limit is reached, so the last counted sentence may overshoot it.
+           That sentence is only dropped when it really overshoots and at
+           least one sentence remains; always subtracting one lost a sentence
+           of short pages and emptied the summary of one sentence pages.
+        */
+        $top = (int)self::summarySentenceCount($sentences, $sim);
+        $sum_array = array_slice($sim, 0, $top, true);
+        $length = 0;
+        foreach($sum_array as $key => $value) {
+            $length += strlen($sentences[$key]);
+        }
+        if($top > 1 && $length > PageProcessor::$max_description_len) {
+            $sum_array = array_slice($sim, 0, $top - 1, true);
+        }
+        /* Output the chosen sentences in the order they have in the page */
+        ksort($sum_array);
+        $summary = '';
+        foreach($sum_array as $key => $value) {
+            $summary .= $sentences[$key].". ";
+        }
+        return array($summary, $word_cloud);
+    }
+    /**
+     *  Calculates how many sentences to put in the summary to match
+     *  PageProcessor::$max_description_len.
+     *
+     *  Sentences are taken in the order of $sim. A sentence is counted as
+     *  long as the total byte length of the sentences counted before it is
+     *  below the limit, so the last counted sentence may carry the total
+     *  past the limit.
+     *
+     *  @param array $sentences sentences of the page
+     *  @param array $sim scores of the sentences, keyed by the index of the
+     *      sentence in $sentences and ordered from best to worst
+     *
+     *  @return int how many leading entries of $sim were counted, 0 if $sim
+     *      is empty
+     */
+    static function summarySentenceCount($sentences, $sim)
+    {
+        $top = 0;
+        $count = 0;
+        foreach($sim as $key => $value) {
+            /* nothing more is counted once the limit is reached */
+            if($count >= PageProcessor::$max_description_len) {
+                break;
+            }
+            $count += strlen($sentences[$key]);
+            $top++;
+        }
+        return $top;
+    }
+    /**
+     *  Breaks any content into sentences by splitting it on a period that is
+     *    followed by whitespace or ends the content, and on runs of line
+     *    feeds or carriage returns
+     *  @param string $content complete page.
+     *  @return array array of sentences from that content; the separators
+     *    are not part of the sentences and empty pieces are left out.
+     */
+    static function getSentences($content)
+    {
+        /* The $ alternative strips the period of a sentence that ends the
+           content; a caller joining sentences with ". " would otherwise
+           output "..".
+         */
+        return preg_split('/\.(?:\s|$)|[\n\r]+/', $content, -1,
+            PREG_SPLIT_NO_EMPTY);
+    }
+    /**
+     *  Lower cases the given text and replaces every run of characters that
+     *    are not letters, digits or whitespace by one space; leading and
+     *    trailing whitespace is trimmed from the result
+     *  @param string $sent text to format.
+     *  @return string formatted text.
+     */
+    static function formatSentence($sent)
+    {
+        $lowered = mb_strtolower($sent);
+        $cleaned = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $lowered);
+        return trim($cleaned);
+    }
+
+    /**
+     *  Formats the document for computing the centroid: the text is lower
+     *    cased, line breaks become spaces, hyphens are deleted and so is
+     *    everything that is not a letter, whitespace or a period (digits
+     *    included, as digits are not used in the word cloud).
+     *  @param string $content formatted page.
+     *  @return string formatted document.
+     */
+    static function formatDoc($content)
+    {
+        $content = mb_strtolower($content);
+        /* A line break separates words, so it is turned into a space;
+           deleting it fused the last word of a line with the first word of
+           the next one.
+         */
+        $content = preg_replace('/[\n\r]+/', ' ', $content);
+        $substitute = array('/\-+/', '/[^\p{L}\s\.]+/u');
+        $content = preg_replace($substitute, '', $content);
+        return $content;
+    }
+
+    /**
+     *  This function does an additional processing on the page
+     *    such as removing all the tags from the page
+     *
+     *  Scripts, styles, a few named entities, carets, parentheses and
+     *    bracketed text are blanked out, br tags and the closing tags of
+     *    block level elements become line breaks, the remaining tags and
+     *    entities are dropped and runs of blank lines are collapsed.
+     *  @param string $page complete page.
+     *  @return string processed page.
+     */
+    static function pageProcessing($page)
+    {
+        /* NOTE(review): '/\t\n/' only matches a tab directly followed by a
+           line feed; a character class may have been intended - confirm.
+         */
+        $substitutions = array('@<script[^>]*?>.*?</script>@si',
+            '/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
+            '@<style[^>]*?>.*?</style>@si', '/[\^\(\)]/',
+            '/\[(.*?)\]/', '/\t\n/'
+        );
+        $page = preg_replace($substitutions, ' ', $page);
+        /* Collapse a run of whitespace to one space. Replacing it by the
+           empty string fused the words, and sentences, on either side.
+         */
+        $page = preg_replace('/\s{2,}/', ' ', $page);
+        $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page);
+        /* remember whether any br tag was turned into a line break */
+        $changed = false;
+        if($new_page != $page) {
+            $changed = true;
+            $page = $new_page;
+        }
+        $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|".
+            "p|address|section)\s*\>/", "\n\n", $page);
+        /* keep link text apart from the text in front of the link */
+        $page = preg_replace("/\<a/", " <a", $page);
+        $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/", " ", $page);
+        $page = strip_tags($page);
+        if($changed) {
+            $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page);
+        }
+        $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page);
+        $page = preg_replace("/\n\n\n+/", "\n\n", $page);
+        return $page;
+    }
+}
+?>
\ No newline at end of file
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 3bce0d381..61bf18672 100755
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -249,5 +249,9 @@ interface CrawlConstants
     const TOR_PROXY = 'dh';
     const PROXY_SERVERS = 'di';
     const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;
+    const BASIC_SUMMARIZER = 'dk';
+    const CENTROID_SUMMARIZER = 'dl';
+    const SUMMARIZER_OPTION = 'dm';
+    const WORD_CLOUD = 'dn';
 }
 ?>
\ No newline at end of file
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index c833b1386..c86f656aa 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -55,7 +55,10 @@ require_once BASE_DIR."/lib/processors/text_processor.php";
  * Load so can parse urls
  */
 require_once BASE_DIR."/lib/url_parser.php";
-
+/**
+* Get the centroid summary
+*/
+require_once BASE_DIR."/lib/centroid.php";
 /**
  * For guessing language from charset
  */
@@ -87,6 +90,17 @@ class HtmlProcessor extends TextProcessor
      *  @return array  a summary of the contents of the page
      *
      */
+    /**
+     * Returns the crawl parameters read from a crawl ini file. The lookup is
+     * the one HtmlProcessor inherits from TextProcessor, so it is reused
+     * here instead of being repeated.
+     *
+     * @param bool $use_default passed on unchanged to the inherited method
+     * @return array whatever the inherited method returns
+     */
+    function getSeedInfo($use_default = false)
+    {
+        return parent::getSeedInfo($use_default);
+    }
     function process($page, $url)
     {
         $summary = NULL;
@@ -103,8 +117,20 @@ class HtmlProcessor extends TextProcessor
                 if($summary[self::TITLE] == "") {
                     $summary[self::TITLE] = self::crudeTitle($dom_page);
                 }
-                $summary[self::DESCRIPTION] = self::description($dom,
+                $summarizer = $this->getSeedInfo();
+                $lang = self::lang($dom,
+                    $summary[self::TITLE], $url);
+                if($summarizer['general']['summarizer_option']==
+                        self::CENTROID_SUMMARIZER) {
+                    $summary_cloud =
+                        CentroidSummarizer::getCentroidSummary($dom_page,$lang);
+                    $summary[self::DESCRIPTION] = $summary_cloud[0];
+                    $summary[self::WORD_CLOUD] = $summary_cloud[1];
+                }
+                else {
+                    $summary[self::DESCRIPTION] = self::description($dom,
                     $dom_page);
+                }
                 if(trim($summary[self::DESCRIPTION]) == "") {
                     $summary[self::DESCRIPTION] = self::crudeDescription(
                         $dom_page);
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 2a8918e4b..8bd535290 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -52,6 +52,11 @@ $PAGE_PROCESSORS =  array_merge($PAGE_PROCESSORS, $add_types);
  */
 require_once BASE_DIR."/lib/processors/page_processor.php";

+/**
+* Get the centroid summary
+*/
+require_once BASE_DIR."/lib/centroid.php";
+
 /**
  * So can extract parts of the URL if need to guess lang
  */
@@ -80,14 +85,35 @@ class TextProcessor extends PageProcessor
      * @return array a summary of (title, description,links, and content) of
      *      the information in $page
      */
+    /**
+     * Reads the crawl parameters from an ini file: the crawl.ini of the
+     * work directory if it exists and defaults were not asked for, otherwise
+     * the default_crawl.ini of the configs folder.
+     *
+     * @param bool $use_default whether to read the default ini file even
+     *      when the work directory has a crawl.ini
+     * @return array result of parse_ini_with_fallback on the chosen file
+     */
+    function getSeedInfo($use_default = false)
+    {
+        $ini_file = WORK_DIRECTORY."/crawl.ini";
+        if($use_default || !file_exists($ini_file)) {
+            $ini_file = BASE_DIR."/configs/default_crawl.ini";
+        }
+        return parse_ini_with_fallback($ini_file);
+    }
     function process($page, $url)
     {
         $summary = NULL;
-
+        $summarizer = $this->getSeedInfo();
         if(is_string($page)) {
             $summary[self::TITLE] = "";
-            $summary[self::DESCRIPTION] = mb_substr($page, 0,
-                self::$max_description_len);
+            $lang = self::calculateLang($page);
+            if($summarizer['general']['summarizer_option']==
+                self::CENTROID_SUMMARIZER) {
+                $summary_cloud =
+                    CentroidSummarizer::getCentroidSummary($page,$lang);
+                $summary[self::DESCRIPTION] = $summary_cloud[0];
+                $summary[self::WORD_CLOUD] = $summary_cloud[1];
+            }
+            else {
+                $summary[self::DESCRIPTION] = mb_substr($page, 0,
+                    self::$max_description_len);
+            }
             $summary[self::LANG] = self::calculateLang(
                 $summary[self::DESCRIPTION]);
             $summary[self::LINKS] = self::extractHttpHttpsUrls($page);
diff --git a/locale/en-US/resources/tokenizer.php b/locale/en-US/resources/tokenizer.php
index 87c0b50bb..e5f7d2001 100755
--- a/locale/en-US/resources/tokenizer.php
+++ b/locale/en-US/resources/tokenizer.php
@@ -92,7 +92,125 @@ class EnTokenizer
     {
         return $pre_segment;
     }
-
+    /**
+     * Removes the stop words from the page
+     *
+     * The page is lower cased and every whole word found in the stop word
+     * list is deleted; the whitespace around a deleted word is kept.
+     *
+     * @param string $page the page to remove stop words from.
+     * @return string lower cased $page with no stop words
+     */
+    static function stopwordsRemover($page)
+    {
+        $stop_words = array('a','able','about','above','abst',
+        'accordance','according','based','accordingly','across','act',
+        'actually','added','adj','affected','affecting','affects','after',
+        'afterwards','again','against','ah','all','almost','alone','along',
+        'already','also','although','always','am','among','amongst','an','and',
+        'announce','another','any','anybody','anyhow','anymore','anyone',
+        'anything','anyway','anyways','anywhere','apparently','approximately',
+        'are','aren','arent','arise','around','as','aside','ask','asking','at',
+        'auth','available','away','awfully','b','back','be','became','because',
+        'become','becomes','becoming','been','before','beforehand','begin',
+        'beginning','beginnings','begins','behind','being','believe','below',
+        'beside','besides','between','beyond','biol','both','brief','briefly',
+        'but','by','c','ca','came','can','cannot','cant','cause','causes',
+        'certain','certainly','co','com','come','comes','contain','containing',
+        'contains','could','couldnt','d','date','did','didnt',
+        'different','do','does','doesnt','doing',
+        'done','dont','down','downwards',
+        'due','during','e','each','ed','edu','effect','eg','eight','eighty',
+        'either','else','elsewhere','end',
+        'ending','enough','especially','et',
+        'et-al','etc','even','ever','every',
+        'everybody','everyone','everything'
+        ,'everywhere','ex','except','f','far','few','ff','fifth','first',
+        'five','fix','followed','following','follows','for','former',
+        'formerly','forth','found','four','from','further','furthermore',
+        'g','gave','get','gets','getting','give','given','gives','giving','go',
+        'goes','gone','got','gotten','h','had','happens','hardly','has','hasnt',
+        'have','havent','having','he','hed','hence','her','here','hereafter',
+        'hereby','herein','heres','hereupon','hers','herself','hes','hi','hid',
+        'him','himself','his','hither','home','how','howbeit',
+        'however','hundred','i','id','ie','if','ill',
+        'im','immediate','immediately',
+        'importance','important','in','inc','indeed','index','information',
+        'instead','into','invention','inward','is','isnt','it','itd','itll',
+        'its','itself','ive','j','just','k','keep','keeps',
+        'kept','kg','km','know',
+        'known','knows','l','largely','last','lately',
+        'later','latter','latterly',
+        'least','less','lest','let','lets','like','liked','likely','line',
+        'little','ll','look','looking','looks','ltd','m','made','mainly','make',
+        'makes','many','may','maybe','me','mean','means','meantime','meanwhile',
+        'merely','mg','might','million','miss','ml','more','moreover','most',
+        'mostly','mr','mrs','much','mug','must','my','myself','n','na','name',
+        'namely','nay','nd','near','nearly','necessarily','necessary','need',
+        'needs','neither','never','nevertheless','new','next',
+        'nine','ninety','no',
+        'nobody','non','none','nonetheless','noone',
+        'nor','normally','nos','not',
+        'noted','nothing','now','nowhere','o','obtain',
+        'obtained','obviously','of',
+        'off','often','oh','ok','okay','old','omitted','on','once','one','ones',
+        'only','onto','or','ord','other','others',
+        'otherwise','ought','our','ours',
+        'ourselves','out','outside','over','overall','owing','own','p','page',
+        'pages','part','particular','particularly',
+        'past','per','perhaps','placed',
+        'please','plus','poorly','possible','possibly','potentially','pp',
+        'predominantly','present','previously',
+        'primarily','probably','promptly',
+        'proud','provides','put','q','que','quickly','quite','qv','r','ran',
+        'rather','rd','re','readily','really','recent','recently','ref','refs',
+        'regarding','regardless','regards','related','relatively','research',
+        'respectively','resulted','resulting',
+        'results','right','run','s','said',
+        'same','saw','say','saying','says','sec',
+        'section','see','seeing','seem',
+        'seemed','seeming','seems',
+        'seen','self','selves','sent','seven','several',
+        'shall','she','shed','shell',
+        'shes','should','shouldnt','show','showed','shown','showns','shows',
+        'significant','significantly','similar','similarly','since',
+        'six','slightly',
+        'so','some','somebody','somehow','someone','somethan',
+        'something','sometime',
+        'sometimes','somewhat','somewhere','soon',
+        'sorry','specifically','specified',
+        'specify','specifying','still','stop','strongly','sub','substantially',
+        'successfully','such','sufficiently','suggest','sup','sure','t','take',
+        'taken','taking','tell','tends','th','than',
+        'thank','thanks','thanx','that',
+        'thatll','thats','thatve','the','their',
+        'theirs','them','themselves','then',
+        'thence','there','thereafter','thereby','thered','therefore','therein',
+        'therell','thereof','therere','theres','thereto','thereupon','thereve',
+        'these','they','theyd','theyll','theyre',
+        'theyve','think','this','those',
+        'thou','though','thoughh','thousand','throug',
+        'through','throughout','thru',
+        'thus','til','tip','to','together','too',
+        'took','toward','towards','tried',
+        'tries','truly','try','trying','ts','twice','two','u','un','under',
+        'unfortunately','unless','unlike','unlikely','until','unto','up','upon',
+        'ups','us','use','used','useful','usefully','usefulness','uses','using',
+        'usually','v','value','various','ve','very',
+        'via','viz','vol','vols','vs',
+        'w','want','wants','was','wasnt','way','we',
+        'wed','welcome','well','went',
+        'were','werent','weve','what','whatever',
+        'whatll','whats','when','whence',
+        'whenever','where','whereafter','whereas','whereby','wherein','wheres',
+        'whereupon','wherever','whether','which','while','whim','whither','who',
+        'whod','whoever','whole','wholl','whom','whomever','whos','whose','why',
+        'widely','willing','wish','with','within',
+        'without','wont','words','world',
+        'would','wouldnt','www','x','y','yes','yet','you','youd','youll','your',
+        'youre','yours','yourself','yourselves','youve','z','zero');
+
+        /* The stop word list is lower case, so lower case the page first.
+           The u modifier makes \b UTF-8 aware: without it the bytes of a
+           non-ASCII letter count as word boundaries, so a stop word such as
+           "ve" was cut out of a word when it followed an accented letter.
+         */
+        $page = mb_strtolower($page);
+        $pattern = '/\b('.implode('|', $stop_words).')\b/';
+        $cleaned = preg_replace($pattern.'u', '', $page);
+        if($cleaned === NULL) {
+            /* the UTF-8 aware match failed, most likely because $page is
+               not valid UTF-8; fall back to matching byte by byte */
+            $cleaned = preg_replace($pattern, '', $page);
+        }
+        return $cleaned;
+    }
     /**
      * Computes the stem of an English word
      *
diff --git a/locale/zh-CN/resources/tokenizer.php b/locale/zh-CN/resources/tokenizer.php
index c4e5e3223..e5f20c0ab 100755
--- a/locale/zh-CN/resources/tokenizer.php
+++ b/locale/zh-CN/resources/tokenizer.php
@@ -43,6 +43,16 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 class ZhTokenizer
 {
+    /**
+     * Stop word removal for Chinese. No stop word list is applied here:
+     * the page is returned exactly as it was passed in.
+     * @param string $page the page to remove stop words from.
+     * @return string $page unchanged
+     */
+    static function stopwordsRemover($page)
+    {
+        return $page;
+    }
+
     static function segment($pre_segment)
     {
         return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN");
diff --git a/models/crawl_model.php b/models/crawl_model.php
index 14d36cf19..42eda1de7 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -460,6 +460,9 @@ class CrawlModel extends ParallelModel implements CrawlConstants
         if(!isset($info["general"]["cache_pages"])) {
             $info["general"]["cache_pages"] = true;
         }
+        if(!isset($info["general"]["summarizer_option"])) {
+            $info["general"]["summarizer_option"] = "";
+            }
         $n = array();
         $n[] = <<<EOT
 ; ***** BEGIN LICENSE BLOCK *****
@@ -496,6 +499,8 @@ EOT;
         }
         $n[] = '[general]';
         $n[] = "crawl_order = '".$info['general']['crawl_order']."';";
+        $n[] = "summarizer_option = '".
+            $info['general']['summarizer_option']."';";
         $n[] = "crawl_type = '".$info['general']['crawl_type']."';";
         $n[] = "crawl_index = '".$info['general']['crawl_index']."';";
         $n[] = 'arc_dir = "'.$info["general"]["arc_dir"].'";';
@@ -611,6 +616,8 @@ EOT;
                 "crawl_index" => array(self::CRAWL_INDEX, ''),
                 "crawl_order" => array(self::CRAWL_ORDER,
                     self::PAGE_IMPORTANCE),
+                "summarizer_option" => array(self::SUMMARIZER_OPTION,
+                    self::BASIC_SUMMARIZER),
                 "arc_dir" => array(self::ARC_DIR, ''),
                 "arc_type" => array(self::ARC_TYPE, ''),
                 "cache_pages" => array(self::CACHE_PAGES, true),
diff --git a/views/elements/crawloptions_element.php b/views/elements/crawloptions_element.php
index a3d1941c8..01152b095 100644
--- a/views/elements/crawloptions_element.php
+++ b/views/elements/crawloptions_element.php
@@ -123,6 +123,13 @@ class CrawloptionsElement extends Element
                     name="restrict_sites_by_url" value="true"
                     onclick="setDisplay('toggle', this.checked)" <?php
                     e($data['TOGGLE_STATE']); ?> /></div>
+        <div class="top-margin"><label for="summarizer"><b><?php
+            e(tl('crawloptions_element_summarizer'))?></b></label><?php
+                $this->view->helper("options")->render("summarizer",
+                "summarizer_option",$data['available_summarizers'],
+                $data['summarizer_option']);
+            ?>
+        </div>
         <div id="toggle">
             <div class="top-margin"><label for="allowed-sites"><b><?php
             e(tl('crawloptions_element_allowed_to_crawl'))?></b></label></div>
diff --git a/views/search_view.php b/views/search_view.php
index 1cba33af5..9a3be9b44 100755
--- a/views/search_view.php
+++ b/views/search_view.php
@@ -245,7 +245,6 @@ class SearchView extends View implements CrawlConstants
                     continue;
                 }
                 ?>
-
                 <h2>
                 <?php
                     if(strpos($link_url, self::GIT_EXTENSION)) { ?>
@@ -280,9 +279,24 @@ class SearchView extends View implements CrawlConstants
                         $data['VIDEO_SOURCES'], $data["OPEN_IN_TABS"]);
                 }
                 ?>
-                <p class="echo-link" <?php e($subtitle); ?>><?php
+                <p><span class="echo-link"<?php e($subtitle); ?>><?php
                     e(UrlParser::simplifyUrl($url, 100)." ");
-                ?></p>
+                ?></span>
+                <?php if(isset($page[self::WORD_CLOUD])) {
+                    $cloud = $page[self::WORD_CLOUD];
+                    $i = 1;
+                    e("<span class='tab'>Word cloud:</span>");
+                        foreach($cloud as $word) {?>
+                        <span class="wordcloud">
+                        <a class='wordcloud<?php e($i)?>'
+                        href="?<?php e(CSRF_TOKEN."=".$data[CSRF_TOKEN]);
+                            ?>&amp;its=<?php e($data['its']);?>
+                            &amp;q=<?php e($word);?>"><?php
+                        e($this->helper("displayresults")->
+                            render($word)."</span></a>");
+                        $i++;
+                        }
+                    }?></p>
                 <?php if(!isset($page[self::ROBOT_METAS]) ||
                     !in_array("NOSNIPPET", $page[self::ROBOT_METAS])) {
                         $description = isset($page[self::DESCRIPTION]) ?

ViewGit