viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/bin/fetcher.php b/bin/fetcher.php index f3b701d15..d71e8e429 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -313,7 +313,12 @@ class Fetcher implements CrawlConstants * @var string */ var $crawl_order; - + /** + * Stores the name of the crawler used for crawling. + * Possible values are Basic and Centroid + * @var string + */ + var $summarizer_option; /** * Indicates the kind of crawl being performed: self::WEB_CRAWL indicates * a new crawl of the web; self::ARCHIVE_CRAWL indicates a crawl of an @@ -547,6 +552,7 @@ class Fetcher implements CrawlConstants //we will get the correct crawl order from a queue_server $this->crawl_order = self::PAGE_IMPORTANCE; + $this->summarizer_option = self::CENTROID_SUMMARIZER; } /** @@ -1320,6 +1326,7 @@ class Fetcher implements CrawlConstants $info[self::CURRENT_SERVER] = $this->current_server; } $update_fields = array(self::CRAWL_TYPE => "crawl_type", + self::SUMMARIZER_OPTION => "summarizer_option", self::CRAWL_INDEX => "crawl_index", self::CRAWL_ORDER => 'crawl_order', self::CACHE_PAGES => 'cache_pages', self::INDEXED_FILE_TYPES => 'indexed_file_types', @@ -1774,6 +1781,11 @@ class Fetcher implements CrawlConstants $site[self::HASH] = FetchUrl::computePageHash( $site[self::PAGE]); } + if(isset($doc_info[self::WORD_CLOUD])) { + $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD]; + } else { + $site[self::WORD_CLOUD] = NULL; + } if(isset($doc_info[self::CRAWL_DELAY])) { $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY]; } @@ -1844,6 +1856,10 @@ class Fetcher implements CrawlConstants $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS]; } + if(isset($site[self::DOC_INFO][self::WORD_CLOUD])) { + $summarized_site_pages[$i][self::WORD_CLOUD] = + $site[self::DOC_INFO][self::WORD_CLOUD]; + } if(isset($site[self::DOC_INFO][self::THUMB])) { $summarized_site_pages[$i][self::THUMB] = $site[self::DOC_INFO][self::THUMB]; diff --git a/configs/config.php b/configs/config.php index 222c0d5d0..307879a4f 100755 --- a/configs/config.php +++ b/configs/config.php @@ -106,7 +106,7 @@ if(MAINTENANCE_MODE && $_SERVER["SERVER_ADDR"] != $_SERVER["REMOTE_ADDR"]) { if(!defined('WORK_DIRECTORY')) { /*+++ The next block of code is machine edited, change at your own risk, please use configure web page instead +++*/ -define('WORK_DIRECTORY', ''); +define('WORK_DIRECTORY', 'C:/xampp/htdocs/yioop_data'); /*++++++*/ // end machine edited code } diff --git a/controllers/components/crawl_component.php b/controllers/components/crawl_component.php index 31fffbfc1..21a085d4e 100644 --- a/controllers/components/crawl_component.php +++ b/controllers/components/crawl_component.php @@ -463,6 +463,19 @@ class CrawlComponent extends Component implements CrawlConstants tl('crawl_component_breadth_first'), self::PAGE_IMPORTANCE => tl('crawl_component_page_importance')); + $data['available_summarizers'] = array( + self::BASIC_SUMMARIZER => + tl('crawl_component_basic'), + self::CENTROID_SUMMARIZER => + tl('crawl_component_centroid')); + if(!$no_further_changes && isset($_REQUEST['summarizer_option']) + && in_array($_REQUEST['summarizer_option'], + array_keys($data['available_summarizers']))) { + $seed_info['general']['summarizer_option'] = + $_REQUEST['summarizer_option']; + $update_flag = true; + } + $data['summarizer_option'] = $seed_info['general']['summarizer_option']; if(!$no_further_changes && isset($_REQUEST['crawl_order']) && in_array($_REQUEST['crawl_order'], diff --git a/css/search.css b/css/search.css index 14b6b0f1b..054753581 100755 --- a/css/search.css +++ b/css/search.css @@ -61,7 +61,52 @@ p { color: green; } +span.tab +{ + margin-left:1em; + font-weight: bold; + color : gray; +} + +.wordcloud .wordcloud1 +{ + font-size:16pt; + color: #008800; + text-decoration: none; +} + +.wordcloud .wordcloud2 +{ + font-size:15pt; + color: #009900; + text-decoration: none; +} + +.wordcloud .wordcloud3 +{ + font-size:14pt; + color: #00AA00; + text-decoration: none; +} +.wordcloud .wordcloud4 +{ + font-size:13pt; + color: #00BB00; + text-decoration: none; +} + +.wordcloud .wordcloud5 +{ + font-size:12pt; + color: #00CC00; + text-decoration: none; +} + +.wordcloud:hover +{ + text-decoration: underline; +} .hidden { @@ -1115,6 +1160,7 @@ ul.in-list li /* Styles for search and search result pages */ + .html-ltr .serp { left: 2.2in; diff --git a/lib/centroid.php b/lib/centroid.php new file mode 100644 index 000000000..874d0b823 --- /dev/null +++ b/lib/centroid.php @@ -0,0 +1,295 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2014 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Mangesh Dahale mangeshadahale@gmail.com + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2014 + * @filesource + */ +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} +/** + * Reads in constants used as enums used for storing web sites + */ +require_once BASE_DIR."/lib/crawl_constants.php"; +/** + * Contains the max_description_length for the summary + */ +require_once BASE_DIR."/lib/processors/page_processor.php"; +/** + * Contains function getTokenizer to get the object of the language specified. + */ +require_once BASE_DIR."/lib/phrase_parser.php"; +/** + * Load in locale specific tokenizing code + */ +foreach(glob(LOCALE_DIR."/*/resources/tokenizer.php") + as $filename) { + require_once $filename; +} + +class CentroidSummarizer +{ + /** + * Generates a centroid with which every sentence is ranked with cosine + * ranking method and also generates a word cloud. + * @param string $doc complete raw page to generate the summary from. + * @param string $lang language of the page to decide which stop words to + * call proper tokenizer.php of the specified language. + * + * @return array array of summary and word cloud + */ + static function getCentroidSummary($doc, $lang) + { + $doc = self::pageProcessing($doc); + /* Format the document to remove characters other than periods and + alphanumerics. + */ + $formatted_doc = self::formatDoc($doc); + $stop_obj = PhraseParser::getTokenizer($lang); + if($stop_obj != NULL) { + $doc_stop = $stop_obj -> stopwordsRemover($doc); + } else { + $doc_stop = $doc; + } + + /* Spliting into sentences */ + $sentences = self::getSentences($doc); + $n = count($sentences); + + /* Spliting into terms */ + $doc_st = self::formatSentence($doc_stop); + $term = preg_split("/[\s,]+/u", $doc_st, -1, PREG_SPLIT_NO_EMPTY); + $terms = array_unique($term); + sort($terms); + $t = count($terms); + + /* Initialize term-frequecy array */ + $tf = array(); + $tf = array_fill(0, $t, array_fill(0, $n, 0)); + + /* Initialize Nk array(Number of Documents the term occurs) */ + $nk = array(); + $nk = array_fill(0, $t, 0); + + /* Count TF for each word */ + for($i=0;$i<$n;$i++) { + for($j=0;$j<$t;$j++) { + $nt = substr_count($sentences[$i], $terms[$j]); + $tf[$i][$j] = 1+ log($nt); + if($nt!=0) { + $nk[$j]++; + } + else { + $tf[$i][$j] = 0; + } + } + } + /* Calculate weights of each term for every sentence */ + $w = array(); + $idf = array(); + $idf_temp = 0; + + for($k=0; $k<$t; $k++) { + $idf_temp = @($n/$nk[$k]); + if($nk[$k] == 0) { + $idf_temp = 0; + } + $tmp = @log($idf_temp); + $idf[$k] = $tmp; + for($i = 0; $i < $n; $i++) { + if($tmp >= 0 && $tmp < 10) { + $w[$i][$k] = $tf[$i][$k] * $tmp; + } else { + $w[$i][$k] = 0; + } + + } + } + + /* Count TF for finding centroid */ + $doc_centroid = preg_replace('/[\.]+/', ' ', $formatted_doc); + $wc = array(); + for($j=0; $j<$t; $j++) { + $nt = preg_match_all('/\b'.$terms[$j].'\b/', $doc_centroid); + $tfc[$j] = 1 + log($nt); + $wc[$j] = $tfc[$j] * $idf[$j]; + if(is_nan($wc[$j]) || is_infinite($wc[$j])) + $wc[$j] = 0; + } + /* Calculate centroid */ + arsort($wc); + $centroid = array(); + $centroid = array_slice($wc, 0, 5, true); + + /* Initializing centroid weight array by 0 */ + $wc = array_fill(0, $t, 0); + + /* Word cloud */ + $i = 0; + $word_cloud = array(); + foreach($centroid as $key => $value) { + $wc[$key] = $value; + $word_cloud[$i] = $terms[$key]; + $i++; + } + ksort($wc); + + /* Calculate similarity measure between centroid and each sentence */ + $sim=array(); + for($i=0; $i < $n; $i++) { + $a = $b1 = $b2 = $c1 = $c2 = $d = 0; + for($k=0; $k<$t; $k++) { + $wik = $w[$i][$k]; + $wck = $wc[$k]; + $idfk = $idf[$k]; + $a += ($wik * $wck * $idfk); + $b1 += ($wik * $wik); + $c1 += ($wck * $wck); + } + $b2 = sqrt($b1); + $c2 = sqrt($c1); + $d = $b2 * $c2; + $sim[$i] = @($a / $d); + if($d == 0) { + $sim[$i] = 0; + } + } + arsort($sim); + /* Getting how many sentences should be there in summary */ + $top = self::summarySentenceCount($sentences, $sim); + $sum_array = array(); + $sum_array = array_slice($sim, 0, $top-1, true); + ksort($sum_array); + /* Printing Summary */ + $summary = ''; + $d = null; + foreach($sum_array as $key => $value) { + $summary .= "$sentences[$key]".". "; + } + + /* Summary of text summarization */ + $words = explode(" ", $doc); + $sum_words = explode(" ", $summary); + return array($summary, $word_cloud); + } + /** + * Calculates how many sentences to put in the summary to match the + * MAX_DESCRIPTION_LEN. + * @param string $doc complete raw page to generate the summary from. + * @param string $lang language of the page to decide which stop words to + * call proper tokenizer.php of the specified language. + * + * @return array array of summary and word cloud + */ + static function summarySentenceCount($sentences, $sim) + { + $top = null; + $count = 0; + foreach($sim as $key => $value) + { + if($count < PageProcessor::$max_description_len) { + $count += strlen($sentences[$key]); + $top++; + } + } + return $top; + } + /** + * Breaks any content into sentences by splitting it on spaces or carriage + * returns + * @param string $content complete page. + * @return array array of sentences from that content. + */ + static function getSentences($content) + { + $content = preg_split("/\.\s|[\n\r]+/", $content, -1, + PREG_SPLIT_NO_EMPTY); + return $content; + } + /** + * Formats the sentences to remove all characters except words, + * digits and spaces + * @param string $sent complete page. + * @return string formatted sentences. + */ + static function formatSentence($sent) + { + $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u', + ' ', mb_strtolower($sent))); + return $sent; + } + + /** + * Formats the document to remove carriage returns, hyphens and digits + * as we will not be using digits in word cloud. + * formatted document generated by this function is only used to compute + * centroid. + * @param string $content formatted page. + * @return string formatted document. + */ + static function formatDoc($content) + { + $substitute = array('/[\n\r\-]+/', '/[^\p{L}\s\.]+/u'); + $content = preg_replace($substitute, '', mb_strtolower($content)); + return $content; + } + + /** + * This function does an additional processing on the page + * such as removing all the tags from the page + * @param string $page complete page. + * @return string processed page. + */ + static function pageProcessing($page) + { + $substitutions = array('@<script[^>]*?>.*?</script>@si', + '/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si', + '@<style[^>]*?>.*?</style>@si', '/[\^\(\)]/', + '/\[(.*?)\]/', '/\t\n/' + ); + $page = preg_replace($substitutions, ' ', $page); + $page = preg_replace('/\s{2,}/', '', $page); + $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page); + $changed = false; + if($new_page != $page) { + $changed = true; + $page = $new_page; + } + $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|". + "p|address|section)\s*\>/", "\n\n", $page); + $page = preg_replace("/\<a/", " <a", $page); + $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/", " ", $page); + $page = strip_tags($page); + if($changed) { + $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page); + } + $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page); + $page = preg_replace("/\n\n\n+/", "\n\n", $page); + return $page; + } +} +?> \ No newline at end of file diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index 3bce0d381..61bf18672 100755 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -249,5 +249,9 @@ interface CrawlConstants const TOR_PROXY = 'dh'; const PROXY_SERVERS = 'di'; const NEEDS_OFFSET_FLAG = 0x7FFFFFFF; + const BASIC_SUMMARIZER = 'dk'; + const CENTROID_SUMMARIZER = 'dl'; + const SUMMARIZER_OPTION = 'dm'; + const WORD_CLOUD = 'dn'; } ?> \ No newline at end of file diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index c833b1386..c86f656aa 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -55,7 +55,10 @@ require_once BASE_DIR."/lib/processors/text_processor.php"; * Load so can parse urls */ require_once BASE_DIR."/lib/url_parser.php"; - +/** +* Get the centroid summary +*/ +require_once BASE_DIR."/lib/centroid.php"; /** * For guessing language from charset */ @@ -87,6 +90,17 @@ class HtmlProcessor extends TextProcessor * @return array a summary of the contents of the page * */ + function getSeedInfo($use_default = false) + { + if(file_exists(WORK_DIRECTORY."/crawl.ini") && !$use_default) { + $info = parse_ini_with_fallback(WORK_DIRECTORY."/crawl.ini"); + } else { + $info = parse_ini_with_fallback( + BASE_DIR."/configs/default_crawl.ini"); + } + + return $info; + } function process($page, $url) { $summary = NULL; @@ -103,8 +117,20 @@ class HtmlProcessor extends TextProcessor if($summary[self::TITLE] == "") { $summary[self::TITLE] = self::crudeTitle($dom_page); } - $summary[self::DESCRIPTION] = self::description($dom, + $summarizer = $this->getSeedInfo(); + $lang = self::lang($dom, + $summary[self::TITLE], $url); + if($summarizer['general']['summarizer_option']== + self::CENTROID_SUMMARIZER) { + $summary_cloud = + CentroidSummarizer::getCentroidSummary($dom_page,$lang); + $summary[self::DESCRIPTION] = $summary_cloud[0]; + $summary[self::WORD_CLOUD] = $summary_cloud[1]; + } + else { + $summary[self::DESCRIPTION] = self::description($dom, $dom_page); + } if(trim($summary[self::DESCRIPTION]) == "") { $summary[self::DESCRIPTION] = self::crudeDescription( $dom_page); diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index 2a8918e4b..8bd535290 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -52,6 +52,11 @@ $PAGE_PROCESSORS = array_merge($PAGE_PROCESSORS, $add_types); */ require_once BASE_DIR."/lib/processors/page_processor.php"; +/** +* Get the centroid summary +*/ +require_once BASE_DIR."/lib/centroid.php"; + /** * So can extract parts of the URL if need to guess lang */ @@ -80,14 +85,35 @@ class TextProcessor extends PageProcessor * @return array a summary of (title, description,links, and content) of * the information in $page */ + function getSeedInfo($use_default = false) + { + if(file_exists(WORK_DIRECTORY."/crawl.ini") && !$use_default) { + $info = parse_ini_with_fallback(WORK_DIRECTORY."/crawl.ini"); + } else { + $info = parse_ini_with_fallback( + BASE_DIR."/configs/default_crawl.ini"); + } + + return $info; + } function process($page, $url) { $summary = NULL; - + $summarizer = $this->getSeedInfo(); if(is_string($page)) { $summary[self::TITLE] = ""; - $summary[self::DESCRIPTION] = mb_substr($page, 0, - self::$max_description_len); + $lang = self::calculateLang($page); + if($summarizer['general']['summarizer_option']== + self::CENTROID_SUMMARIZER) { + $summary_cloud = + CentroidSummarizer::getCentroidSummary($page,$lang); + $summary[self::DESCRIPTION] = $summary_cloud[0]; + $summary[self::WORD_CLOUD] = $summary_cloud[1]; + } + else { + $summary[self::DESCRIPTION] = mb_substr($page, 0, + self::$max_description_len); + } $summary[self::LANG] = self::calculateLang( $summary[self::DESCRIPTION]); $summary[self::LINKS] = self::extractHttpHttpsUrls($page); diff --git a/locale/en-US/resources/tokenizer.php b/locale/en-US/resources/tokenizer.php index 87c0b50bb..e5f7d2001 100755 --- a/locale/en-US/resources/tokenizer.php +++ b/locale/en-US/resources/tokenizer.php @@ -92,7 +92,125 @@ class EnTokenizer { return $pre_segment; } - + /** + * Removes the stop words from the page + * + * @param string $page the page to remove stop words from. + * @return string $page with no stop words + */ + static function stopwordsRemover($page) + { + $stop_words = array('a','able','about','above','abst', + 'accordance','according','based','accordingly','across','act', + 'actually','added','adj','affected','affecting','affects','after', + 'afterwards','again','against','ah','all','almost','alone','along', + 'already','also','although','always','am','among','amongst','an','and', + 'announce','another','any','anybody','anyhow','anymore','anyone', + 'anything','anyway','anyways','anywhere','apparently','approximately', + 'are','aren','arent','arise','around','as','aside','ask','asking','at', + 'auth','available','away','awfully','b','back','be','became','because', + 'become','becomes','becoming','been','before','beforehand','begin', + 'beginning','beginnings','begins','behind','being','believe','below', + 'beside','besides','between','beyond','biol','both','brief','briefly', + 'but','by','c','ca','came','can','cannot','cant','cause','causes', + 'certain','certainly','co','com','come','comes','contain','containing', + 'contains','could','couldnt','d','date','did','didnt', + 'different','do','does','doesnt','doing', + 'done','dont','down','downwards', + 'due','during','e','each','ed','edu','effect','eg','eight','eighty', + 'either','else','elsewhere','end', + 'ending','enough','especially','et', + 'et-al','etc','even','ever','every', + 'everybody','everyone','everything' + ,'everywhere','ex','except','f','far','few','ff','fifth','first', + 'five','fix','followed','following','follows','for','former', + 'formerly','forth','found','four','from','further','furthermore', + 'g','gave','get','gets','getting','give','given','gives','giving','go', + 'goes','gone','got','gotten','h','had','happens','hardly','has','hasnt', + 'have','havent','having','he','hed','hence','her','here','hereafter', + 'hereby','herein','heres','hereupon','hers','herself','hes','hi','hid', + 'him','himself','his','hither','home','how','howbeit', + 'however','hundred','i','id','ie','if','ill', + 'im','immediate','immediately', + 'importance','important','in','inc','indeed','index','information', + 'instead','into','invention','inward','is','isnt','it','itd','itll', + 'its','itself','ive','j','just','k','keep','keeps', + 'kept','kg','km','know', + 'known','knows','l','largely','last','lately', + 'later','latter','latterly', + 'least','less','lest','let','lets','like','liked','likely','line', + 'little','ll','look','looking','looks','ltd','m','made','mainly','make', + 'makes','many','may','maybe','me','mean','means','meantime','meanwhile', + 'merely','mg','might','million','miss','ml','more','moreover','most', + 'mostly','mr','mrs','much','mug','must','my','myself','n','na','name', + 'namely','nay','nd','near','nearly','necessarily','necessary','need', + 'needs','neither','never','nevertheless','new','next', + 'nine','ninety','no', + 'nobody','non','none','nonetheless','noone', + 'nor','normally','nos','not', + 'noted','nothing','now','nowhere','o','obtain', + 'obtained','obviously','of', + 'off','often','oh','ok','okay','old','omitted','on','once','one','ones', + 'only','onto','or','ord','other','others', + 'otherwise','ought','our','ours', + 'ourselves','out','outside','over','overall','owing','own','p','page', + 'pages','part','particular','particularly', + 'past','per','perhaps','placed', + 'please','plus','poorly','possible','possibly','potentially','pp', + 'predominantly','present','previously', + 'primarily','probably','promptly', + 'proud','provides','put','q','que','quickly','quite','qv','r','ran', + 'rather','rd','re','readily','really','recent','recently','ref','refs', + 'regarding','regardless','regards','related','relatively','research', + 'respectively','resulted','resulting', + 'results','right','run','s','said', + 'same','saw','say','saying','says','sec', + 'section','see','seeing','seem', + 'seemed','seeming','seems', + 'seen','self','selves','sent','seven','several', + 'shall','she','shed','shell', + 'shes','should','shouldnt','show','showed','shown','showns','shows', + 'significant','significantly','similar','similarly','since', + 'six','slightly', + 'so','some','somebody','somehow','someone','somethan', + 'something','sometime', + 'sometimes','somewhat','somewhere','soon', + 'sorry','specifically','specified', + 'specify','specifying','still','stop','strongly','sub','substantially', + 'successfully','such','sufficiently','suggest','sup','sure','t','take', + 'taken','taking','tell','tends','th','than', + 'thank','thanks','thanx','that', + 'thatll','thats','thatve','the','their', + 'theirs','them','themselves','then', + 'thence','there','thereafter','thereby','thered','therefore','therein', + 'therell','thereof','therere','theres','thereto','thereupon','thereve', + 'these','they','theyd','theyll','theyre', + 'theyve','think','this','those', + 'thou','though','thoughh','thousand','throug', + 'through','throughout','thru', + 'thus','til','tip','to','together','too', + 'took','toward','towards','tried', + 'tries','truly','try','trying','ts','twice','two','u','un','under', + 'unfortunately','unless','unlike','unlikely','until','unto','up','upon', + 'ups','us','use','used','useful','usefully','usefulness','uses','using', + 'usually','v','value','various','ve','very', + 'via','viz','vol','vols','vs', + 'w','want','wants','was','wasnt','way','we', + 'wed','welcome','well','went', + 'were','werent','weve','what','whatever', + 'whatll','whats','when','whence', + 'whenever','where','whereafter','whereas','whereby','wherein','wheres', + 'whereupon','wherever','whether','which','while','whim','whither','who', + 'whod','whoever','whole','wholl','whom','whomever','whos','whose','why', + 'widely','willing','wish','with','within', + 'without','wont','words','world', + 'would','wouldnt','www','x','y','yes','yet','you','youd','youll','your', + 'youre','yours','yourself','yourselves','youve','z','zero'); + + $page = preg_replace('/\b('.implode('|',$stop_words).')\b/', '', + mb_strtolower($page)); + return $page; + } /** * Computes the stem of an English word * diff --git a/locale/zh-CN/resources/tokenizer.php b/locale/zh-CN/resources/tokenizer.php index c4e5e3223..e5f20c0ab 100755 --- a/locale/zh-CN/resources/tokenizer.php +++ b/locale/zh-CN/resources/tokenizer.php @@ -43,6 +43,16 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} class ZhTokenizer { + /** + * Removes the stop words from the page + * @param string $page the page to remove stop words from. + * @return string $page with no stop words + */ + static function stopwordsRemover($page) + { + return $page; + } + static function segment($pre_segment) { return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN"); diff --git a/models/crawl_model.php b/models/crawl_model.php index 14d36cf19..42eda1de7 100755 --- a/models/crawl_model.php +++ b/models/crawl_model.php @@ -460,6 +460,9 @@ class CrawlModel extends ParallelModel implements CrawlConstants if(!isset($info["general"]["cache_pages"])) { $info["general"]["cache_pages"] = true; } + if(!isset($info["general"]["summarizer_option"])) { + $info["general"]["summarizer_option"] = ""; + } $n = array(); $n[] = <<<EOT ; ***** BEGIN LICENSE BLOCK ***** @@ -496,6 +499,8 @@ EOT; } $n[] = '[general]'; $n[] = "crawl_order = '".$info['general']['crawl_order']."';"; + $n[] = "summarizer_option = '". + $info['general']['summarizer_option']."';"; $n[] = "crawl_type = '".$info['general']['crawl_type']."';"; $n[] = "crawl_index = '".$info['general']['crawl_index']."';"; $n[] = 'arc_dir = "'.$info["general"]["arc_dir"].'";'; @@ -611,6 +616,8 @@ EOT; "crawl_index" => array(self::CRAWL_INDEX, ''), "crawl_order" => array(self::CRAWL_ORDER, self::PAGE_IMPORTANCE), + "summarizer_option" => array(self::SUMMARIZER_OPTION, + self::BASIC_SUMMARIZER), "arc_dir" => array(self::ARC_DIR, ''), "arc_type" => array(self::ARC_TYPE, ''), "cache_pages" => array(self::CACHE_PAGES, true), diff --git a/views/elements/crawloptions_element.php b/views/elements/crawloptions_element.php index a3d1941c8..01152b095 100644 --- a/views/elements/crawloptions_element.php +++ b/views/elements/crawloptions_element.php @@ -123,6 +123,13 @@ class CrawloptionsElement extends Element name="restrict_sites_by_url" value="true" onclick="setDisplay('toggle', this.checked)" <?php e($data['TOGGLE_STATE']); ?> /></div> + <div class="top-margin"><label for="summarizer"><b><?php + e(tl('crawloptions_element_summarizer'))?></b></label><?php + $this->view->helper("options")->render("summarizer", + "summarizer_option",$data['available_summarizers'], + $data['summarizer_option']); + ?> + </div> <div id="toggle"> <div class="top-margin"><label for="allowed-sites"><b><?php e(tl('crawloptions_element_allowed_to_crawl'))?></b></label></div> diff --git a/views/search_view.php b/views/search_view.php index 1cba33af5..9a3be9b44 100755 --- a/views/search_view.php +++ b/views/search_view.php @@ -245,7 +245,6 @@ class SearchView extends View implements CrawlConstants continue; } ?> - <h2> <?php if(strpos($link_url, self::GIT_EXTENSION)) { ?> @@ -280,9 +279,24 @@ class SearchView extends View implements CrawlConstants $data['VIDEO_SOURCES'], $data["OPEN_IN_TABS"]); } ?> - <p class="echo-link" <?php e($subtitle); ?>><?php + <p><span class="echo-link"<?php e($subtitle); ?>><?php e(UrlParser::simplifyUrl($url, 100)." "); - ?></p> + ?></span> + <?php if(isset($page[self::WORD_CLOUD])) { + $cloud = $page[self::WORD_CLOUD]; + $i = 1; + e("<span class='tab'>Word cloud:</span>"); + foreach($cloud as $word) {?> + <span class="wordcloud"> + <a class='wordcloud<?php e($i)?>' + href="?<?php e(CSRF_TOKEN."=".$data[CSRF_TOKEN]); + ?>&its=<?php e($data['its']);?> + &q=<?php e($word);?>"><?php + e($this->helper("displayresults")-> + render($word)."</span></a>"); + $i++; + } + }?></p> <?php if(!isset($page[self::ROBOT_METAS]) || !in_array("NOSNIPPET", $page[self::ROBOT_METAS])) { $description = isset($page[self::DESCRIPTION]) ?