viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/bin/fetcher.php b/bin/fetcher.php index 8f2cfe264..b1573b67c 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -315,7 +315,7 @@ class Fetcher implements CrawlConstants var $crawl_order; /** * Stores the name of the summarizer used for crawling. - * Possible values are Basic and Centroid + * Possible values are self::BASIC and self::CENTROID_SUMMARIZER * @var string */ var $summarizer_option; @@ -552,7 +552,7 @@ class Fetcher implements CrawlConstants //we will get the correct crawl order from a queue_server $this->crawl_order = self::PAGE_IMPORTANCE; - $this->summarizer_option = self::CENTROID_SUMMARIZER; + $this->summarizer_option = self::BASIC_SUMMARIZER; } /** @@ -634,7 +634,6 @@ class Fetcher implements CrawlConstants crawlLog("MAIN LOOP CASE 4 -- WEB SCHEDULER"); } $info = $this->checkScheduler(); - if($info === false) { crawlLog("Cannot connect to name server...". " will try again in ".FETCH_SLEEP_TIME." seconds."); @@ -645,7 +644,6 @@ class Fetcher implements CrawlConstants crawlLog("MAIN LOOP CASE 5 -- NO CURRENT CRAWL"); $info[self::STATUS] = self::NO_DATA_STATE; } - /* case(2), case(3) might have set info without $info[self::STATUS] being set */ @@ -653,13 +651,11 @@ class Fetcher implements CrawlConstants if($info === true) {$info = array();} $info[self::STATUS] = self::CONTINUE_STATE; } - if($info[self::STATUS] == self::NO_DATA_STATE) { crawlLog("No data. Sleeping..."); sleep(FETCH_SLEEP_TIME); continue; } - $tmp_base_name = (isset($info[self::CRAWL_TIME])) ? CRAWL_DIR."/cache/{$prefix}" . self::archive_base_name . $info[self::CRAWL_TIME] : ""; @@ -673,7 +669,6 @@ class Fetcher implements CrawlConstants } $this->to_crawl_again = array(); $this->found_sites = array(); - gc_collect_cycles(); $this->web_archive = new WebArchiveBundle($tmp_base_name, false); @@ -682,7 +677,6 @@ class Fetcher implements CrawlConstants $this->sum_seen_description_length = 0; $this->sum_seen_site_link_length = 0; $this->num_seen_sites = 0; - crawlLog("New name: ".$this->web_archive->dir_name); crawlLog("Switching archive..."); if(!isset($info[self::ARC_DATA])) { @@ -1325,16 +1319,18 @@ class Fetcher implements CrawlConstants } else { $info[self::CURRENT_SERVER] = $this->current_server; } - $update_fields = array(self::CRAWL_TYPE => "crawl_type", - self::SUMMARIZER_OPTION => "summarizer_option", - self::CRAWL_INDEX => "crawl_index", self::CRAWL_ORDER => - 'crawl_order', self::CACHE_PAGES => 'cache_pages', - self::INDEXED_FILE_TYPES => 'indexed_file_types', - self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url', + $update_fields = array( self::ALLOWED_SITES => 'allowed_sites', + self::CACHE_PAGES => 'cache_pages', + self::CRAWL_INDEX => "crawl_index", + self::CRAWL_ORDER => 'crawl_order', + self::CRAWL_TYPE => "crawl_type", self::DISALLOWED_SITES => 'disallowed_sites', - self::TOR_PROXY => 'tor_proxy', - self::PROXY_SERVERS => 'proxy_servers'); + self::INDEXED_FILE_TYPES => 'indexed_file_types', + self::PROXY_SERVERS => 'proxy_servers', + self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url', + self::SUMMARIZER_OPTION => "summarizer_option", + self::TOR_PROXY => 'tor_proxy'); foreach($update_fields as $info_field => $field) { if(isset($info[$info_field])) { $this->$field = $info[$info_field]; @@ -1683,10 +1679,12 @@ class Fetcher implements CrawlConstants if(isset($this->plugin_processors[$page_processor])) { $processor = new $page_processor( $this->plugin_processors[$page_processor], - $this->max_description_len); + $this->max_description_len, + $this->summarizer_option); } else { $processor = new $page_processor(array(), - $this->max_description_len); + $this->max_description_len, + $this->summarizer_option); } } diff --git a/configs/config.php b/configs/config.php index 222c0d5d0..1df605c0c 100755 --- a/configs/config.php +++ b/configs/config.php @@ -80,9 +80,9 @@ if(!defined('TIME_ZONE')) { $COMPONENT_ACTIVITIES = array( "accountaccess" => array("signin", "manageAccount", "manageUsers", "manageRoles"), - "social" => array("manageGroups", "groupFeeds", "mixCrawls"), "crawl" => array("manageCrawls", "manageClassifiers", "pageOptions", "resultsEditor", "searchSources"), + "social" => array("manageGroups", "groupFeeds", "mixCrawls"), "system" => array("manageMachines", "manageLocales", "serverSettings", "configure") ); diff --git a/controllers/components/crawl_component.php b/controllers/components/crawl_component.php index e789a0c0a..f6333402e 100644 --- a/controllers/components/crawl_component.php +++ b/controllers/components/crawl_component.php @@ -280,6 +280,10 @@ class CrawlComponent extends Component implements CrawlConstants $crawl_params[self::INDEXED_FILE_TYPES] = $seed_info['indexed_file_types']['extensions']; } + if(isset($seed_info['general']['summarizer_option'])) { + $crawl_params[self::SUMMARIZER_OPTION] = + $seed_info['general']['summarizer_option']; + } if(isset($seed_info['active_classifiers']['label'])) { // Note that 'label' is actually an array of active class labels. $crawl_params[self::ACTIVE_CLASSIFIERS] = @@ -474,8 +478,8 @@ class CrawlComponent extends Component implements CrawlConstants $_REQUEST['summarizer_option']; $update_flag = true; } - $data['summarizer_option'] = $seed_info['general']['summarizer_option']; - + $data['summarizer_option'] = + $seed_info['general']['summarizer_option']; if(!$no_further_changes && isset($_REQUEST['crawl_order']) && in_array($_REQUEST['crawl_order'], array_keys($data['available_crawl_orders']))) { @@ -669,7 +673,7 @@ class CrawlComponent extends Component implements CrawlConstants break; case 'search': - $search_array = + $search_array = $parent->tableSearchRequestHandler($data, array('name')); break; diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php index 28a75c30e..056cc7557 100755 --- a/controllers/fetch_controller.php +++ b/controllers/fetch_controller.php @@ -568,8 +568,9 @@ class FetchController extends Controller implements CrawlConstants $to_copy_fields = array(self::ALLOWED_SITES, self::ARC_DIR, self::ARC_TYPE, self::CRAWL_INDEX, self::CRAWL_TYPE, self::DISALLOWED_SITES, self::INDEXED_FILE_TYPES, - self::PROXY_SERVERS, self::TOR_PROXY, - self::RESTRICT_SITES_BY_URL + self::PROXY_SERVERS, self::RESTRICT_SITES_BY_URL, + self::SUMMARIZER_OPTION, + self::TOR_PROXY ); foreach($to_copy_fields as $field) { if(isset($status[$field])) { diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index cb2ff9cf5..10820820a 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -86,18 +86,19 @@ class PhraseParser * A list of meta words that might be extracted from a query * @var array */ - static $meta_words_list = array('link:', 'site:', 'version:', - 'modified:', 'filetype:', 'info:', '\-', 'os:', 'server:', 'date:', - "numlinks:", 'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:', 'time:', - 'code:', 'lang:', 'media:', 'elink:', 'location:', 'size:', 'host:', - 'dns:', 'path:', 'robot:', 'safe:', 'guid:', 'class:', - 'class-score:'); + static $meta_words_list = array('\-', 'class:', 'class-score:', 'code:', + 'date:', 'dns:', 'elink:', 'filetype:', 'guid:', 'host:', 'i:', + 'info:', 'index:', 'ip:', 'link:', 'modified:', + 'lang:', 'media:', 'location:', 'numlinks:', 'os:', + 'path:', 'robot:', 'safe:', 'server:', 'site:', 'size:', + 'time:', 'u:', 'version:','weight:', 'w:' + ); /** * Those meta words whose values will be encoded as part of word_ids * @var array */ - static $materialized_metas = array("media:", "safe:", "class:"); + static $materialized_metas = array("class:", "media:", "safe:"); /** * A list of meta words that might be extracted from a query diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index f20d6a003..62f35aa85 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -90,17 +90,6 @@ class HtmlProcessor extends TextProcessor * @return array a summary of the contents of the page * */ - function getSeedInfo($use_default = false) - { - if(file_exists(WORK_DIRECTORY."/crawl.ini") && !$use_default) { - $info = parse_ini_with_fallback(WORK_DIRECTORY."/crawl.ini"); - } else { - $info = parse_ini_with_fallback( - BASE_DIR."/configs/default_crawl.ini"); - } - - return $info; - } function process($page, $url) { $summary = NULL; @@ -117,11 +106,9 @@ class HtmlProcessor extends TextProcessor if($summary[self::TITLE] == "") { $summary[self::TITLE] = self::crudeTitle($dom_page); } - $summarizer = $this->getSeedInfo(); - $lang = self::lang($dom, + $summary[self::LANG] = self::lang($dom, $summary[self::TITLE], $url); - if($summarizer['general']['summarizer_option']== - self::CENTROID_SUMMARIZER) { + if($this->summarizer_option == self::CENTROID_SUMMARIZER) { $summary_cloud = CentroidSummarizer::getCentroidSummary( $dom_page, $lang); $summary[self::DESCRIPTION] = $summary_cloud[0]; @@ -134,8 +121,6 @@ class HtmlProcessor extends TextProcessor $summary[self::DESCRIPTION] = self::crudeDescription( $dom_page); } - $summary[self::LANG] = self::lang($dom, - $summary[self::DESCRIPTION], $url); $summary[self::LINKS] = self::links($dom, $url); if($summary[self::LINKS] == array()) { $summary[self::LINKS] = parent::extractHttpHttpsUrls( @@ -295,11 +280,10 @@ class HtmlProcessor extends TextProcessor { $xpath = new DOMXPath($dom); $titles = $xpath->evaluate("/html//title"); - $title = ""; foreach($titles as $pre_title) { - $title .= self::domNodeToString($pre_title); + $title .= $pre_title->nodeValue; } if($title == "") { $title_parts = array("/html//h1", "/html//h2", "/html//h3", @@ -307,7 +291,7 @@ class HtmlProcessor extends TextProcessor foreach($title_parts as $part) { $doc_nodes = $xpath->evaluate($part); foreach($doc_nodes as $node) { - $title .= " .. ".self::domNodeToString($node); + $title .= " .. ".$node->nodeValue; if(strlen($title) > self::MAX_TITLE_LEN) { break 2;} } @@ -494,7 +478,7 @@ class HtmlProcessor extends TextProcessor $len = strlen($url); if(!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LENGTH && $len > 4) { - $text = self::domNodeToString($href); + $text = $href->nodeValue; if(isset($sites[$url])) { $sites[$url] .=" .. ". preg_replace("/\s+/", " ", strip_tags($text)); diff --git a/lib/processors/page_processor.php b/lib/processors/page_processor.php index ab883b7fa..1fd69990d 100644 --- a/lib/processors/page_processor.php +++ b/lib/processors/page_processor.php @@ -62,6 +62,11 @@ abstract class PageProcessor implements CrawlConstants */ var $plugin_instances; + /** + * Stores the name of the summarizer used for crawling. + * Possible values are self::BASIC and self::CENTROID_SUMMARIZER + * @var string + var $summarizer_option; /** * Max number of chars to extract for description from a page to index. * Only words in the description are indexed. @@ -77,8 +82,10 @@ abstract class PageProcessor implements CrawlConstants * do further processing on the data handles by this page * processor */ - function __construct($plugins = array(), $max_description_len = NULL) { + function __construct($plugins = array(), $max_description_len = NULL, + $summarizer_option = self::BASIC_SUMMARIZER) { $this->plugin_instances = $plugins; + $this->summarizer_option = $summarizer_option; if($max_description_len != NULL) { self::$max_description_len = $max_description_len; } else { diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php index 4972b1d54..2eea02fd1 100644 --- a/lib/processors/sitemap_processor.php +++ b/lib/processors/sitemap_processor.php @@ -73,6 +73,7 @@ class SitemapProcessor extends TextProcessor if($dom !==false) { $summary[self::TITLE] = $url; $summary[self::DESCRIPTION] = "Sitemap of ".$url; + $summary[self::LANG] = "en-US"; $summary[self::LINKS] = self::links($dom, $url); if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0) { diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index 8bd535290..c05e34d67 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -85,17 +85,6 @@ class TextProcessor extends PageProcessor * @return array a summary of (title, description,links, and content) of * the information in $page */ - function getSeedInfo($use_default = false) - { - if(file_exists(WORK_DIRECTORY."/crawl.ini") && !$use_default) { - $info = parse_ini_with_fallback(WORK_DIRECTORY."/crawl.ini"); - } else { - $info = parse_ini_with_fallback( - BASE_DIR."/configs/default_crawl.ini"); - } - - return $info; - } function process($page, $url) { $summary = NULL; @@ -103,8 +92,7 @@ class TextProcessor extends PageProcessor if(is_string($page)) { $summary[self::TITLE] = ""; $lang = self::calculateLang($page); - if($summarizer['general']['summarizer_option']== - self::CENTROID_SUMMARIZER) { + if($this->summarizer_option == self::CENTROID_SUMMARIZER) { $summary_cloud = CentroidSummarizer::getCentroidSummary($page,$lang); $summary[self::DESCRIPTION] = $summary_cloud[0];