viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Fixes to crawling under manage machines after centroid patch, a=chris

Chris Pollett [2014-05-10 17:May:th]
Fixes to crawling under manage machines after centroid patch, a=chris
Filename
bin/fetcher.php
configs/config.php
controllers/components/crawl_component.php
controllers/fetch_controller.php
lib/phrase_parser.php
lib/processors/html_processor.php
lib/processors/page_processor.php
lib/processors/sitemap_processor.php
lib/processors/text_processor.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 8f2cfe264..b1573b67c 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -315,7 +315,7 @@ class Fetcher implements CrawlConstants
     var $crawl_order;
     /**
      * Stores the name of the summarizer used for crawling.
-     * Possible values are Basic and Centroid
+     * Possible values are self::BASIC_SUMMARIZER and self::CENTROID_SUMMARIZER
      * @var string
      */
     var $summarizer_option;
@@ -552,7 +552,7 @@ class Fetcher implements CrawlConstants

         //we will get the correct crawl order from a queue_server
         $this->crawl_order = self::PAGE_IMPORTANCE;
-        $this->summarizer_option = self::CENTROID_SUMMARIZER;
+        $this->summarizer_option = self::BASIC_SUMMARIZER;
     }

     /**
@@ -634,7 +634,6 @@ class Fetcher implements CrawlConstants
                     crawlLog("MAIN LOOP CASE 4 -- WEB SCHEDULER");
                 }
                 $info = $this->checkScheduler();
-
                 if($info === false) {
                     crawlLog("Cannot connect to name server...".
                         " will try again in ".FETCH_SLEEP_TIME." seconds.");
@@ -645,7 +644,6 @@ class Fetcher implements CrawlConstants
                 crawlLog("MAIN LOOP CASE 5 -- NO CURRENT CRAWL");
                 $info[self::STATUS] = self::NO_DATA_STATE;
             }
-
             /* case(2), case(3) might have set info without
                $info[self::STATUS] being set
              */
@@ -653,13 +651,11 @@ class Fetcher implements CrawlConstants
                 if($info === true) {$info = array();}
                 $info[self::STATUS] = self::CONTINUE_STATE;
             }
-
             if($info[self::STATUS] == self::NO_DATA_STATE) {
                 crawlLog("No data. Sleeping...");
                 sleep(FETCH_SLEEP_TIME);
                 continue;
             }
-
             $tmp_base_name = (isset($info[self::CRAWL_TIME])) ?
                 CRAWL_DIR."/cache/{$prefix}" . self::archive_base_name .
                     $info[self::CRAWL_TIME] : "";
@@ -673,7 +669,6 @@ class Fetcher implements CrawlConstants
                 }
                 $this->to_crawl_again = array();
                 $this->found_sites = array();
-
                 gc_collect_cycles();
                 $this->web_archive = new WebArchiveBundle($tmp_base_name,
                     false);
@@ -682,7 +677,6 @@ class Fetcher implements CrawlConstants
                 $this->sum_seen_description_length = 0;
                 $this->sum_seen_site_link_length = 0;
                 $this->num_seen_sites = 0;
-
                 crawlLog("New name: ".$this->web_archive->dir_name);
                 crawlLog("Switching archive...");
                 if(!isset($info[self::ARC_DATA])) {
@@ -1325,16 +1319,18 @@ class Fetcher implements CrawlConstants
         } else {
             $info[self::CURRENT_SERVER] = $this->current_server;
         }
-        $update_fields = array(self::CRAWL_TYPE => "crawl_type",
-            self::SUMMARIZER_OPTION => "summarizer_option",
-            self::CRAWL_INDEX => "crawl_index", self::CRAWL_ORDER =>
-            'crawl_order', self::CACHE_PAGES => 'cache_pages',
-            self::INDEXED_FILE_TYPES => 'indexed_file_types',
-            self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url',
+        $update_fields = array(
             self::ALLOWED_SITES => 'allowed_sites',
+            self::CACHE_PAGES => 'cache_pages',
+            self::CRAWL_INDEX => "crawl_index",
+            self::CRAWL_ORDER => 'crawl_order',
+            self::CRAWL_TYPE => "crawl_type",
             self::DISALLOWED_SITES => 'disallowed_sites',
-            self::TOR_PROXY => 'tor_proxy',
-            self::PROXY_SERVERS => 'proxy_servers');
+            self::INDEXED_FILE_TYPES => 'indexed_file_types',
+            self::PROXY_SERVERS => 'proxy_servers',
+            self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url',
+            self::SUMMARIZER_OPTION => "summarizer_option",
+            self::TOR_PROXY => 'tor_proxy');
         foreach($update_fields as $info_field => $field) {
             if(isset($info[$info_field])) {
                 $this->$field = $info[$info_field];
@@ -1683,10 +1679,12 @@ class Fetcher implements CrawlConstants
                 if(isset($this->plugin_processors[$page_processor])) {
                     $processor = new $page_processor(
                         $this->plugin_processors[$page_processor],
-                        $this->max_description_len);
+                        $this->max_description_len,
+                        $this->summarizer_option);
                 } else {
                     $processor = new $page_processor(array(),
-                        $this->max_description_len);
+                        $this->max_description_len,
+                        $this->summarizer_option);
                 }
             }

diff --git a/configs/config.php b/configs/config.php
index 222c0d5d0..1df605c0c 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -80,9 +80,9 @@ if(!defined('TIME_ZONE')) {
 $COMPONENT_ACTIVITIES = array(
     "accountaccess" => array("signin", "manageAccount", "manageUsers",
         "manageRoles"),
-    "social" => array("manageGroups", "groupFeeds", "mixCrawls"),
     "crawl" => array("manageCrawls", "manageClassifiers", "pageOptions",
         "resultsEditor", "searchSources"),
+    "social" => array("manageGroups", "groupFeeds", "mixCrawls"),
     "system" => array("manageMachines", "manageLocales",
         "serverSettings", "configure")
 );
diff --git a/controllers/components/crawl_component.php b/controllers/components/crawl_component.php
index e789a0c0a..f6333402e 100644
--- a/controllers/components/crawl_component.php
+++ b/controllers/components/crawl_component.php
@@ -280,6 +280,10 @@ class CrawlComponent extends Component implements CrawlConstants
             $crawl_params[self::INDEXED_FILE_TYPES] =
                 $seed_info['indexed_file_types']['extensions'];
         }
+        if(isset($seed_info['general']['summarizer_option'])) {
+            $crawl_params[self::SUMMARIZER_OPTION] =
+                $seed_info['general']['summarizer_option'];
+        }
         if(isset($seed_info['active_classifiers']['label'])) {
             // Note that 'label' is actually an array of active class labels.
             $crawl_params[self::ACTIVE_CLASSIFIERS] =
@@ -474,8 +478,8 @@ class CrawlComponent extends Component implements CrawlConstants
                 $_REQUEST['summarizer_option'];
             $update_flag = true;
         }
-        $data['summarizer_option'] = $seed_info['general']['summarizer_option'];
-
+        $data['summarizer_option'] =
+            $seed_info['general']['summarizer_option'];
         if(!$no_further_changes && isset($_REQUEST['crawl_order'])
             &&  in_array($_REQUEST['crawl_order'],
                 array_keys($data['available_crawl_orders']))) {
@@ -669,7 +673,7 @@ class CrawlComponent extends Component implements CrawlConstants
                 break;

                 case 'search':
-                    $search_array =
+                    $search_array =
                         $parent->tableSearchRequestHandler($data,
                         array('name'));
                 break;
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index 28a75c30e..056cc7557 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -568,8 +568,9 @@ class FetchController extends Controller implements CrawlConstants
                 $to_copy_fields = array(self::ALLOWED_SITES, self::ARC_DIR,
                     self::ARC_TYPE, self::CRAWL_INDEX, self::CRAWL_TYPE,
                     self::DISALLOWED_SITES, self::INDEXED_FILE_TYPES,
-                    self::PROXY_SERVERS, self::TOR_PROXY,
-                    self::RESTRICT_SITES_BY_URL
+                    self::PROXY_SERVERS, self::RESTRICT_SITES_BY_URL,
+                    self::SUMMARIZER_OPTION,
+                    self::TOR_PROXY
                     );
                 foreach($to_copy_fields as $field) {
                     if(isset($status[$field])) {
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index cb2ff9cf5..10820820a 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -86,18 +86,19 @@ class PhraseParser
      * A list of meta words that might be extracted from a query
      * @var array
      */
-    static $meta_words_list = array('link:', 'site:', 'version:',
-            'modified:', 'filetype:', 'info:', '\-', 'os:', 'server:', 'date:',
-            "numlinks:", 'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:', 'time:',
-            'code:', 'lang:', 'media:', 'elink:', 'location:', 'size:', 'host:',
-            'dns:', 'path:', 'robot:', 'safe:', 'guid:', 'class:',
-            'class-score:');
+    static $meta_words_list = array('\-', 'class:', 'class-score:', 'code:',
+            'date:', 'dns:', 'elink:', 'filetype:', 'guid:', 'host:', 'i:',
+            'info:', 'index:', 'ip:', 'link:', 'modified:',
+            'lang:', 'media:', 'location:', 'numlinks:', 'os:',
+            'path:', 'robot:', 'safe:', 'server:', 'site:', 'size:',
+            'time:', 'u:', 'version:','weight:', 'w:'
+            );

     /**
      * Those meta words whose values will be encoded as part of word_ids
      * @var array
      */
-    static $materialized_metas = array("media:", "safe:", "class:");
+    static $materialized_metas = array("class:", "media:", "safe:");

     /**
      * A list of meta words that might be extracted from a query
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index f20d6a003..62f35aa85 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -90,17 +90,6 @@ class HtmlProcessor extends TextProcessor
      *  @return array  a summary of the contents of the page
      *
      */
-    function getSeedInfo($use_default = false)
-    {
-        if(file_exists(WORK_DIRECTORY."/crawl.ini") && !$use_default) {
-            $info = parse_ini_with_fallback(WORK_DIRECTORY."/crawl.ini");
-        } else {
-            $info = parse_ini_with_fallback(
-                BASE_DIR."/configs/default_crawl.ini");
-        }
-
-        return $info;
-    }
     function process($page, $url)
     {
         $summary = NULL;
@@ -117,11 +106,9 @@ class HtmlProcessor extends TextProcessor
                 if($summary[self::TITLE] == "") {
                     $summary[self::TITLE] = self::crudeTitle($dom_page);
                 }
-                $summarizer = $this->getSeedInfo();
-                $lang = self::lang($dom,
+                $summary[self::LANG] = self::lang($dom,
                     $summary[self::TITLE], $url);
-                if($summarizer['general']['summarizer_option']==
-                        self::CENTROID_SUMMARIZER) {
+                if($this->summarizer_option == self::CENTROID_SUMMARIZER) {
                     $summary_cloud = CentroidSummarizer::getCentroidSummary(
                         $dom_page, $lang);
                     $summary[self::DESCRIPTION] = $summary_cloud[0];
@@ -134,8 +121,6 @@ class HtmlProcessor extends TextProcessor
                     $summary[self::DESCRIPTION] = self::crudeDescription(
                         $dom_page);
                 }
-                $summary[self::LANG] = self::lang($dom,
-                    $summary[self::DESCRIPTION], $url);
                 $summary[self::LINKS] = self::links($dom, $url);
                 if($summary[self::LINKS] == array()) {
                     $summary[self::LINKS] = parent::extractHttpHttpsUrls(
@@ -295,11 +280,10 @@ class HtmlProcessor extends TextProcessor
     {
         $xpath = new DOMXPath($dom);
         $titles = $xpath->evaluate("/html//title");
-
         $title = "";

         foreach($titles as $pre_title) {
-            $title .= self::domNodeToString($pre_title);
+                $title .= $pre_title->nodeValue;
         }
         if($title == "") {
             $title_parts = array("/html//h1", "/html//h2", "/html//h3",
@@ -307,7 +291,7 @@ class HtmlProcessor extends TextProcessor
             foreach($title_parts as $part) {
                 $doc_nodes = $xpath->evaluate($part);
                 foreach($doc_nodes as $node) {
-                    $title .= " .. ".self::domNodeToString($node);
+                    $title .= " .. ".$node->nodeValue;
                     if(strlen($title) >
                         self::MAX_TITLE_LEN) { break 2;}
                 }
@@ -494,7 +478,7 @@ class HtmlProcessor extends TextProcessor
                     $len = strlen($url);
                     if(!UrlParser::checkRecursiveUrl($url)  &&
                         $len < MAX_URL_LENGTH && $len > 4) {
-                        $text = self::domNodeToString($href);
+                        $text = $href->nodeValue;
                         if(isset($sites[$url])) {
                             $sites[$url] .=" .. ".
                                 preg_replace("/\s+/", " ", strip_tags($text));
diff --git a/lib/processors/page_processor.php b/lib/processors/page_processor.php
index ab883b7fa..1fd69990d 100644
--- a/lib/processors/page_processor.php
+++ b/lib/processors/page_processor.php
@@ -62,6 +62,11 @@ abstract class PageProcessor implements CrawlConstants
      */
     var $plugin_instances;

+    /**
+     * Stores the name of the summarizer used for crawling.
+     * Possible values are self::BASIC_SUMMARIZER and self::CENTROID_SUMMARIZER
+     * @var string */
+     var $summarizer_option;
     /**
      * Max number of chars to extract for description from a page to index.
      * Only words in the description are indexed.
@@ -77,8 +82,10 @@ abstract class PageProcessor implements CrawlConstants
      *      do further processing on the data handles by this page
      *      processor
      */
-    function __construct($plugins = array(), $max_description_len = NULL) {
+    function __construct($plugins = array(), $max_description_len = NULL,
+        $summarizer_option = self::BASIC_SUMMARIZER) {
         $this->plugin_instances = $plugins;
+        $this->summarizer_option = $summarizer_option;
         if($max_description_len != NULL) {
             self::$max_description_len = $max_description_len;
         } else {
diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php
index 4972b1d54..2eea02fd1 100644
--- a/lib/processors/sitemap_processor.php
+++ b/lib/processors/sitemap_processor.php
@@ -73,6 +73,7 @@ class SitemapProcessor extends TextProcessor
             if($dom !==false) {
                 $summary[self::TITLE] = $url;
                 $summary[self::DESCRIPTION] = "Sitemap of ".$url;
+                $summary[self::LANG] = "en-US";
                 $summary[self::LINKS] = self::links($dom, $url);
                 if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
                     == 0 && count($summary[self::LINKS]) == 0) {
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 8bd535290..c05e34d67 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -85,17 +85,6 @@ class TextProcessor extends PageProcessor
      * @return array a summary of (title, description,links, and content) of
      *      the information in $page
      */
-    function getSeedInfo($use_default = false)
-    {
-        if(file_exists(WORK_DIRECTORY."/crawl.ini") && !$use_default) {
-            $info = parse_ini_with_fallback(WORK_DIRECTORY."/crawl.ini");
-        } else {
-            $info = parse_ini_with_fallback(
-                BASE_DIR."/configs/default_crawl.ini");
-        }
-
-        return $info;
-    }
     function process($page, $url)
     {
         $summary = NULL;
@@ -103,8 +92,7 @@ class TextProcessor extends PageProcessor
         if(is_string($page)) {
             $summary[self::TITLE] = "";
             $lang = self::calculateLang($page);
-            if($summarizer['general']['summarizer_option']==
-                self::CENTROID_SUMMARIZER) {
+            if($this->summarizer_option == self::CENTROID_SUMMARIZER) {
                 $summary_cloud =
                     CentroidSummarizer::getCentroidSummary($page,$lang);
                 $summary[self::DESCRIPTION] = $summary_cloud[0];
ViewGit