viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

tweaks search parameters and hourly crawl rate data, a=chris

Chris Pollett [2011-08-06, 17h]
tweaks search parameters and hourly crawl rate data, a=chris
Filename
bin/queue_server.php
configs/config.php
controllers/admin_controller.php
lib/index_bundle_iterators/group_iterator.php
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 8434dfee1..2224ccede 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1147,23 +1147,15 @@ class QueueServer implements CrawlConstants
         $now = time();
         if(count($this->hourly_crawl_data) > 0 ) {
             $last_recent_hourly_pair = array_pop($this->hourly_crawl_data);
-            $change_in_time_hours =
-                floatval(($now - $last_recent_hourly_pair[0])/3600.0);
-            $change_in_urls = floatval($info_bundle['COUNT'] -
-                $last_recent_hourly_pair[1]);
-            $crawl_status['VISITED_URLS_COUNT_PER_HOUR'] =
-                $change_in_urls/$change_in_time_hours;
-            if($change_in_time_hours <= 1) {
+            $change_in_time =
+                ($now - $last_recent_hourly_pair[0]);
+            if($change_in_time <= 3600) {
                 $this->hourly_crawl_data[] = $last_recent_hourly_pair;
             }
-        } else {
-            $change_in_time_hours =
-                floatval(($now - $this->crawltime)/3600.0);
-            $crawl_status['VISITED_URLS_COUNT_PER_HOUR'] =
-                $info_bundle['COUNT']/$change_in_time_hours;
-        }
+        }
         array_unshift($this->hourly_crawl_data,
             array($now, $info_bundle['COUNT']));
+        $crawl_status['VISITED_COUNT_HISTORY'] = $this->hourly_crawl_data;
         $crawl_status['VISITED_URLS_COUNT'] =$info_bundle['VISITED_URLS_COUNT'];
         $crawl_status['DESCRIPTION'] = $index_archive_info['DESCRIPTION'];
         $crawl_status['QUEUE_PEAK_MEMORY'] = memory_get_peak_usage();
diff --git a/configs/config.php b/configs/config.php
index 5067e1c15..b395fe1dd 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -320,13 +320,13 @@ define ('EN_RATIO', 0.9);
 define ('AD_HOC_TITLE_LENGTH', 10);

 /** BM25F weight for title text */
-define ('TITLE_WEIGHT', 4);
+define ('TITLE_WEIGHT', 2);

 /** BM25F weight for other text within doc*/
 define ('DESCRIPTION_WEIGHT', 1);

 /** BM25F weight for other text within links to a doc*/
-define ('LINK_WEIGHT', 1);
+define ('LINK_WEIGHT', 0.5);


 /**
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index 4225a28dc..5a0af7b3d 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -251,6 +251,18 @@ class AdminController extends Controller implements CrawlConstants
                 $data = array_merge($data, $crawl_status);
             }
         }
+        if(isset($data['VISITED_COUNT_HISTORY']) &&
+            count($data['VISITED_COUNT_HISTORY']) > 1) {
+            $recent = array_shift($data['VISITED_COUNT_HISTORY']);
+            $oldest = array_pop($data['VISITED_COUNT_HISTORY']);
+            $change_in_time_hours = floatval($recent[0] - $oldest[0])/3600.;
+            $change_in_urls = $recent[1] - $oldest[1];
+            $data['VISITED_URLS_COUNT_PER_HOUR'] =
+                number_format($change_in_urls/$change_in_time_hours, 2,
+                    ".", "");
+        } else {
+            $data['VISITED_URLS_COUNT_PER_HOUR'] = 0;
+        }
         $data['RECENT_CRAWLS'] = $this->crawlModel->getCrawlList(false, true);
         if(isset($data['CRAWL_TIME'])) {
             //erase from previous crawl list any active crawl
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 1161c505e..78471326e 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -458,7 +458,8 @@ class GroupIterator extends IndexBundleIterator
                     $boost = 0;
                 }
                 $out_pages[$hash_url][self::SCORE] =
-                    $out_pages[$hash_url][self::HASH_SUM_SCORE] + $boost;
+                    $out_pages[$hash_url][self::HASH_SUM_SCORE]
+                     + 0*$boost;
             } else {
                 $out_pages[$hash_url][self::SCORE] =
                     $out_pages[$hash_url][self::HASH_SUM_SCORE];
@@ -499,7 +500,7 @@ class GroupIterator extends IndexBundleIterator
                 $min = ($current_rank < $min ) ? $current_rank : $min;
                 $max = ($max < $current_rank ) ? $current_rank : $max;
                 $sum_score += $hash_page[self::DOC_RANK]
-                    * $relevance_boost * pow(1.2,$hash_page[self::RELEVANCE]) *
+                    * $relevance_boost * pow(1.1,$hash_page[self::RELEVANCE]) *
                     $hash_page[self::PROXIMITY] * $domain_weights[$hash_host];
                 $sum_rank += $hash_page[self::DOC_RANK]
                     * $domain_weights[$hash_host];
ViewGit