viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Fixes a bug in how doc_len was being stored for links; also pulls the BM25 weighting parameters into config and tweaks the proximity calculation, a=chris

Chris Pollett [2011-08-06 08:Aug:th]
Fixes a bug in how doc_len was being stored for links; also pulls the BM25 weighting parameters into config and tweaks the proximity calculation, a=chris
Filename
configs/config.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/intersect_iterator.php
lib/index_shard.php
locale/en-US/statistics.txt
diff --git a/configs/config.php b/configs/config.php
index 195f98ec0..5067e1c15 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -316,6 +316,18 @@ define ('PUNCT', "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&");
 /** Percentage ASCII text before guess we dealing with english*/
 define ('EN_RATIO', 0.9);

+/** Term positions in a description below this value are deemed title text */
+define ('AD_HOC_TITLE_LENGTH', 10);
+
+/** BM25F weight for title text */
+define ('TITLE_WEIGHT', 4);
+
+/** BM25F weight for other text within doc*/
+define ('DESCRIPTION_WEIGHT', 1);
+
+/** BM25F weight for text within links to a doc */
+define ('LINK_WEIGHT', 1);
+

 /**
  * How many non robot urls the fetcher successfully downloads before
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 2a5fe3792..1161c505e 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -166,6 +166,7 @@ class GroupIterator extends IndexBundleIterator
     {
         // first get a block of documents on which grouping can be done
         $pages =  $this->getPagesToGroup();
+
         $this->count_block_unfiltered = count($pages);
         if(!is_array($pages)) {
             return $pages;
@@ -456,11 +457,8 @@ class GroupIterator extends IndexBundleIterator
                 } else {
                     $boost = 0;
                 }
-
                 $out_pages[$hash_url][self::SCORE] =
-                    ($out_pages[$hash_url][self::HASH_SUM_SCORE] +
-                        $boost *$out_pages[$hash_url][self::RELEVANCE]
-                        );
+                    $out_pages[$hash_url][self::HASH_SUM_SCORE] + $boost;
             } else {
                 $out_pages[$hash_url][self::SCORE] =
                     $out_pages[$hash_url][self::HASH_SUM_SCORE];
diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index 8c9ab5178..e476fbe3d 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -212,6 +212,7 @@ class IntersectIterator extends IndexBundleIterator
         $counters = array_fill(0, $num_iterators, 0);

         $min_diff = 5000000;
+        $weight = DESCRIPTION_WEIGHT;
         do {
             $min_counter = ($counters[0] < $len_lists[0] - 1) ? 0 : -1;
             $o_position = $position_lists[0][$counters[0]];
@@ -230,11 +231,14 @@ class IntersectIterator extends IndexBundleIterator
             }
             if($total_diff < $min_diff) {
                 $min_diff = $total_diff;
+                if($positions[$num_iterators -1] < AD_HOC_TITLE_LENGTH) {
+                    $weight = TITLE_WEIGHT;
+                }
             }
             if($min_counter >=0) $counters[$min_counter]++;
         } while($min_counter >= 0);

-        return ($num_iterators - 1)/$min_diff;
+        return $weight*($num_iterators - 1)/$min_diff;
     }

     /**
diff --git a/lib/index_shard.php b/lib/index_shard.php
index bf6cf0c04..3a1139650 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -421,8 +421,8 @@ class IndexShard extends PersistentStructure implements
             $rank <<= 19;
             $flags += $rank;
         }
-
-        $len_num_keys = $this->packDoclenNum(($flags + $doc_len), $num_keys);
+        $item_len = ($is_doc) ? $doc_len: $link_doc_len;
+        $len_num_keys = $this->packDoclenNum(($flags + $item_len), $num_keys);

         $this->doc_infos .=  $len_num_keys;
         $added_len += strlen($len_num_keys);
@@ -596,8 +596,9 @@ class IndexShard extends PersistentStructure implements
         $offset = 0;
         list($doc_index, $item[self::POSITION_LIST]) =
             $this->unpackPosting($posting, $offset);
-        $item[self::PROXIMITY] = 1;
-        $occurrences = count($item[self::POSITION_LIST]);
+        $item[self::PROXIMITY] =
+            $this->computeProximity($item[self::POSITION_LIST]);
+        $occurrences = $this->weightedCount($item[self::POSITION_LIST]);

         if($occurrences < $occurs) {
             $occurrences = $occurs;
@@ -620,7 +621,7 @@ class IndexShard extends PersistentStructure implements

         $is_doc = (($doc_len & self::LINK_FLAG) == 0) ? true : false;
         if(!$is_doc) {
-            $doc_len -= self::LINK_FLAG;
+            $doc_len &= (self::LINK_FLAG - 1);
         }
         $item[self::IS_DOC] = $is_doc;
         /*
@@ -631,7 +632,7 @@ class IndexShard extends PersistentStructure implements
         $pre_rank = ($doc_len & $rank_mask);
         if( $pre_rank > 0) {
             $item[self::DOC_RANK] = $pre_rank >> 19;
-            $doc_len -= $pre_rank;
+            $doc_len &= (2 << 19 - 1);
         }

         $skip_stats = false;
@@ -643,10 +644,12 @@ class IndexShard extends PersistentStructure implements
         } else if($is_doc) {
             $average_doc_len = $this->len_all_docs/$this->num_docs;
             $num_docs = $this->num_docs;
+            $type_weight = 1;
         } else {
             $average_doc_len = ($this->num_link_docs != 0) ?
                 $this->len_all_link_docs/$this->num_link_docs : 0;
             $num_docs = $this->num_link_docs;
+            $type_weight = LINK_WEIGHT;
         }
         if(!isset($item['KEY'])) {
             $doc_id = $this->getDocInfoSubstring(
@@ -657,17 +660,36 @@ class IndexShard extends PersistentStructure implements
         if(!$skip_stats) {
             self::docStats($item, $occurrences, $doc_len, $num_doc_or_links,
                 $average_doc_len, $num_docs,
-                $this->num_docs + $this->num_link_docs);
+                $this->num_docs + $this->num_link_docs, $type_weight);
         }

         return $doc_id;

     }
+    /**
+     * Sums TITLE_WEIGHT per position < AD_HOC_TITLE_LENGTH, else DESCRIPTION_WEIGHT
+     */
+    function weightedCount($position_list) {
+        $count = 0;
+        foreach($position_list as $position) {
+            $count += ($position < AD_HOC_TITLE_LENGTH) ?
+                TITLE_WEIGHT : DESCRIPTION_WEIGHT;
+        }
+        return $count;
+    }
+
+    /**
+     * TITLE_WEIGHT if first position < AD_HOC_TITLE_LENGTH, else DESCRIPTION_WEIGHT
+     */
+    function computeProximity($position_list) {
+        return ($position_list[0] < AD_HOC_TITLE_LENGTH) ?
+            TITLE_WEIGHT : DESCRIPTION_WEIGHT;
+    }

     /**
      *  Computes BM25F relevance and a score for the supplied item based
-     *  on the supplied parameters
-     *
+     *  on the supplied parameters.
+     *
      *  @param array &$item doc summary to compute a relevance and score for.
      *      Pass-by-ref so self::RELEVANCE and self::SCORE fields can be changed
      *  @param int $occurrences - number of occurences of the term in the item
@@ -677,9 +699,10 @@ class IndexShard extends PersistentStructure implements
      *  @param int $num_docs either number of links or number of docs depending
      *      if item represents a link or a doc.
      *  @param int $total_docs_or_links number of docs or links in corpus
+     *  @param float $type_weight BM25F weight for this doc or link component
      */
     static function docStats(&$item, $occurrences, $doc_len, $num_doc_or_links,
-        $average_doc_len, $num_docs, $total_docs_or_links)
+        $average_doc_len, $num_docs, $total_docs_or_links, $type_weight)
     {

         $doc_ratio = ($average_doc_len > 0) ?
@@ -695,9 +718,7 @@ class IndexShard extends PersistentStructure implements
         $IDF = log(($num_docs - $num_term_occurrences + 0.5) /
             ($num_term_occurrences + 0.5));

-        $item[self::RELEVANCE] = 0.5 * $IDF * $pre_relevance;
-
-
+        $item[self::RELEVANCE] = 0.5 * $IDF * $pre_relevance * $type_weight;
         $item[self::SCORE] = $item[self::DOC_RANK]
             * $item[self::RELEVANCE];
     }
@@ -1593,8 +1614,10 @@ class IndexShard extends PersistentStructure implements
         $shard = new IndexShard($fname);
         if($data === NULL) {
             $fh = fopen($fname, "rb");
+            $shard->file_len = filesize($fname);
             $header = fread($fh, self::HEADER_LENGTH);
         } else {
+            $shard->file_len = strlen($data);
             $header = substr($data, 0, self::HEADER_LENGTH);
             $pos = self::HEADER_LENGTH;
         }
diff --git a/locale/en-US/statistics.txt b/locale/en-US/statistics.txt
index b6bef56f0..5a165df53 100755
--- a/locale/en-US/statistics.txt
+++ b/locale/en-US/statistics.txt
@@ -1 +1 @@
-d:99;
\ No newline at end of file
+d:100;
\ No newline at end of file
ViewGit