viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
configs/config.php | |
lib/index_bundle_iterators/group_iterator.php | |
lib/index_bundle_iterators/intersect_iterator.php | |
lib/index_shard.php | |
locale/en-US/statistics.txt |
diff --git a/configs/config.php b/configs/config.php index 195f98ec0..5067e1c15 100755 --- a/configs/config.php +++ b/configs/config.php @@ -316,6 +316,18 @@ define ('PUNCT', "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&"); /** Percentage ASCII text before guess we dealing with english*/ define ('EN_RATIO', 0.9); +/** Number of total description deemed title */ +define ('AD_HOC_TITLE_LENGTH', 10); + +/** BM25F weight for title text */ +define ('TITLE_WEIGHT', 4); + +/** BM25F weight for other text within doc*/ +define ('DESCRIPTION_WEIGHT', 1); + +/** BM25F weight for other text within links to a doc*/ +define ('LINK_WEIGHT', 1); + /** * How many non robot urls the fetcher successfully downloads before diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index 2a5fe3792..1161c505e 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -166,6 +166,7 @@ class GroupIterator extends IndexBundleIterator { // first get a block of documents on which grouping can be done $pages = $this->getPagesToGroup(); + $this->count_block_unfiltered = count($pages); if(!is_array($pages)) { return $pages; @@ -456,11 +457,8 @@ class GroupIterator extends IndexBundleIterator } else { $boost = 0; } - $out_pages[$hash_url][self::SCORE] = - ($out_pages[$hash_url][self::HASH_SUM_SCORE] + - $boost *$out_pages[$hash_url][self::RELEVANCE] - ); + $out_pages[$hash_url][self::HASH_SUM_SCORE] + $boost; } else { $out_pages[$hash_url][self::SCORE] = $out_pages[$hash_url][self::HASH_SUM_SCORE]; diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index 8c9ab5178..e476fbe3d 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -212,6 +212,7 @@ class IntersectIterator extends IndexBundleIterator $counters = array_fill(0, $num_iterators, 0); $min_diff = 5000000; + $weight = DESCRIPTION_WEIGHT; do { $min_counter = ($counters[0] < $len_lists[0] - 1) ? 0 : -1; $o_position = $position_lists[0][$counters[0]]; @@ -230,11 +231,14 @@ class IntersectIterator extends IndexBundleIterator } if($total_diff < $min_diff) { $min_diff = $total_diff; + if($positions[$num_iterators -1] < AD_HOC_TITLE_LENGTH) { + $weight = TITLE_WEIGHT; + } } if($min_counter >=0) $counters[$min_counter]++; } while($min_counter >= 0); - return ($num_iterators - 1)/$min_diff; + return $weight*($num_iterators - 1)/$min_diff; } /** diff --git a/lib/index_shard.php b/lib/index_shard.php index bf6cf0c04..3a1139650 100644 --- a/lib/index_shard.php +++ b/lib/index_shard.php @@ -421,8 +421,8 @@ class IndexShard extends PersistentStructure implements $rank <<= 19; $flags += $rank; } - - $len_num_keys = $this->packDoclenNum(($flags + $doc_len), $num_keys); + $item_len = ($is_doc) ? $doc_len: $link_doc_len; + $len_num_keys = $this->packDoclenNum(($flags + $item_len), $num_keys); $this->doc_infos .= $len_num_keys; $added_len += strlen($len_num_keys); @@ -596,8 +596,9 @@ class IndexShard extends PersistentStructure implements $offset = 0; list($doc_index, $item[self::POSITION_LIST]) = $this->unpackPosting($posting, $offset); - $item[self::PROXIMITY] = 1; - $occurrences = count($item[self::POSITION_LIST]); + $item[self::PROXIMITY] = + $this->computeProximity($item[self::POSITION_LIST]); + $occurrences = $this->weightedCount($item[self::POSITION_LIST]); if($occurrences < $occurs) { $occurrences = $occurs; @@ -620,7 +621,7 @@ class IndexShard extends PersistentStructure implements $is_doc = (($doc_len & self::LINK_FLAG) == 0) ? true : false; if(!$is_doc) { - $doc_len -= self::LINK_FLAG; + $doc_len &= (self::LINK_FLAG - 1); } $item[self::IS_DOC] = $is_doc; /* @@ -631,7 +632,7 @@ class IndexShard extends PersistentStructure implements $pre_rank = ($doc_len & $rank_mask); if( $pre_rank > 0) { $item[self::DOC_RANK] = $pre_rank >> 19; - $doc_len -= $pre_rank; + $doc_len &= (2 << 19 - 1); } $skip_stats = false; @@ -643,10 +644,12 @@ class IndexShard extends PersistentStructure implements } else if($is_doc) { $average_doc_len = $this->len_all_docs/$this->num_docs; $num_docs = $this->num_docs; + $type_weight = 1; } else { $average_doc_len = ($this->num_link_docs != 0) ? $this->len_all_link_docs/$this->num_link_docs : 0; $num_docs = $this->num_link_docs; + $type_weight = LINK_WEIGHT; } if(!isset($item['KEY'])) { $doc_id = $this->getDocInfoSubstring( @@ -657,17 +660,36 @@ class IndexShard extends PersistentStructure implements if(!$skip_stats) { self::docStats($item, $occurrences, $doc_len, $num_doc_or_links, $average_doc_len, $num_docs, - $this->num_docs + $this->num_link_docs); + $this->num_docs + $this->num_link_docs, $type_weight); } return $doc_id; } + /** + * + */ + function weightedCount($position_list) { + $count = 0; + foreach($position_list as $position) { + $count += ($position < AD_HOC_TITLE_LENGTH) ? + TITLE_WEIGHT : DESCRIPTION_WEIGHT; + } + return $count; + } + + /** + * + */ + function computeProximity($position_list) { + return ($position_list[0] < AD_HOC_TITLE_LENGTH) ? + TITLE_WEIGHT : DESCRIPTION_WEIGHT; + } /** * Computes BM25F relevance and a score for the supplied item based - * on the supplied parameters - * + * on the supplied parameters. + * * @param array &$item doc summary to compute a relevance and score for. * Pass-by-ref so self::RELEVANCE and self::SCORE fields can be changed * @param int $occurrences - number of occurences of the term in the item @@ -677,9 +699,10 @@ class IndexShard extends PersistentStructure implements * @param int $num_docs either number of links or number of docs depending * if item represents a link or a doc. * @param int $total_docs_or_links number of docs or links in corpus + * @param float BM25F weight for this component (doc or link) of score */ static function docStats(&$item, $occurrences, $doc_len, $num_doc_or_links, - $average_doc_len, $num_docs, $total_docs_or_links) + $average_doc_len, $num_docs, $total_docs_or_links, $type_weight) { $doc_ratio = ($average_doc_len > 0) ? @@ -695,9 +718,7 @@ class IndexShard extends PersistentStructure implements $IDF = log(($num_docs - $num_term_occurrences + 0.5) / ($num_term_occurrences + 0.5)); - $item[self::RELEVANCE] = 0.5 * $IDF * $pre_relevance; - - + $item[self::RELEVANCE] = 0.5 * $IDF * $pre_relevance * $type_weight; $item[self::SCORE] = $item[self::DOC_RANK] * $item[self::RELEVANCE]; } @@ -1593,8 +1614,10 @@ class IndexShard extends PersistentStructure implements $shard = new IndexShard($fname); if($data === NULL) { $fh = fopen($fname, "rb"); + $shard->file_len = filesize($fname); $header = fread($fh, self::HEADER_LENGTH); } else { + $shard->file_len = strlen($data); $header = substr($data, 0, self::HEADER_LENGTH); $pos = self::HEADER_LENGTH; } diff --git a/locale/en-US/statistics.txt b/locale/en-US/statistics.txt index b6bef56f0..5a165df53 100755 --- a/locale/en-US/statistics.txt +++ b/locale/en-US/statistics.txt @@ -1 +1 @@ -d:99; \ No newline at end of file +d:100; \ No newline at end of file