fixes a bug in galloping search and one in grouping, a=chris

Chris Pollett [2010-12-30 00:Dec:th]
fixes a bug in galloping search and one in grouping, a=chris
Filename
lib/crawl_constants.php
lib/index_archive_bundle.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/word_iterator.php
lib/index_shard.php
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 6bf0b71a1..016ab7637 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -136,6 +136,7 @@ interface CrawlConstants
     const META_WORDS ='ao';
     const CACHE_PAGE_PARTITION = 'ap';
     const GENERATION = 'aq';
+    const HASH_URL_COUNT = 'ar';

     const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;

diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 2211def1a..2b1973034 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -203,10 +203,10 @@ class IndexArchiveBundle implements CrawlConstants
      * Adds the provided mini inverted index data to the IndexArchiveBundle
      * Expects initGenerationToAdd to be called before, so generation is correct
      *
-     * @param object &$index_shard a mini inverted index of word_key=>doc data
+     * @param object $index_shard a mini inverted index of word_key=>doc data
      *      to add to this IndexArchiveBundle
      */
-    public function addIndexData(&$index_shard)
+    public function addIndexData($index_shard)
     {

         crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index e38554342..5f4ebbaec 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -254,12 +254,27 @@ class GroupIterator extends IndexBundleIterator
                     $hash = $pre_out_pages[$hash_url][0][self::HASH];
                     if(isset($seen_hashes[$hash])) {
                         $previous_url = $seen_hashes[$hash];
-                        $pre_out_pages[$previous_url] =
-                            array_merge($pre_out_pages[$previous_url],
+                        if($pre_out_pages[$previous_url][0][
+                            self::HASH_URL_COUNT] >=
+                            count($pre_out_pages[$hash_url])) {
+                            $pre_out_pages[$previous_url] =
+                                array_merge($pre_out_pages[$previous_url],
                                 $pre_out_pages[$hash_url]);
-                        unset($pre_out_pages[$hash_url]);
+                            unset($pre_out_pages[$hash_url]);
+                        } else {
+                            $seen_hashes[$hash] = $hash_url;
+                            $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
+                                count($pre_out_pages[$hash_url]);
+                            $pre_out_pages[$hash_url] =
+                                array_merge($pre_out_pages[$hash_url],
+                                    $pre_out_pages[$previous_url]
+                                );
+                            unset($pre_out_pages[$previous_url]);
+                        }
                     } else {
                         $seen_hashes[$hash] = $hash_url;
+                        $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
+                            count($pre_out_pages[$hash_url]);
                     }
                 }
             }
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index ec5a0937e..716f83238 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -242,18 +242,21 @@ class WordIterator extends IndexBundleIterator
                 $this->advanceGeneration();
             }
             if($gen_doc_offset !== null) {
-                while($this->current_generation < $gen_doc_offset[0]) {
+                $last_current_generation = -1;
+                while($this->current_generation < $gen_doc_offset[0] &&
+                      $last_current_generation != $this->current_generation) {
                     $this->advanceGeneration();
+                    $last_current_generation = $this->current_generation;
                 }
                 $this->index->setCurrentShard($this->current_generation, true);

-                $this->current_offset =
+                $this->current_offset =
                     $this->index->getCurrentShard(
-                        )->nextPostingOffsetDocOffset($this->next_offset,
+                        )->nextPostingOffsetDocOffset($this->next_offset,
                             $this->last_offset, $gen_doc_offset[1]);
                 if($this->current_offset === false) {
                     $this->current_offset = $this->last_offset + 1;
-                    return;
+                    $this->advanceGeneration();
                 }
                 $this->seen_docs =
                     ($this->current_offset - $this->start_offset)/
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 2f35d2bf5..c6e3cea3c 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -554,11 +554,11 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         $high = $end;
         $stride = 1;
         $gallop_phase = true;
-
         do {
             $posting = $this->getWordDocsSubstring($current*self::POSTING_LEN,
                 self::POSTING_LEN);
             list($post_doc_index, ) = $this->unpackPosting($posting);
+
             if($doc_index == $post_doc_index) {
                 return $current * self::POSTING_LEN;
             } else if($doc_index < $post_doc_index) {
@@ -574,6 +574,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 if($gallop_phase) {
                     $current += $stride;
                     $stride <<= 1;
+                    if($current > $end ) {
+                        $current = $end;
+                        $gallop_phase = false;
+                    }
+                } else if($current >= $end) {
+                    return false;
                 } else {
                     if($current + 1 == $high) {
                         $current++;
@@ -626,9 +632,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * Adds the contents of the supplied $index_shard to the current index
      * shard
      *
-     * @param object &$index_shard the shard to append to the current shard
+     * @param object $index_shard the shard to append to the current shard
      */
-    function appendIndexShard(&$index_shard)
+    function appendIndexShard($index_shard)
     {
         if($this->word_docs_packed == true) {
             $this->unpackWordDocs();
@@ -642,9 +648,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $postings_len = strlen($postings);
             // update doc offsets for newly added docs
             for($i = 0; $i < $postings_len; $i += self::POSTING_LEN) {
-                $num = unpackInt(substr($postings, $i, 4));
+                $num = unpackInt(substr($postings, $i, self::POSTING_LEN));
                 $num += ($this->docids_len << 4);
-                charCopy(pack("N", $num), $postings, $i, 4);
+                charCopy(pack("N", $num), $postings, $i, self::POSTING_LEN);
             }
             if(!isset($this->words[$word_id])) {
                 $this->words[$word_id] = $postings;
@@ -683,7 +689,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 substr($doc_info_string, 0, self::POSTING_LEN));
             list($doc_len, $num_keys) =
                 $this->unpackPosting(substr($doc_info_string,
-                    self::POSTING_LEN));
+                    self::POSTING_LEN, self::POSTING_LEN));
             $key_count = ($num_keys % 2 == 0) ? $num_keys + 2: $num_keys + 1;
             $row_len = self::DOC_KEY_LEN * ($key_count);

@@ -1095,9 +1101,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      *  Split a header string into a shards field variable
      *
      *  @param string $header a string with packed shard header data
-     *  @param object &shard IndexShard to put data into
+     *  @param object $shard IndexShard to put data into
      */
-    static function headerToShardFields($header, &$shard)
+    static function headerToShardFields($header, $shard)
     {
         $header_array = str_split($header, 4);
         $header_data = array_map('unpackInt', $header_array);
@@ -1121,7 +1127,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * @param int $key index in array - we don't use
      * @param object $shard IndexShard to add the entry to word table for
      */
-    static function makeWords(&$value, $key, &$shard)
+    static function makeWords(&$value, $key, $shard)
     {
         $shard->words[substr($value, 0, 8)] = substr($value, 8, 8);
     }
ViewGit