diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 6bf0b71a1..016ab7637 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -136,6 +136,7 @@ interface CrawlConstants
const META_WORDS ='ao';
const CACHE_PAGE_PARTITION = 'ap';
const GENERATION = 'aq';
+ const HASH_URL_COUNT = 'ar';
const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 2211def1a..2b1973034 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -203,10 +203,10 @@ class IndexArchiveBundle implements CrawlConstants
* Adds the provided mini inverted index data to the IndexArchiveBundle
* Expects initGenerationToAdd to be called before, so generation is correct
*
- * @param object &$index_shard a mini inverted index of word_key=>doc data
+ * @param object $index_shard a mini inverted index of word_key=>doc data
* to add to this IndexArchiveBundle
*/
- public function addIndexData(&$index_shard)
+ public function addIndexData($index_shard)
{
crawlLog("**ADD INDEX DIAGNOSTIC INFO...");
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index e38554342..5f4ebbaec 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -254,12 +254,27 @@ class GroupIterator extends IndexBundleIterator
$hash = $pre_out_pages[$hash_url][0][self::HASH];
if(isset($seen_hashes[$hash])) {
$previous_url = $seen_hashes[$hash];
- $pre_out_pages[$previous_url] =
- array_merge($pre_out_pages[$previous_url],
+ if($pre_out_pages[$previous_url][0][
+ self::HASH_URL_COUNT] >=
+ count($pre_out_pages[$hash_url])) {
+ $pre_out_pages[$previous_url] =
+ array_merge($pre_out_pages[$previous_url],
$pre_out_pages[$hash_url]);
- unset($pre_out_pages[$hash_url]);
+ unset($pre_out_pages[$hash_url]);
+ } else {
+ $seen_hashes[$hash] = $hash_url;
+ $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
+ count($pre_out_pages[$hash_url]);
+ $pre_out_pages[$hash_url] =
+ array_merge($pre_out_pages[$hash_url],
+ $pre_out_pages[$previous_url]
+ );
+ unset($pre_out_pages[$previous_url]);
+ }
} else {
$seen_hashes[$hash] = $hash_url;
+ $pre_out_pages[$hash_url][0][self::HASH_URL_COUNT] =
+ count($pre_out_pages[$hash_url]);
}
}
}
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index ec5a0937e..716f83238 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -242,18 +242,21 @@ class WordIterator extends IndexBundleIterator
$this->advanceGeneration();
}
if($gen_doc_offset !== null) {
- while($this->current_generation < $gen_doc_offset[0]) {
+ $last_current_generation = -1;
+ while($this->current_generation < $gen_doc_offset[0] &&
+ $last_current_generation != $this->current_generation) {
+ $last_current_generation = $this->current_generation; // record BEFORE advancing: the guard then stops only when advanceGeneration() makes no progress, instead of after a single pass
$this->advanceGeneration();
}
$this->index->setCurrentShard($this->current_generation, true);
- $this->current_offset =
+ $this->current_offset =
$this->index->getCurrentShard(
- )->nextPostingOffsetDocOffset($this->next_offset,
+ )->nextPostingOffsetDocOffset($this->next_offset,
$this->last_offset, $gen_doc_offset[1]);
if($this->current_offset === false) {
$this->current_offset = $this->last_offset + 1;
- return;
+ $this->advanceGeneration();
}
$this->seen_docs =
($this->current_offset - $this->start_offset)/
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 2f35d2bf5..c6e3cea3c 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -554,11 +554,11 @@ class IndexShard extends PersistentStructure implements CrawlConstants
$high = $end;
$stride = 1;
$gallop_phase = true;
-
do {
$posting = $this->getWordDocsSubstring($current*self::POSTING_LEN,
self::POSTING_LEN);
list($post_doc_index, ) = $this->unpackPosting($posting);
+
if($doc_index == $post_doc_index) {
return $current * self::POSTING_LEN;
} else if($doc_index < $post_doc_index) {
@@ -574,6 +574,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants
if($gallop_phase) {
$current += $stride;
$stride <<= 1;
+ if($current > $end ) {
+ $current = $end;
+ $gallop_phase = false;
+ }
+ } else if($current >= $end) {
+ return false;
} else {
if($current + 1 == $high) {
$current++;
@@ -626,9 +632,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
* Adds the contents of the supplied $index_shard to the current index
* shard
*
- * @param object &$index_shard the shard to append to the current shard
+ * @param object $index_shard the shard to append to the current shard
*/
- function appendIndexShard(&$index_shard)
+ function appendIndexShard($index_shard)
{
if($this->word_docs_packed == true) {
$this->unpackWordDocs();
@@ -642,9 +648,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
$postings_len = strlen($postings);
// update doc offsets for newly added docs
for($i = 0; $i < $postings_len; $i += self::POSTING_LEN) {
- $num = unpackInt(substr($postings, $i, 4));
+ $num = unpackInt(substr($postings, $i, self::POSTING_LEN));
$num += ($this->docids_len << 4);
- charCopy(pack("N", $num), $postings, $i, 4);
+ charCopy(pack("N", $num), $postings, $i, self::POSTING_LEN);
}
if(!isset($this->words[$word_id])) {
$this->words[$word_id] = $postings;
@@ -683,7 +689,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
substr($doc_info_string, 0, self::POSTING_LEN));
list($doc_len, $num_keys) =
$this->unpackPosting(substr($doc_info_string,
- self::POSTING_LEN));
+ self::POSTING_LEN, self::POSTING_LEN));
$key_count = ($num_keys % 2 == 0) ? $num_keys + 2: $num_keys + 1;
$row_len = self::DOC_KEY_LEN * ($key_count);
@@ -1095,9 +1101,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
* Split a header string into a shards field variable
*
* @param string $header a string with packed shard header data
- * @param object &shard IndexShard to put data into
+ * @param object $shard IndexShard to put data into
*/
- static function headerToShardFields($header, &$shard)
+ static function headerToShardFields($header, $shard)
{
$header_array = str_split($header, 4);
$header_data = array_map('unpackInt', $header_array);
@@ -1121,7 +1127,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
* @param int $key index in array - we don't use
* @param object $shard IndexShard to add the entry to word table for
*/
- static function makeWords(&$value, $key, &$shard)
+ static function makeWords(&$value, $key, $shard)
{
$shard->words[substr($value, 0, 8)] = substr($value, 8, 8);
}