Last commit for lib/index_dictionary.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage library
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2013
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}


/**
 *Loads common constants for web crawling
 */
require_once  BASE_DIR.'/lib/crawl_constants.php';

/**
 * Data structure used to store for entries of the form:
 * word id, index shard generation, posting list offset, and length of
 * posting list. It has entries for all words stored in a given
 * IndexArchiveBundle. There might be multiple entries for a given word_id
 * if it occurs in more than one index shard in the given IndexArchiveBundle.
 *
 * In terms of file structure, a dictionary is stored a folder consisting of
 * 256 subfolders. Each subfolder is used to store the word_ids beginning with
 * a particular character. Within a folder are files of various tier levels
 * representing the data stored. As crawling proceeds words from a shard are
 * added to the dictionary in files of tier level 0 either with suffix A or B.
 * If it is detected that both an A and a B file of a given tier level exist,
 * then the results of these two files are merged to a new file at one tier
 * level up . The old files are then deleted. This process is applied
 * recursively until there is at most an A file on each level.
 *
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage library
 */

class IndexDictionary implements CrawlConstants
{
    /**
     * Folder name to use for this IndexDictionary
     * @var string
     */
    var $dir_name;
    /**
     * Array of file handle for files in the dictionary. Members are used
     * to read files to look up words.
     *
     * @var resource
     */
    var $fhs;

    /**
     * Array of file lengths for files in the dictionary. Use so don't try to
     * seek past end of files
     *
     * @var int
     */
    var $file_lens;

    /**
     * An cached array of disk blocks for an index dictionary that has not
     * been completely loaded into memory.
     * @var array
     */
    var $blocks;

    /**
     * The highest tiered index in the IndexDictionary
     * @var int
     */
    var $max_tier;

    /**
     * Tier currently being used to read dictionary data from
     * @var int
     */
    var $read_tier;
    /**
     * Tiers which currently have data for reading
     * @var array
     */
    var $active_tiers;

    /**
     * Length of the doc strings for each of the shards that have been added
     * to the dictionary.
     * @var array
     */
    var $shard_doc_lens;

    /**
     * When merging two files on a given dictionary tier. This is the max number
     * of bytes to read in one go. (Must be divisible by WORD_ITEM_LEN)
     */
     const SEGMENT_SIZE = 20000000;

    /**
     * Size in bytes of one block in IndexDictionary
     */
    const DICT_BLOCK_SIZE = 4096;

    /**
     * Disk block size is 1<< this power
     */
    const DICT_BLOCK_POWER = 12;
    /**
     * Size of an item in the prefix index used to look up words.
     * If the sub-dir was 65 (ASCII A), and the second char  was also
     * ASCII 65, then the corresonding prefix record would be the
     * offset to the first word_id beginning with AA, followed by the
     * number of such AA records.
     */
    const PREFIX_ITEM_SIZE = 8;
    /**
     * Number of possible prefix records (number of possible values for
     * second char of a word id)
     */
    const NUM_PREFIX_LETTERS = 256;
    /**
     * One dictionary file represents the words whose ids begin with a
     * fixed char. Amongst these id, the prefix index gives offsets for
     * where id's with a given second char start. The total length of the
     * records needed is PREFIX_ITEM_SIZE * NUM_PREFIX_LETTERS.
     */
    const PREFIX_HEADER_SIZE = 2048;

    /**
     * Makes an index dictionary with the given name
     *
     * @param string $dir_name the directory name to store the index dictionary
     *      in
     */
    function __construct($dir_name)
    {
        $this->dir_name = $dir_name;
        if(!is_dir($this->dir_name)) {
            mkdir($this->dir_name);
            IndexDictionary::makePrefixLetters($this->dir_name);
            $this->max_tier = 0;
        } else {
            $this->max_tier = unserialize(
                file_get_contents($this->dir_name."/max_tier.txt"));
            $this->read_tier = $this->max_tier;
            $tiers = glob($this->dir_name."/0/*A.dic");
            natsort($tiers);
            $this->active_tiers = array();
            foreach($tiers as $tier) {
                $path = pathinfo($tier);
                array_unshift($this->active_tiers,
                    substr($path["filename"], 0, -1));
            }
        }
    }

    /**
     * Makes dictionary sub-directories for each of the 256 possible first
     * hash characters that crawHash in raw mode code output.
     * @param string $dir_name base directory in which these sub-directories
     *      should be made
     */
    static function makePrefixLetters($dir_name)
    {
        for($i = 0; $i < self::NUM_PREFIX_LETTERS; $i++) {
            mkdir($dir_name."/$i");
        }
        file_put_contents($dir_name."/max_tier.txt",
            serialize(0));
    }

    /**
     * Adds the words in the provided IndexShard to the dictionary.
     * Merges tiers as needed.
     *
     * @param object $index_shard the shard to add the word to the dictionary
     *      with
     * @param object $callback object with join function to be
     *      called if process is taking too  long
     */
    function addShardDictionary($index_shard, $callback = NULL)
    {
        $out_slot = "A";
        if(file_exists($this->dir_name."/0/0A.dic")) {
            $out_slot ="B";
        }
        crawlLog("Adding shard data to dictionary files...");
        $index_shard->getShardHeader();
        $base_offset = IndexShard::HEADER_LENGTH + $index_shard->prefixes_len;
        $prefix_string = $index_shard->getShardSubstring(
            IndexShard::HEADER_LENGTH, $index_shard->prefixes_len, false);
        $next_offset = $base_offset;
        $word_item_len = IndexShard::WORD_ITEM_LEN;
        $num_prefix_letters = self::NUM_PREFIX_LETTERS;
        $prefix_item_size = self::PREFIX_ITEM_SIZE;
        for($i = 0; $i < $num_prefix_letters; $i++) {

            $last_offset = $next_offset;
            // adjust prefix values
            $first_offset_flag = true;
            $last_set = -1;
            for($j = 0; $j < $num_prefix_letters; $j++) {
                $prefix_info = $this->extractPrefixRecord($prefix_string,
                        ($i << 8) + $j);
                if($prefix_info !== false) {
                    list($offset, $count) = $prefix_info;
                    if($first_offset_flag) {
                        $first_offset = $offset;
                        $first_offset_flag = false;
                    }
                    $offset -= $first_offset;
                    $out = pack("N*", $offset, $count);
                    $last_set = $j;
                    $last_out = $prefix_info;
                    charCopy($out, $prefix_string,
                        (($i << 8) + $j) * $prefix_item_size,
                        $prefix_item_size);
                }
            }
            // write prefixes
            $fh = fopen($this->dir_name."/$i/0".$out_slot.".dic", "wb");
            fwrite($fh, substr($prefix_string,
                $i*self::PREFIX_HEADER_SIZE, self::PREFIX_HEADER_SIZE));
            $j = self::NUM_PREFIX_LETTERS;
            // write words
            if($last_set >= 0) {
                list($offset, $count) = $last_out;
                $next_offset = $base_offset + $offset +
                    $count * $word_item_len;
                fwrite($fh, $index_shard->getShardSubstring($last_offset,
                    $next_offset - $last_offset, false));
            }
            fclose($fh);
        }
        unset($prefix_string);
        crawlLog("Incrementally Merging tiers of dictionary");
        // log merge tiers if needed
        $tier = 0;
        while($out_slot == "B") {
            if($callback != NULL) {
                $callback->join();
            }
            $out_slot = "A";
            if(file_exists($this->dir_name."/0/".($tier + 1)."A.dic")) {
                $out_slot ="B";
            }
            $this->mergeTier($tier, $out_slot);
            $tier++;
            if($tier > $this->max_tier) {
                $this->max_tier = $tier;
                file_put_contents($this->dir_name."/max_tier.txt",
                    serialize($this->max_tier));
            }
        }
        crawlLog("...Done Incremental (Not Full) Merging of Dictionary Tiers");

    }

    /**
     * Merges for each first letter subdirectory, the $tier pair of files
     * of dictinary words. The output is stored in $out_slot.
     *
     * @param int $tier tier level to perform the merge of files at
     * @param string either "A" or "B", the suffix but not extension of the
     *      file one tier up to create with the merged results.
     */
    function mergeTier($tier, $out_slot)
    {
        for($i = 0; $i < self::NUM_PREFIX_LETTERS; $i++) {
            $this-> mergeTierFiles($i, $tier, $out_slot);
        }
    }

    /**
     * For a fixed prefix directory merges the $tier pair of files
     * of dictinary words. The output is stored in $out_slot.
     *
     * @param int $prefix which prefix directory to perform the merge of files
     * @param int $tier tier level to perform the merge of files at
     * @param string either "A" or "B", the suffix but not extension of the
     *      file one tier up to create with the merged results.
     */
    function mergeTierFiles($prefix, $tier, $out_slot)
    {
        $file_a = $this->dir_name."/$prefix/$tier"."A.dic";
        $file_b = $this->dir_name."/$prefix/$tier"."B.dic";
        $size_a = filesize($file_a);
        $size_b = filesize($file_b);

        $fhA = fopen( $file_a, "rb");
        $fhB = fopen( $file_b, "rb");
        $fhOut = fopen( $this->dir_name."/$prefix/".($tier + 1).
            "$out_slot.dic", "wb");
        $prefix_string_a = fread($fhA, self::PREFIX_HEADER_SIZE);
        $prefix_string_b = fread($fhB, self::PREFIX_HEADER_SIZE);
        $prefix_string_out = "";
        $offset = 0;
        $word_item_len = IndexShard::WORD_ITEM_LEN;
        $blank = IndexShard::BLANK;
        $num_prefix_letters = self::NUM_PREFIX_LETTERS;

        for($j = 0; $j < $num_prefix_letters; $j++) {
            $record_a = $this->extractPrefixRecord($prefix_string_a, $j);
            $record_b = $this->extractPrefixRecord($prefix_string_b, $j);
            if($record_a === false && $record_b === false) {
                $prefix_string_out .= $blank;
            } else if($record_a === false){
                $prefix_string_out .=
                    $this->makePrefixRecord($offset, $record_b[1]);
                $offset += $record_b[1] * $word_item_len;
            } else if($record_b === false){
                $prefix_string_out .=
                    $this->makePrefixRecord($offset, $record_a[1]);
                $offset += $record_a[1] * $word_item_len;
            } else {
                $count = $record_a[1] + $record_b[1];
                $prefix_string_out .=
                    $this->makePrefixRecord($offset, $count);
                $offset += $count * $word_item_len;
            }
        }
        fwrite($fhOut, $prefix_string_out);
        $remaining_a = $size_a - self::PREFIX_HEADER_SIZE;
        $remaining_b = $size_b - self::PREFIX_HEADER_SIZE;
        $done = false;
        $work_string_a = "";
        $read_size_a = 0;
        $offset_a = 0;
        $work_string_b = "";
        $read_size_b = 0;
        $offset_b = 0;
        $out = "";
        $out_len = 0;
        $segment_size = self::SEGMENT_SIZE;


        while($remaining_a > 0 || $remaining_b > 0 ||
            $offset_a < $read_size_a || $offset_b < $read_size_b) {
            if($offset_a >= $read_size_a && $remaining_a > 0) {
                $read_size_a = min($remaining_a, $segment_size);
                $work_string_a = fread($fhA, $read_size_a);
                $remaining_a -= $read_size_a;
                $offset_a = 0;
            }
            if($offset_b >= $read_size_b && $remaining_b > 0) {
                $read_size_b = min($remaining_b, $segment_size);
                $work_string_b = fread($fhB, $read_size_b);
                $remaining_b -= $read_size_b;
                $offset_b = 0;
            }
            if($offset_a < $read_size_a) {
                $record_a = substr($work_string_a, $offset_a,
                    $word_item_len);
            }
            if($offset_b < $read_size_b) {
                $record_b = substr($work_string_b, $offset_b,
                    $word_item_len);
            }
            if($offset_b >= $read_size_b) {
                $out .= $record_a;
                $offset_a += $word_item_len;
            } else if ($offset_a >= $read_size_a) {
                $out .= $record_b;
                $offset_b += $word_item_len;
            } else if ($this->recordCmp($record_a, $record_b) < 0){
                $out .= $record_a;
                $offset_a += $word_item_len;
            } else {
                $out .= $record_b;
                $offset_b += $word_item_len;
            }
            $out_len += $word_item_len;
            if($out_len >=  $segment_size) {
                fwrite($fhOut, $out);
                $out = "";
                $out_len = 0;
            }
        }
        fwrite($fhOut, $out);
        fclose($fhA);
        fclose($fhB);
        unlink($file_a);
        unlink($file_b);
        fclose($fhOut);
    }

    /**
     * Does a lexicographical comparison of the word_ids of two word records.
     *
     * @param string $record_a
     * @param string $record_b
     * @return int less than 0 if $record_a less than $record_b;
     *      greater than 0 if $record_b is less than $record_a; 0 otherwise
     */
    function recordCmp($record_a, $record_b)
    {
        return strcmp(substr($record_a, 0, IndexShard::WORD_KEY_LEN),
            substr($record_b, 0,  IndexShard::WORD_KEY_LEN));
    }

    /**
     * Returns the $record_num'th prefix record from $prefix_string
     *
     * @param string $prefix_string string to get record from
     * @param int $record_num which record to extract
     * @return array $offset, $count  array
     */
    function extractPrefixRecord(&$prefix_string, $record_num)
    {

        $record = substr($prefix_string, self::PREFIX_ITEM_SIZE*$record_num,
             self::PREFIX_ITEM_SIZE);
        if($record == IndexShard::BLANK) {
            return false;
        }
        return array_values(unpack("N*", $record));
    }

    /**
     * Makes a prefix record string out of an offset and count (packs and
     * concatenates).
     *
     * @param int $offset byte offset into words for the prefix record
     * @param int $count number of word with that prefix
     * @return string the packed record
     */
    function makePrefixRecord($offset, $count)
    {
        return pack("N*", $offset, $count);
    }

    /**
     * Merges for each tier and for each first letter subdirectory,
     * the $tier pair of (A and B) files  of dictionary words. If max_tier has
     * not been reached but only one of the two tier files is present then that
     * file is renamed with a name one tier higher. The output in all cases is
     * stored in file ending with A or B one tier up. B is used if an A file is
     * already present.
     * @param object $callback object with join function to be
     *      called if process is taking too long
     * @param int $max_tier the maximum tier to merge to merge till --
     *      if not set then $this->max_tier used. Otherwise, one would
     *      typically set to a value bigger than $this->max_tier
     */
    function mergeAllTiers($callback = NULL, $max_tier = -1)
    {
        $new_tier = false;

        crawlLog("Starting Full Merge of Dictionary Tiers");

        if($max_tier == -1) {
            $max_tier = $this->max_tier;
        }

        for($i = 0; $i < self::NUM_PREFIX_LETTERS; $i++) {
            for($j = 0; $j <= $max_tier; $j++) {
                crawlLog("...Processing Prefix Number $i Tier $j Max Tier ".
                    $max_tier);
                if($callback != NULL) {
                    $callback->join();
                }
                $a_exists = file_exists($this->dir_name."/$i/".$j."A.dic");
                $b_exists = file_exists($this->dir_name."/$i/".$j."B.dic");
                $higher_a = file_exists($this->dir_name."/$i/".($j+1)."A.dic");
                if($a_exists && $b_exists) {
                    $out_slot = ($higher_a) ? "B" : "A";
                    $this->mergeTierFiles($i, $j, $out_slot);
                    if($j == $max_tier) {$new_tier = true;}
                } else if ($a_exists && $higher_a) {
                    rename($this->dir_name."/$i/".$j."A.dic",
                        $this->dir_name."/$i/".($j + 1)."B.dic");
                } else if ($a_exists && $j < $max_tier) {
                    rename($this->dir_name."/$i/".$j."A.dic",
                        $this->dir_name."/$i/".($j + 1)."A.dic");
                }
            }
        }
        if($new_tier) {
            $max_tier++;
            file_put_contents($this->dir_name."/max_tier.txt",
                serialize($max_tier));
            $this->max_tier = $max_tier;
        }
        crawlLog("...End Full Merge of Dictionary Tiers");
    }

    /**
     * For each index shard generation a word occurred in, return as part of
     * array, an array entry of the form generation, first offset,
     * last offset, and number of documents the
     * word occurred in for this shard. The first offset (similarly, the last
     * offset) is the byte offset into the word_docs string of the first
     * (last) record involving that word.
     *
     * @param string $word_id id of the word one wants to look up
     * @param bool $raw whether the id is our version of base64 encoded or not
     * @param bool $extract whether to extract an array of entries or to just
     *      return the word info as a string
     * @return mixed an array of entries of the form
     *      generation, first offset, last offset, count or
     *      just a string of the word_info data if $extract is false
     */
     function getWordInfo($word_id, $raw = false, $extract = true)
     {
        $info = array();
        foreach($this->active_tiers as $tier) {
            $tier_info =$this->getWordInfoTier($word_id, $raw, $extract, $tier);
            if(is_array($tier_info)) {
                $info = array_merge($info, $tier_info);
            }
        }
        return $info;
     }

     /**
      *  This method facilitates query processing of an ongoing crawl.
      *  During an ongoing crawl, the dictionary is arranged into tiers
      *  as per the logarithmic merge algortihm rather than just one tier
      *  as in a crawl that has been stopped.  Word info for more
      *  recently crawled pages will tend to be in lower tiers than data
      *  that was crawled earlier. getWordInfoTier gets word info data for
      *  a specific tier in the index dictionary. Each tier will
      *  have word info for a specific, disjoint set of shards, so the format of
      *  how to look up posting lists in a shard can be the same
      *  regardless of the tier: an array entry is of the form
      *  generation, first offset, last offset, and number of documents the
      *  word occurred in for this shard.
      *
      *  @param string $word_id id of the word one wants to look up
      *  @param bool $raw whether the id is our version of base64 encoded or
      *  not
      *  @param int $tier which tier to get word info from
      *  @return mixed an array of entries of the form
      *      generation, first offset, last offset, count or
      *      just a string of the word_info data if $extract is false
      */
     function getWordInfoTier($word_id, $raw, $extract, $tier)
     {
        if(isset($this->fhs)) {
            $this->tier_fhs[$this->read_tier] = $this->fhs;
            if(isset($this->tier_fhs[$tier])) {
                $this->fhs = $this->tier_fhs[$tier];
            } else {
                $this->fhs = array();
            }
        } else {
            $this->fhs = array();
        }
        $this->read_tier = $tier;
        $word_key_len = IndexShard::WORD_KEY_LEN;
        if(strlen($word_id) < $word_key_len) {
            return false;
        }
        if($raw == false) {
            //get rid of out modified base64 encoding
            $word_id = unbase64Hash($word_id);
        }

        $word_item_len = IndexShard::WORD_ITEM_LEN;
        $word_data_len = $word_item_len - $word_key_len;
        $file_num = ord($word_id[0]);

        $prefix = ord($word_id[1]);
        $prefix_info = $this->getDictSubstring($file_num,
            self::PREFIX_ITEM_SIZE*$prefix, self::PREFIX_ITEM_SIZE);
        if($prefix_info == IndexShard::BLANK) {
            return false;
        }
        list(, $offset, $high) = unpack("N*", $prefix_info);
        $high--;

        $start = self::PREFIX_HEADER_SIZE  + $offset;
        $low = 0;
        $check_loc = (($low + $high) >> 1);
        $found = false;
        // find a record with word id
        do {
            $old_check_loc = $check_loc;

            $word_string = $this->getDictSubstring($file_num, $start +
                $check_loc * $word_item_len, $word_item_len);

            if($word_string == false) {return false;}
            $id = substr($word_string, 0, $word_key_len);
            $cmp = strcmp($word_id, $id);
            if($cmp === 0) {
                $found = true;
                break;
            } else if ($cmp < 0) {
                $high = $check_loc;
                $check_loc = (($low + $check_loc) >> 1);
            } else {
                if($check_loc + 1 == $high) {
                    $check_loc++;
                }
                $low = $check_loc;
                $check_loc = (($high + $check_loc) >> 1);
            }
        } while($old_check_loc != $check_loc);

        if(!$found) {
            return false;
        }
        //now extract the info

        $word_string = substr($word_string, $word_key_len);
        if($extract) {
            $info = array();
            $info[0] = IndexShard::getWordInfoFromString($word_string, true);
        } else {
            $info = $word_string;
        }
        //up to first record with word id
        $test_loc = $check_loc - 1;
        $start_loc = $check_loc;

        while ($test_loc >= $low) {
            $word_string = $this->getDictSubstring($file_num, $start +
                $test_loc * $word_item_len, $word_item_len);
            if($word_string == "" ) break;
            $id = substr($word_string, 0, $word_key_len);
            if(strcmp($word_id, $id) != 0 ) break;
            $start_loc = $test_loc;
            $test_loc--;
            $ws = substr($word_string, $word_key_len);
            if($extract) {
                $tmp = IndexShard::getWordInfoFromString($ws, true);
                array_push($info, $tmp);
            } else {
                $info = $ws . $info;
            }
        }
        //until last record with word id

        $test_loc = $check_loc + 1;

        while ($test_loc <= $high) {
            $word_string = $this->getDictSubstring($file_num, $start +
                $test_loc * $word_item_len, $word_item_len);
            if($word_string == "" ) break;
            $id = substr($word_string, 0, $word_key_len);
            if(strcmp($word_id, $id) != 0 ) break;
            $test_loc++;
            $ws = substr($word_string, $word_key_len);
            if($extract) {
                $tmp = IndexShard::getWordInfoFromString($ws, true);
                array_unshift($info, $tmp);
            } else {
                $info .= $ws;
            }
        }
        return $info;
    }

    /**
     *  Given an array of $key => $word_id associations returns an array of
     *  $key => $num_docs of that $word_id
     *
     *  @param array &$key_words associative array of $key => $word_id's
     *  @return array $key => $num_docs associations
     */
     function getNumDocsArray(&$key_words)
     {
        $file_key_words = array();
        foreach($key_words as $key => $word_id) {
            $file_key_words[ord($word_id[0])][$key] = $word_id;
        }
        $num_docs_array = array();
        foreach($file_key_words as $file_num => $k_words) {
            foreach($k_words as $key => $word_id) {
                $info = $this->getWordInfo($word_id, true);
                $num_generations = count($info);
                $num_docs = 0;
                for($i = 0; $i < $num_generations; $i++) {
                    $num_docs += $info[$i][3];
                }
                $num_docs_array[$key] = $num_docs;
            }
        }
        return $num_docs_array;
     }

    /**
     *  Gets from disk $len many bytes beginning at $offset from the
     *  $file_num prefix file in the index dictionary
     *
     * @param int $file_num which prefix file to read from (always reads
     *      a file at the max_tier level)
     * @param int $offset byte offset to start reading from
     * @param int $len number of bytes to read
     * @return string data from that location  in the shard
     */
    function getDictSubstring($file_num, $offset, $len)
    {
        $block_offset =  ($offset >> self::DICT_BLOCK_POWER)
            << self::DICT_BLOCK_POWER;
        $start_loc = $offset - $block_offset;
        //if all in one block do it quickly
        if($start_loc + $len < self::DICT_BLOCK_SIZE) {
            $data = $this->readBlockDictAtOffset($file_num, $block_offset);
            return substr($data, $start_loc, $len);
        }

        // otherwise, this loop is slower, but handles general case
        $data = $this->readBlockDictAtOffset($file_num, $block_offset);
        if($data === false) {return "";}
        $substring = substr($data, $start_loc);
        $db_size = self::DICT_BLOCK_SIZE;
        while (strlen($substring) < $len) {
            $data = $this->readBlockDictAtOffset($file_num, $block_offset);
            if($data === false) {return $substring;}
            $block_offset += $db_size;
            $substring .= $data;
        }

        return substr($substring, 0, $len);
    }


    /**
     * Reads DICT_BLOCK_SIZE bytes from the prefix file $file_num beginning
     * at byte offset $bytes
     *
     * @param int $file_num which dictionary file (given by first letter prefix)
     *      to read from
     * @param int $bytes byte offset to start reading from
     * @return &string data fromIndexShard file
     */
    function &readBlockDictAtOffset($file_num, $bytes)
    {
        $false = false;
        if(isset($this->blocks[$file_num][$bytes])) {
            return $this->blocks[$file_num][$bytes];
        }
        if(!isset($this->fhs[$file_num]) || $this->fhs[$file_num] === NULL) {
            $file_name = $this->dir_name."/$file_num/".$this->read_tier."A.dic";
            if(!file_exists($file_name)) return $false;
            $this->fhs[$file_num] = fopen($file_name, "rb");
            if($this->fhs[$file_num] === false) return $false;
            $this->file_lens[$file_num] = filesize($file_name);
        }
        if($bytes >= $this->file_lens[$file_num]) {

            return $false;
        }
        $seek = fseek($this->fhs[$file_num], $bytes, SEEK_SET);
        if($seek < 0) {
            return $false;
        }
        $this->blocks[$file_num][$bytes] = fread($this->fhs[$file_num],
            self::DICT_BLOCK_SIZE);

        return $this->blocks[$file_num][$bytes];
    }


}
 ?>

ViewGit