Last commit for src/models/PhraseModel.php: 55621f89eb585b515f5c0b94348a13fae5fefd9c

fixes a getPostingsString bug where string needed to be decode255'd, remove a lot of the code for serving results for older index formats

Chris Pollett [2024-01-24 05:Jan:th]

fixes a getPostingsString bug where string needed to be decode255'd, remove a lot of the code for serving results for older index formats

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2019
 * @filesource
 */
namespace seekquarry\yioop\models;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\AnalyticsManager;
use seekquarry\yioop\library\IndexManager;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\index_bundle_iterators as I;

/**
 * logging is done during crawl not through web,
 * so it will not be used in the phrase model
 */
if (!C\nsdefined("POST_PROCESSING") && !C\nsdefined("LOG_TO_FILES")) {
    C\nsdefine("LOG_TO_FILES", false);
}
/**
 *
 * This is class is used to handle
 * results for a given phrase search
 *
 * @author Chris Pollett
 */
class PhraseModel extends ParallelModel
{
    /** an associative array of additional meta words and
     * the max description length of results if such a meta word is used
     * this array is typically set in index.php
     *
     * @var array
     */
    public $additional_meta_words;
    /**
     * Used to hold query statistics about the current query
     * @var array
     */
    public $query_info;
    /**
     * Used to hold extension of programming language which is used the language
     * @var string
     */
    public $programming_language_map;
    /**
     * A indicator to indicate source code files
     * @var string
     */
    public $program_indicator;
    /**
     * Length of info hash record phrse
     */
     const INFO_HASH_LEN = 16;
    /**
     * Number of pages (search result assoc array) to cache in one go in
     * memcache or filecache.
     * Size chosen based on 1MB max object size for memcache or filecache
     */
     const NUM_CACHE_PAGES = 10;
    /**
     * {@inheritDoc}
     *
     * @param string $db_name the name of the database for the search engine
     * @param bool $connect whether to connect to the database by default
     *     after making the datasource class
     */
    public function __construct($db_name = C\DB_NAME, $connect = true)
    {
        parent::__construct($db_name, $connect);
        $this->programming_language_map = ['java' => 'java', 'py' => 'py'];
        $this->program_indicator = false;
    }
    /**
     * Returns whether there is a index with the provide timestamp
     *
     * @param int $index_time_stamp timestamp of the index to check if in cache
     * @return bool whether it exists or not
     */
    public function indexExists($index_time_stamp)
    {
        return file_exists(C\CRAWL_DIR.'/cache/IndexData'.$index_time_stamp);
    }
    /**
     * Rewrites a mix query so that it maps directly to a query about crawls
     *
     * @param string $query the original before a rewrite
     * @param object $mix a mix object saying how the mix is built out of crawls
     *
     * @return string a rewritten query in terms of crawls
     */
    public function rewriteMixQuery($query, $mix)
    {
        $disjunct_phrases = explode("|", $query);
        $rewrite = "";
        if (isset($mix['FRAGMENTS'])) {
            foreach ($mix['FRAGMENTS'] as $fragment) {
                $pipe = "";
                foreach ($disjunct_phrases as $disjunct) {
                    $rewrite .= $pipe;
                    $pipe = ' | ';
                    $disjunct_string = $disjunct;
                    $base_weight = 1;
                    $pattern = "/(\s)(index:(\S)+)/";
                    preg_match_all($pattern, $query, $matches);
                    if (isset($matches[2][0])) {
                        $rewrite .= $disjunct;
                        continue;
                    }
                    $pattern = "/(\s)(i:(\S)+)/";
                    preg_match_all($pattern, $query, $matches);
                    if (isset($matches[2][0])) {
                        $rewrite .= $disjunct;
                        continue;
                    }
                    $pattern = "/(\s)(weight:(\S)+)/";
                    preg_match_all($pattern, $query, $matches);
                    if (isset($matches[2][0])) {
                        $base_weight = substr($matches[2][0],
                            strlen("weight:"));
                        $disjunct_string =
                            preg_replace($pattern, "", $disjunct_string);
                    }
                    $pattern = "/(\s)(w:(\S)+)/";
                    preg_match_all($pattern, $query, $matches);
                    if (isset($matches[2][0])) {
                        $base_weight = substr($matches[2][0], strlen("w:"));
                        $disjunct_string =
                            preg_replace($pattern, "", $disjunct_string);
                    }
                    $pipe2 = "";
                    if (isset($fragment['COMPONENTS'])) {
                        $start_disjunct_string = $disjunct_string;
                        foreach ($fragment['COMPONENTS'] as $component) {
                            $disjunct_string = $start_disjunct_string;
                            if (isset($component['KEYWORDS'])) {
                                $disjunct_string .= " ".$component['KEYWORDS'];
                            }
                            $stamp = ($component['CRAWL_TIMESTAMP'] <=  1) ?
                                "" : " i:".$component['CRAWL_TIMESTAMP'];
                            $rewrite .= $pipe2.$disjunct_string." w:".
                                ($component['WEIGHT'] * $base_weight).$stamp;
                            $pipe2 = ' | ';
                        }
                    }
                }
                $num_results = (isset($fragment['RESULT_BOUND']) &&
                    $fragment['RESULT_BOUND'] > 1) ?
                    $fragment['RESULT_BOUND'] : 1;
                $rewrite .= " #$num_results# ";
            }
        }
        return $rewrite;
    }
    /**
     * Given a query phrase, returns formatted document summaries of the
     * documents that match the phrase.
     *
     * @param string $input_phrase the phrase to try to match
     * @param int $low  return results beginning with the $low document
     * @param int $results_per_page  how many results to return
     * @param bool $format  whether to highlight in the returned summaries the
     *     matched text
     * @param array $filter an array of hashes of domains to filter from
     *     results
     * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
     *     an attempt will be made to look up the results in either
     *     the file cache or memcache. Otherwise, items will be recomputed
     *     and then potentially restored in cache
     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
     *     no grouping done on data also no summaries returned (only lookup
     *     info), $raw > 1 return summaries but no grouping
     * @param array $queue_servers a list of urls of yioop machines which might
     *     be used during lookup
     * @param bool $guess_semantics whether to do query rewriting before lookup
     * @param int $save_timestamp if this timestamp is nonzero, then save
     *     iterate position, so can resume on future queries that make
     *     use of the timestamp
     * @param bool $limit_feeds if true the number of feed shard items to
     *     allow in search results is limited to
     *      WordIterator::LIMIT_FEEDS_COUNT
     *
     * @return array an array of summary data
     */
    public function getPhrasePageResults(
        $input_phrase, $low = 0, $results_per_page = C\NUM_RESULTS_PER_PAGE,
        $format = true, $filter = null, $use_cache_if_allowed = true,
        $raw = 0, $queue_servers = [], $guess_semantics = true,
        $save_timestamp = 0, $limit_feeds = true)
    {
        if (C\QUERY_STATISTICS) {
            $indent= "&nbsp;&nbsp;";
            $in2 = $indent . $indent;
            $in3 = $in2 . $indent;
            $prs_cnt = 0;
            $dis_cnt = 0;
            $this->query_info = [];
            $this->query_info['QUERY'] =
                "<b>PHRASE QUERY</b>: ".$input_phrase."<br />";
            $start_time = microtime(true);
        }
        $results = null;
        $answer_score_map = [];
        $word_structs = [];
        /*
            this is a quick and dirty parsing and will usually work,
            exceptions would be # or | in quotes or if someone tried
            to escape |.

            First we split into presentation elements then we split by
            disjuncts
        */
        $presentation_parts = preg_split('/#(\d)+#/',
            $input_phrase, -1, PREG_SPLIT_DELIM_CAPTURE);
        $count = 0;
        $presentation_parts = array_chunk($presentation_parts, 2);
        $num_parts = count($presentation_parts);
        $query_parts = [];
        $last_part = null;
        for ($i = 0;  $i < $num_parts ; $i++) {
            if (isset($presentation_parts[$i][0])  &&
                ($trimmed = trim($presentation_parts[$i][0])) != "") {
                $to_return = (isset($presentation_parts[$i][1])) ?
                    $presentation_parts[$i][1]: 1;
                $query_parts[$trimmed][] =
                    [$count, $to_return];
                $last_part = $trimmed;
                if (isset($presentation_parts[$i][1])) {
                    $count += $presentation_parts[$i][1];
                } else {
                    $count ++;
                }
            }
        }
        $results_high = $low + $results_per_page;
        $num_phrases = count($query_parts);
        if ($num_phrases > 0) {
            $num_last_parts = count($query_parts[$last_part]);
            if ($query_parts[$last_part][$num_last_parts - 1][0] +
                $query_parts[$last_part][$num_last_parts - 1][1] < $low) {
                $query_parts[$last_part][$num_last_parts - 1][1] =
                    $results_high;
            }
        }
        $qpart = 0;
        if (is_string($save_timestamp)) {
            $save_parts = explode("-", $save_timestamp);
            if (isset($save_parts[1])) {
                $qpart = intval($save_parts[1]);
                $save_timestamp = intval($save_parts[0]);
            }
        }
        $orig_stimestamp = $save_timestamp;
        $network = false;
        if ($queue_servers != [] &&
            !$this->isSingleLocalhost($queue_servers)) {
            $network = true;
        }
        foreach ($query_parts as $phrase => $pre_result_bounds) {
            $phrase_high = $pre_result_bounds[0][1];
            $result_bounds = [];
            $start_flag = false;
            $num_bounds = 0;
            foreach ($pre_result_bounds as $bound) {
                if ($bound[0] > $results_high) {
                    break;
                }
                //rest of presentation after what we'll return so break
                $phrase_high =  $bound[0] + $bound[1];
                if ($phrase_high < $low) {
                    continue;
                }
                // this part of presentation is before what we'll return so skip
                $result_bounds[] = $bound;
                $num_bounds++;
            }
            if ($num_bounds == 0) {
                continue;
            }
            $is_last_part = ($phrase == $last_part);
            if ($is_last_part &&
                $result_bounds[$num_bounds - 1][0] +
                $result_bounds[$num_bounds - 1][1] < $results_high) {
                $result_bounds[$num_bounds - 1][1] = $results_high -
                    $result_bounds[$num_bounds - 1][0];
            }
            $phrase_num = max(min($phrase_high, $results_high), $results_high) -
                $low;
            $word_structs = [];
            $format_words = [];
            if (C\QUERY_STATISTICS) {
                $this->query_info['QUERY'] .= $indent .
                    "<b>Presentation $prs_cnt:</b><br />";
                $this->query_info['QUERY'] .= "$in2<i>Low</i>:".
                    $result_bounds[0][0]."<br />";
                $this->query_info['QUERY'] .= $in2 .
                    "<i>High</i>: " . $result_bounds[0][1] . "<br />";
                $prs_cnt++;
            }
            $cache_results = false;
            if (mb_strlen($phrase) > self::INFO_HASH_LEN ||
                mb_substr($phrase, 0, 5) != "info:") {
                $phrase = mb_strtolower($phrase);
            }
            if (!empty($_SERVER["USE_CACHE"]) &&
                $save_timestamp == "" && $use_cache_if_allowed && !$network) {
                $cache_results = self::$cache->get($phrase . $this->index_name);
                if (C\QUERY_STATISTICS) {
                    $this->query_info['QUERY'] .=
                        "$in2<b>Parse done by Cache Lookup</b><br />";
                    if (!empty(self::$cache->cache_file) ) {
                        $this->query_info['QUERY'] .=
                            "$in2<b>Cache File Used: </b>" .
                            self::$cache->cache_file . "<br />";
                    }
                }
            }
            if ($cache_results) {
                list($word_structs, $format_words) = $cache_results;
            } else {
                if ($guess_semantics) {
                    $disjunct_phrases = explode("|", $phrase);
                    $repeat_check = [];
                    $phrase = "";
                    $delim = " ";
                    foreach ($disjunct_phrases as $disjunct) {
                        $check = trim($disjunct);
                        if (isset($repeat_check[$check])) {
                            continue;
                        }
                        $repeat_check[$check] = true;
                        $phrase .= $delim . $this->guessSemantics(" ".
                            $disjunct);
                        $delim = " | ";
                    }
                    $repeat_check = [];
                }
                if (!$network) {
                    $disjunct_phrases = explode("|", $phrase);
                    foreach ($disjunct_phrases as $disjunct) {
                        if (C\QUERY_STATISTICS) {
                            $this->query_info['QUERY']
                                .="$in2<b>Disjunct $dis_cnt:" . "</b><br />";
                            $dis_cnt++;
                        }
                        list($word_struct, $format_words) =
                            $this->parseWordStructConjunctiveQuery($disjunct);
                        if ($word_struct != null) {
                            $word_structs[] = $word_struct;
                        }
                    }
                    if (!empty($_SERVER["USE_CACHE"]) &&
                        $save_timestamp == "") {
                        self::$cache->set($phrase .
                            $this->index_name, [$word_structs, $format_words]);
                    }
                }
            }
            if (C\QUERY_STATISTICS) {
                $this->query_info['QUERY'] .=
                    "$in2<b>Presentation Parse time</b>: " .
                    L\changeInMicrotime($start_time)."<br />";
            }
            if ($orig_stimestamp > 0) {
                $save_timestamp_name = "$orig_stimestamp-$qpart";
            } else {
                $save_timestamp_name = "";
            }
            $out_results = $this->getSummariesByHash($word_structs,
                $low, $phrase_num, $filter, $use_cache_if_allowed, $raw,
                $queue_servers, $phrase, $save_timestamp_name,
                $limit_feeds);
            if (isset($out_results['PAGES']) &&
                count($out_results['PAGES']) != 0) {
                $out_count = 0;
                foreach ($result_bounds as $bound) {
                    for ($i = $bound[0];
                        $i < min($bound[0] + $bound[1], $results_high);
                        $i++) {
                        if (isset($out_results['PAGES'][$out_count])) {
                            if (!strstr($phrase, "|") &&
                                isset($out_results['PAGES'][$out_count]
                                [self::QUESTION_ANSWERS])) {
                                $triplets_with_answer =
                                    $out_results['PAGES'][$out_count]
                                    [self::QUESTION_ANSWERS];
                                $question = trim(
                                    PhraseParser::stemCharGramSegment($phrase,
                                    L\guessLocaleFromString($phrase), true));
                                if (isset($triplets_with_answer[$question])) {
                                    $out_results['PAGES'][$out_count]['ANSWER']=
                                        $triplets_with_answer[$question];
                                    $answer = $triplets_with_answer[$question];
                                    if (array_key_exists(
                                        $answer, $answer_score_map)) {
                                        $new_score = $answer_score_map[$answer]
                                        + $out_results['PAGES'][$out_count]
                                            ['OUT_SCORE'];
                                    } else {
                                        $answer_score_map[$answer] =
                                        $out_results['PAGES'][$out_count]
                                            ['OUT_SCORE'];
                                    }
                                }
                            }
                            $results['PAGES'][$i] =
                                $out_results['PAGES'][$out_count];
                            $out_count++;
                        }
                    }
                }
                if ($is_last_part && isset($out_results['TOTAL_ROWS'])) {
                    $total_rows = $out_results['TOTAL_ROWS'];
                }
            }
            $qpart++;
        }
        if (C\QUERY_STATISTICS) {
            $format_time = microtime(true);
        }
        if (isset($out_results['SAVE_POINT'])) {
            /*
              out_result of last used to back-fill earlier ones that are done
              so on crawl mix archive crawls only look at last
             */
            $results['SAVE_POINT'] = $out_results['SAVE_POINT'];
        }
        if (isset($results['PAGES'])) {
            ksort($results['PAGES']);
            $results["PAGES"] = array_values($results["PAGES"]);
        }
        if (isset($out_results['HARD_QUERY'])) {
            $results['HARD_QUERY'] = $out_results['HARD_QUERY'];
        }
        if (!is_array($results) || count($results) == 0) {
            $results = null;
        }
        if ($results == null) {
            $total_rows = 0;
            $results['TOTAL_ROWS'] = 0;
        }
        if (isset($total_rows)) {
            $results['TOTAL_ROWS'] = $total_rows;
        } elseif (isset($results['PAGES'])) {
            $results['TOTAL_ROWS'] = count($results['PAGES']);
        }
        if ($format) {
            if (count($format_words) == 0) {
                $format_words = null;
            }
        } else {
            $format_words = null;
        }
        $description_length = self::DEFAULT_DESCRIPTION_LENGTH;
        if (isset($this->additional_meta_words) &&
            is_array($this->additional_meta_words)) {
            foreach ($this->additional_meta_words as $meta_word => $length) {
                $pattern = "/$meta_word/";
                if (preg_match($pattern, $input_phrase)) {
                    $description_length = $length;
                    break; // only match the first found
                }
            }
        }
        if ($raw == 0 && isset($results['TOTAL_ROWS']) &&
            $results['TOTAL_ROWS'] > 0) {
            $output = $this->formatPageResults($results, $format_words,
                $description_length);
            if (!empty($answer_score_map)) {
                arsort($answer_score_map);
                reset($answer_score_map);
                $output['BEST_ANSWER'] = key($answer_score_map);
            }
        } else {
            $output = $results;
        }
        if (C\QUERY_STATISTICS) {
            $this->query_info['QUERY'] .= "<b>Format Time</b>: ".
                L\changeInMicrotime($format_time)."<br />";
            $this->query_info['ELAPSED_TIME'] =
                L\changeInMicrotime($start_time);
            $this->db->total_time += $this->query_info['ELAPSED_TIME'];
            $this->db->query_log[] = $this->query_info;
        }
        return $output;
    }
    /**
     * Parses from a string phrase representing a conjunctive query, a struct
     * consisting of the words keys searched for, the allowed and disallowed
     * phrases, the weight that should be put on these query results, and
     * which archive to use.
     *
     * @param string& $phrase string to extract struct from, if the phrase
     *  semantics is guessed or an if condition is processed the value of
     *  phrase will be altered. (Helps for feeding to network queries)
     * @return array struct representing the conjunctive query
     */
    public function parseWordStructConjunctiveQuery(&$phrase)
    {
        $query = $phrase;
        $indent= "&nbsp;&nbsp;";
        $in2 = $indent . $indent;
        $in3 = $in2 . $indent;
        $in4 = $in2. $in2;
        $phrase = " " . $phrase;
        $phrase = $this->parseIfConditions($phrase);
        $phrase_string = $phrase;
        list($found_metas, $disallow_phrases,
            $phrase_string, $query_string, $index_name, $weight) =
            $this->extractMetaWordInfo($phrase);
        /*
            we search using the stemmed/char-grammed words, but we format
            snippets in the results by bolding either
         */
        $query_words = explode(" ", $query_string); //not stemmed
        if ($this->program_indicator) {
            $query_string = $query;
            $this->program_indicator = false;
        }
        $locale_tag = L\guessLocaleFromString($query_string);
        $quote_state = false;
        $phrase_parts = explode('"', $phrase_string);
        $base_words = [];
        $num_words = 0;
        $quote_positions = [];
        foreach ($phrase_parts as $phrase_part) {
            if (trim($phrase_part) == "") {
                $quote_state = ($quote_state) ? false : true;
                continue;
            }
            /*still use original phrase string here to handle
               acronyms abbreviations and the like that use periods */
            if ($quote_state) {
                $sub_parts = explode('*', $phrase_part);
                $first_part = true;
                $quote_position = [];
                foreach ($sub_parts as $sub_part) {
                    if (!$first_part) {
                        $quote_position["*$num_words"] = "*";
                    }
                    $new_words = PhraseParser::extractPhrases(
                        $sub_part, $locale_tag, $index_name, true);
                    $base_words = array_merge($base_words, $new_words);
                    foreach ($new_words as $new_word) {
                        $len = substr_count($new_word, " ") + 1;
                        $quote_position[$num_words] = $len;
                        $num_words++;
                    }
                    $first_part = false;
                }
                $quote_positions[] = $quote_position;
            } else {
                $new_words =
                    PhraseParser::extractPhrases($phrase_part, $locale_tag,
                    $index_name);
                $base_words = array_merge($base_words, $new_words);
            }
            $num_words = count($base_words);
            $quote_state = ($quote_state) ? false : true;
        }
        //stemmed, if have stemmer
        $index_version = IndexManager::getVersion($index_name);
        $add_metas = $found_metas;
        $words = array_merge($base_words, $add_metas);
        if (count($words) == 0 && count($disallow_phrases) > 0) {
            $words[] = "site:any";
        }
        if (C\QUERY_STATISTICS) {
            if (!isset($this->query_info['QUERY'])) {
                $this->query_info['QUERY'] = "";
            }
            $this->query_info['QUERY'] .= "$in3<i>Index</i>: ".
                $index_name."<br />";
            $this->query_info['QUERY'] .= "$in3<i>LocaleTag</i>: ".
                $locale_tag."<br />";
            $this->query_info['QUERY'] .=
                "$in3<i>Stemmed/Char-grammed Words</i>:<br />";
            foreach ($base_words as $word) {
                $this->query_info['QUERY'] .= "$in4$word<br />";
            }
            $this->query_info['QUERY'] .= "$in3<i>Meta Words</i>:<br />";
            foreach ($found_metas as $word) {
                $this->query_info['QUERY'] .= "$in4$word<br />";
            }
            $this->query_info['QUERY'] .= "$in3<i>Quoted Word Locs</i>:<br />";
            foreach ($quote_positions as $quote_position) {
                $this->query_info['QUERY'] .= "$in4(";
                $comma = "";
                foreach ($quote_position as $pos => $len) {
                    $this->query_info['QUERY'] .= "$comma $pos => $len";
                    $comma = ",";
                }
                $this->query_info['QUERY'] .= ")<br />";
            }
        }
        if (isset($words) && count($words) == 1 &&
            count($disallow_phrases) < 1 && !strpos($words[0], " ")) {
            $phrase_string = $words[0];
            $phrase_hash = L\allCrawlHashPaths($phrase_string);
            $word_struct = ["KEYS" => [$phrase_hash],
                "QUOTE_POSITIONS" => null, "DISALLOW_KEYS" => [],
                "WEIGHT" => $weight, "INDEX_NAME" => $index_name,
            ];
        } else {
            //get a raw list of words and their hashes
            $hashes = [];
            $word_keys = [];
            foreach ($words as $word) {
                $word_keys[] = L\allCrawlHashPaths($word);
            }
            if (count($word_keys) == 0) {
                $word_keys = null;
                $word_struct = null;
            }
            $disallow_keys = [];
            $num_disallow_keys = min(C\MAX_QUERY_TERMS,
                count($disallow_phrases));
            if ($num_disallow_keys > 0 && C\QUERY_STATISTICS) {
                $this->query_info['QUERY'] .= "$in3<i>Disallowed Words</i>:".
                    "<br />";
            }
            for ($i = 0; $i < $num_disallow_keys; $i++) {
                // check if disallowed is a meta word and stem or not stem
                if (mb_strstr($disallow_phrases[$i], ':') === false) {
                    $disallow_stem = PhraseParser::extractPhrases(
                        $disallow_phrases[$i], L\getLocaleTag());
                            //stemmed
                } else {
                    $disallow_stem[0] = $disallow_phrases[$i];
                }
                if (C\QUERY_STATISTICS) {
                    $this->query_info['QUERY'] .= "$in4{$disallow_stem[0]}".
                        "<br />";
                }
                $disallow_keys[] = L\crawlHashWord($disallow_stem[0]);
                if ($index_version == 0) {
                    $disallow_keys[] = L\crawlHash($word);
                }
            }
            if ($word_keys !== null) {
                $word_struct = ["KEYS" => $word_keys,
                    "QUOTE_POSITIONS" => $quote_positions,
                    "DISALLOW_KEYS" => $disallow_keys,
                    "WEIGHT" => $weight,
                    "INDEX_NAME" => $index_name
                ];
            }
        }
        $pre_format_words = [];
        foreach ($base_words as $base_word) {
            $pre_format_words = array_merge($pre_format_words,
                explode(" * ", $base_word));
        }
        $pre_format_words = array_values(array_unique(
            array_merge($query_words, $pre_format_words)));
        $format_words = [];
        $count = count($pre_format_words);
        for ($i = 0; $i < $count; $i++) {
            $flag = true;
            if ($pre_format_words[$i] == "") {
                continue;
            }
            for ($j = 0; $j < $count; $j++) {
                if ($j == $i) {
                    continue;
                }
                $hay = mb_strtolower($pre_format_words[$j]);
                $needle = mb_strtolower($pre_format_words[$i]);
                if ($hay == $needle && $j > $i) {
                    continue;
                }
                if (mb_strstr($hay, $needle)) {
                    $flag = false;
                    break;
                }
            }
            if ($flag) {
                $format_words[] = $pre_format_words[$i];
            }
        }
        return [$word_struct, $format_words];
    }
    /**
     * Given a query string, this method extracts meta words, which of these are
     * "materialized" (i.e., should be encoded as part of word ids),
     * disallowed phrases, the query string after meta words removed
     * and ampersand substitution applied, the query string with meta words
     * but apersand substitution applied, the index and the weights found
     * as part of the query string.
     *
     * @param string $phrase the query string
     * @return array containing items listed above in the description of this
     *     method
     */
    public function extractMetaWordInfo($phrase)
    {
        $index_name = $this->index_name;
        $weight = 1;
        $found_metas = [];
        $disallow_phrases = [];
        $phrase_string = $phrase;
        $phrase_string = str_replace("&", "&amp;", $phrase_string);
        $meta_words = PhraseParser::$meta_words_list;
        if (isset($this->additional_meta_words)) {
            $meta_words = array_merge($meta_words, array_keys(
                $this->additional_meta_words));
        }
        $materialized_match_conflict = false;
        foreach ($meta_words as $meta_word) {
            $pattern = "/(\s)($meta_word(\S)+)/";
            preg_match_all($pattern, $phrase, $matches);
            if (!in_array($meta_word,
                ['i:', 'index:', 'w:', 'weight:', '\-'])) {
                $matches = $matches[2];
                $found_metas = array_merge($found_metas, $matches);
            } elseif ($meta_word == '\-') {
                if (count($matches[0]) > 0) {
                    foreach ($matches[2] as $disallowed) {
                        $disallow_phrases[] = substr($disallowed, 1);
                    }
                }
            } elseif ($meta_word == 'i:' || $meta_word == 'index:') {
                if (isset($matches[2][0])) {
                    $index_name = substr($matches[2][0], strlen($meta_word));
                }
            } elseif ($meta_word == 'w:' || $meta_word == 'weight:') {
                if (isset($matches[2][0])) {
                    $weight = substr($matches[2][0], strlen($meta_word));
                }
            }
            $phrase_string = preg_replace($pattern, "", $phrase_string);
        }
        if ($materialized_match_conflict) {
            $found_metas = [];
            $disallow_phrases = [];
            $phrase_string = "";
        }
        $found_metas = array_unique($found_metas);
        $disallow_phrases = array_unique($disallow_phrases);
        $phrase_string = mb_ereg_replace("&amp;", "_and_", $phrase_string);
        $query_string = mb_ereg_replace(C\PUNCT, " ", $phrase_string);
        $query_string = preg_replace("/(\s)+/", " ", $query_string);
        $query_string = mb_ereg_replace('_and_', '&', $query_string);
        $phrase_string = mb_ereg_replace('_and_', '&', $phrase_string);
        return [$found_metas, $disallow_phrases, $phrase_string, $query_string,
            $index_name, $weight];
    }
    /**
     * Ideally, this function tries to guess from the query what the
     * user is looking for. For now, we are just doing simple things like
     * when a query term is a url and rewriting it to the appropriate meta
     * meta word.
     *
     * @param string $phrase input query to guess semantics of
     * @return string a phrase that more closely matches the intentions of the
     *     query.
     */
    public function guessSemantics($phrase)
    {
        $domain_suffixes = [".com", ".net", ".edu", ".org", ".gov",
            ".mil", ".ca", ".uk", ".fr", ".ly"];
        $len = mb_strlen(trim($phrase));
        if ($len > 4) {
            foreach ($domain_suffixes as $suffix) {
                $phrase = $this->endMatch($phrase, $suffix, "site:", "",
                    [":", "@"]);
            }
            $phrase = $this->beginMatch($phrase, "www.", "site:www.");
            $phrase = $this->beginMatch($phrase, "http:", "site:http:");
            // only rewrite info if longer than hash info record length
            if($len > self::INFO_HASH_LEN) {
                $phrase = $this->beginMatch($phrase, "info:", "info:http://",
                    "/", ["/"]);
                $phrase = $this->beginMatch($phrase, "info:", "info:http://",
                    "", ["http"]);
            }
        }
        $tag = L\guessLocaleFromString($phrase);
        if (isset($this->programming_language_map[$tag])) {
            $this->program_indicator = true;
        }
        $main_tag = substr($tag, 0, 2);
        if ($len == 1) {
            $letter = "";
            switch ($main_tag) {
                case 'ar':
                    $letter = "سالة";
                    break;
                case 'de':
                    $letter = "Buchstabe";
                    break;
                case 'en':
                    $letter = "letter";
                    break;
                case 'es':
                    $letter = "letra";
                    break;
                case 'fa':
                    $letter = "نامه";
                    break;
                case 'fr':
                    $letter = "lettre";
                    break;
                case 'it':
                    $letter = "lettera";
                    break;
                case 'po':
                    $letter = "literą";
                    break;
                case 'pt':
                    $letter = "letra";
                    break;
                case 'tr':
                    $letter = "harfi";
                    break;
                case 'ru':
                    $letter = "буква";
                    break;
                case 'vi':
                    $letter = "thư";
                    break;
            }
            $phrase = $letter." ".$phrase."|".$phrase;
        }
        $tag = str_replace("-", "_", $tag);
        $tokenizer_name = C\NS_LOCALE . "$tag\\resources\\Tokenizer";
        if (class_exists($tokenizer_name)) {
            $tokenizer = new $tokenizer_name();
            if (isset($tokenizer->semantic_rewrites)) {
                $rewrites = $tokenizer->semantic_rewrites;
                $tmp = trim($phrase);
                if (isset($rewrites[$tmp])) {
                    $phrase = $rewrites[$tmp];
                }
            }
        }
        if (!empty($tokenizer) && method_exists($tokenizer, "isQuestion") &&
            method_exists($tokenizer, "questionParser") &&
            $tokenizer->isQuestion($phrase)) {
            $generated_question = $tokenizer->questionParser(
                $phrase, $tag);
            if (!empty($generated_question['CONCISE'])) {
                $phrase = $generated_question['CONCISE'][0];
            } else if (!empty($generated_question['RAW'])) {
                $phrase = $generated_question['RAW'][0];
            }
        }
        return $phrase;
    }
    /**
     * Matches terms (non white-char strings) in the language $lang_tag in
     * $phrase that begin with  $start_with and don't contain  $not_contain,
     * replaces $start_with with $new_prefix and adds $suffix to the end
     *
     * @param string $phrase string to look for terms in
     * @param string $start_with what we're looking to see if term begins with
     * @param string $new_prefix what to change $start_with to
     * @param string $suffix what to tack on to the end of the term if there is
     *     a match
     * @param string $not_contains string match is not allowed to contain
     * @param string $lang_tag what language the phrase must be in for the rule
     *     to apply
     *
     * @return string $phrase after modifications have been made
     */
    public function beginMatch($phrase, $start_with, $new_prefix, $suffix = "",
        $not_contains = [], $lang_tag = "en-US")
    {
        $phrase .= " ";
        $quote_start_with = preg_quote($start_with, "/");
        $pattern = "/(\s)($quote_start_with(\S)+)/";
        $start_pos = strlen($start_with);
        preg_match_all($pattern, $phrase, $matches);
        $matches = $matches[2];
        $result_phrase = preg_replace($pattern, "", $phrase);
        foreach ($matches as $match) {
            $tag = L\guessLocaleFromString($match, $lang_tag, 10);
            $not_check = true;
            foreach ($not_contains as $not_contain) {
                if (strstr($match, $not_contain)) {
                    $not_check = false;
                    break;
                }
            }
            if ($tag == $lang_tag && $not_check) {
                $body = substr($match, $start_pos);
                $result_phrase .= " ".$new_prefix.$body.$suffix;
            } else {
                $result_phrase .= " ".$match;
            }
        }
        return $result_phrase;
    }
    /**
     * Matches terms (non white-char strings) in the language $lang_tag in
     * $phrase that end with $end_with and don't contain  $not_contain,
     * replaces $end_with with $new_suffix (if not empty) and adds $prefix to
     * the beginning
     *
     * @param string $phrase string to look for terms in
     * @param string $end_with what we're looking to see if term ends with
     * @param string $prefix what to tack on to the start if there is
     *     a match
     * @param string $new_suffix what to change $end_with to
     * @param string $not_contains string match is not allowed to contain
     * @param string $lang_tag what language the phrase must be in for the rule
     *     to apply
     *
     * @return string $phrase after modifications have been made
     */
    public function endMatch($phrase, $end_with, $prefix, $new_suffix = "",
        $not_contains = [], $lang_tag = "en-US")
    {
        $phrase .= " ";
        $quote_end_with = preg_quote($end_with, "/");
        $pattern = "/(\s)((\S)+$quote_end_with)(\s)/";
        $end_len = strlen($end_with);
        preg_match_all($pattern, $phrase, $matches);
        $matches = $matches[2];
        $result_phrase = preg_replace($pattern, " ", $phrase);
        foreach ($matches as $match) {
            $tag = L\guessLocaleFromString($match, $lang_tag, 10);
            $not_check = true;
            foreach ($not_contains as $not_contain) {
                if (strstr($match, $not_contain)) {
                    $not_check = false;
                    break;
                }
            }
            if ($tag == $lang_tag && $not_check) {
                if ($new_suffix == "") {
                    $body = $match;
                } else {
                    $body = substr($match, 0, -$end_len);
                }
                $result_phrase .= " $prefix".$body.$new_suffix;
            } else {
                $result_phrase .= " ".$match;
            }
        }
        return $result_phrase;
    }
    /**
     * Evaluates any if: conditional meta-words in the query string to
     * calculate a new query string.
     *
     * @param string $phrase original query string
     * @return string query string after if: meta words have been evaluated
     */
    public function parseIfConditions($phrase)
    {
        $cond_token = "if:";
        $pattern = "/(\s)($cond_token(\S)+)/";
        preg_match_all($pattern, $phrase, $matches);
        $matches = $matches[2];
        $result_phrase = preg_replace($pattern, "", $phrase);
        foreach ($matches as $match) {
            $match = substr($match, strlen($cond_token));
            $match_parts = explode("!", $match);
            if (count($match_parts) < 2) {
                continue;
            }
            if (stristr($result_phrase, $match_parts[0]) !== false) {
                $result_phrase .= " ".str_replace("+", " ", $match_parts[1]);
            } elseif (isset($match_parts[2])) {
                $result_phrase .= " ".str_replace("+", " ", $match_parts[2]);
            }
        }
        return $result_phrase;
    }
    /**
     * Gets doc summaries of documents containing given words and meeting the
     * additional provided criteria
     * @param array $word_structs an array of word_structs. Here a word_struct
     *     is an associative array with at least the following fields
     *     KEYS -- an array of word keys
     *     QUOTE_POSITIONS -- an array of positions of words that appeared in
     *         quotes (so need to be matched exactly)
     *     DISALLOW_PHRASES -- an array of words the document must not contain
     *     WEIGHT -- a weight to multiple scores returned from this iterator by
     *     INDEX_NAME -- an index timestamp to get results from
     * @param int $limit number of first document in order to return
     * @param int $num number of documents to return summaries of
     * @param array& $filter an array of hashes of domains to filter from
     *     results
     * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
     *     an attempt will be made to look up the results in either
     *     the file cache or memcache. Otherwise, items will be recomputed
     *     and then potentially restored in cache
     * @param int $raw ($raw == 0) normal grouping, ($raw > 0)
     *     no grouping done on data. if ($raw == 1) no lookups of summaries
     *     done
     * @param array $queue_servers a list of urls of yioop machines which might
     *     be used during lookup
     * @param string $original_query if set, the original query that corresponds
     *     to $word_structs
     * @param string $save_timestamp_name if this timestamp is not empty, then
     *     save iterate position, so can resume on future queries that make
     *     use of the timestamp. If used then $limit ignored and get next $num
     *     docs after $save_timestamp 's previous iterate position.
     * @param bool $limit_feeds if true the number of feed shard items to
     *     allow in search results is limited to WordIterator::LIMIT_FEEDS_COUNT
     *
     * @return array document summaries
     */
    public function getSummariesByHash($word_structs, $limit, $num, &$filter,
        $use_cache_if_allowed = true, $raw = 0, $queue_servers = [],
        $original_query = "", $save_timestamp_name = "",
        $limit_feeds = true)
    {
        $indent= "&nbsp;&nbsp;";
        $in2 = $indent . $indent;
        $in3 = $in2 . $indent;
        $in4 = $in2. $in2;
        if (C\QUERY_STATISTICS) {
            $lookup_time = microtime(true);
        }
        $use_proximity = false;
        $time = time();
        if (count($word_structs) > 1 || (isset($word_structs[0]["KEYS"])
            && count($word_structs[0]["KEYS"]) > 1) ||
            ($word_structs == [] &&
            substr_count($original_query, " ") > 1)) {
            $use_proximity = true;
        }
        if (!isset($filter['time'])) {
            $filter['time'] = 0;
        }
        $filter_time = $filter['time'];
        unset($filter['time']); //iterators don't expect time field
        $pages = [];
        $generation = 0;
        $to_retrieve = ceil(($limit+$num)/self::NUM_CACHE_PAGES) *
            self::NUM_CACHE_PAGES;
        $start_slice = floor(($limit)/self::NUM_CACHE_PAGES) *
            self::NUM_CACHE_PAGES;
        if ($save_timestamp_name != "") {
            $to_retrieve = $num;
            $limit = 0;
            $start_slice = 0;
        }
        if (!empty($_SERVER["USE_CACHE"]) &&
            $save_timestamp_name == "") {
            $mem_tmp = serialize($raw).serialize($word_structs).
                $original_query . $this->index_name;
            $summary_hash = L\crawlHash($mem_tmp . ":" . $limit . ":" . $num);
            if ($use_cache_if_allowed) {
                $cache_success = true;
                $results = self::$cache->get($summary_hash);
                if (!isset($results['TIME']) || $filter_time >
                    $results['TIME']) {
                    //if filter has changed since cached, then invalidate cache
                    $results = false;
                }
                if (isset($results['TIME'])) {
                    $cached_time = $time - $results['TIME'];
                } else {
                    $cached_time = $time;
                }
                if (C\MIN_QUERY_CACHE_TIME > 0 &&
                    $cached_time > C\MAX_QUERY_CACHE_TIME) {
                    $results = false;
                }
                if (isset($results['PAGES'])) {
                    $close_prefix = C\WORK_DIRECTORY . "/schedules/".
                        self::index_closed_name;
                    $has_changeable_results = false;
                    $seen_times = [];
                    foreach ($results['PAGES'] as $page) {
                        if (!isset($page[self::CRAWL_TIME]) ||
                            in_array($page[self::CRAWL_TIME], $seen_times)) {
                            continue;
                        }
                        $seen_times[] = $page[self::CRAWL_TIME];
                        $current_closed = $close_prefix .
                            $page[self::CRAWL_TIME] . ".txt";
                        if (!file_exists($current_closed)) {
                            //either feed result or from active crawl
                            $has_changeable_results = true;
                            break;
                        }
                    }
                    if ($has_changeable_results) {
                        if ($cached_time > C\MIN_QUERY_CACHE_TIME) {
                            $results = false;
                        }
                    }
                }
                if (C\QUERY_STATISTICS) {
                    $this->query_info['QUERY'] .=
                        "$in2<b>Cache Lookup Time</b>: ".
                        L\changeInMicrotime($lookup_time)."<br />";
                    if (!empty(self::$cache->cache_file) ) {
                        $this->query_info['QUERY'] .=
                            "$in2<b>Cache File Used:</b> " .
                            self::$cache->cache_file . "<br />";
                    }
                }
                if ($results !== false) {
                    return $results;
                }
            }
        }
        $old_to_retrieve = $to_retrieve;
        $get_query_time = microtime(true);
        $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw,
            $to_retrieve, $queue_servers, $original_query,
            $save_timestamp_name, $limit_feeds);
        $get_query_time = L\changeInMicrotime($get_query_time);
        $num_retrieved = 0;
        $pages = [];
        $retrieve_postings_time = microtime(true);
        if (is_object($query_iterator)) {
            while ($num_retrieved < $to_retrieve &&
                is_array($next_docs =
                    $query_iterator->nextDocsWithWord())) {
                $pages += $next_docs;
                $num_retrieved = count($pages);
            }
        }
        $retrieve_postings_time = L\changeInMicrotime($retrieve_postings_time);
        if ($save_timestamp_name != "" && ($queue_servers == [] ||
            $this->isSingleLocalhost($queue_servers))) {
            // used for archive crawls of crawl mixes
            $save_file = C\CRAWL_DIR.'/schedules/'.self::save_point.
                $save_timestamp_name.".txt";
            $iterators = $query_iterator->save_iterators;
            $cnt_iterators = count($iterators);
            $save_point = [];
            for ($i = 0; $i < $cnt_iterators; $i++) {
                $save_point[$i] =
                    $iterators[$i]->currentGenDocOffsetWithWord();
            }
            $results["SAVE_POINT"] = $save_point;
            $this->filePutContents($save_file, serialize($save_point));
            $this->db->setWorldPermissionsRecursive($save_file);
        }
        $pages = array_values($pages);
        $result_count = count($pages);
        $sort_time = 0;
        if ($raw == 0) {
            // initialize scores
            $sort_start = microtime(true);
            $max_user_ranks = 0;
            for ($i = 0; $i < $result_count; $i++) {
                $pages[$i]["OUT_SCORE"] = 0;
                if (isset($pages[$i][self::USER_RANKS])) {
                    $j = count($pages[$i][self::USER_RANKS]);
                    if ($max_user_ranks < $j) {
                        $max_user_ranks = $j;
                    }
                }
            }
            if ($max_user_ranks > 0) {
                for ($i = 0; $i < $result_count; $i++) {
                    for ($j = 0; $j < $max_user_ranks; $j++) {
                        if (isset($pages[$i][self::USER_RANKS][$j])) {
                            $pages[$i]["USCORE$j"] =
                                $pages[$i][self::USER_RANKS][$j];
                        } else {
                            $pages[$i]["USCORE$j"] = 0;
                        }
                    }
                }
            }
            $subscore_fields = [self::DOC_RANK, self::RELEVANCE];
            if ($use_proximity) {
                $subscore_fields[] = self::PROXIMITY;
            }
            if ($max_user_ranks > 0) {
                for ($j = 0; $j < $max_user_ranks; $j++) {
                    $subscore_fields[] = "USCORE$j";
                }
            }
            $num_fields = count($subscore_fields);
            // Compute Reciprocal Rank Fusion Score
            $alpha = 600/$num_fields;
            if (isset($pages[0])) {
                foreach ($subscore_fields as $field) {
                    L\orderCallback($pages[0], $pages[0], $field);
                    usort($pages, C\NS_LIB . "orderCallback");
                    $score = 0;
                    for ($i = 0; $i < $result_count; $i++) {
                        if ($i > 0) {
                            if ($pages[$i - 1][$field] != $pages[$i][$field]) {
                                $score++;
                            }
                        }
                        $pages[$i]["OUT_SCORE"] += $alpha/(59 + $score);
                    }
                }
                L\orderCallback($pages[0], $pages[0], "OUT_SCORE");
            }
            usort($pages, C\NS_LIB ."orderCallback");
            if ($use_proximity) {
                for ($i = 0; $i < $result_count; $i++) {
                    $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
                }
            } else {
                for ($i = 0; $i < $result_count; $i++) {
                    $pages[$i][self::PROXIMITY] = 1;
                    $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
                }
            }
            $sort_time = L\changeInMicrotime($sort_start);
        }
        if ($num_retrieved < $to_retrieve) {
            $results['TOTAL_ROWS'] = $num_retrieved;
        } else {
            $results['TOTAL_ROWS'] =  $query_iterator->num_docs;
            //this is only an approximation
        }
        if ($raw == 1 && $save_timestamp_name == "") {
            $pages = array_slice($pages, $start_slice);
            $pages = array_slice($pages, $limit - $start_slice, $num);
            $results['PAGES'] = & $pages;
            if ($old_to_retrieve != $to_retrieve) {
                $results['HARD_QUERY'] = $old_to_retrieve;
            }
            return $results;
        }
        if (C\QUERY_STATISTICS) {
            $this->query_info['QUERY'] .= "$in2<b>Lookup Offsets Time</b>: ".
                L\changeInMicrotime($lookup_time)."<br />";
            $machine_times = AnalyticsManager::get("MACHINE_TIMES");
            if ($machine_times) {
                $this->query_info['QUERY'] .=
                "$in3<i>Machine Sub-Times</i> (query/total):<br />".
                    $machine_times."<br />";
            }
            $net_times = AnalyticsManager::get("NET_TIMES");
            $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
            if ($net_times && $max_machine_times) {
                $this->query_info['QUERY'] .=
                "$in3<i>Network Overhead Sub-Time</i>: ".
                    ($net_times - $max_machine_times)."<br />";
            }
            if ($sort_time) {
                $this->query_info['QUERY'] .=
                    "$in3<i>Get Iterator Sub-Time</i>: " . $get_query_time .
                    "<br />$in3<i>Retrieve Postings Sub-Time</i>: " .
                    $retrieve_postings_time . "<br />" .
                    "$in3<i>Merge-Rank Sub-Time</i>: " .
                    $sort_time."<br />";
            }
            if ($query_iterator) {
                $this->query_info['PLAN'] = $query_iterator->plan();
            }
            $summaries_time = microtime(true);
        }
        $get_pages = array_slice($pages, $limit, $num);
        $to_get_count = count($get_pages);
        $groups_with_docs = false;
        if (preg_match("/\bsite:doc\b/", $original_query)) {
            $groups_with_docs = true;
        }
        $out_pages = [];
        $cur_limit = $limit;
        while (count($out_pages) < $to_get_count && $get_pages) {
            $out_pages = array_merge($out_pages,
                $this->getSummariesFromOffsets($get_pages, $queue_servers,
                $raw, $groups_with_docs));
            if ($save_timestamp_name != "") {
                break;
            }
            $cur_limit += $num;
            $get_pages = array_slice($pages, $cur_limit, $num);
        }
        $out_pages = array_slice($out_pages, 0, $num);
        if (C\QUERY_STATISTICS) {
            $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
            if ($summary_times_string) {
                $round_summary_times = unserialize($summary_times_string);
                $summary_delta_time = L\changeInMicrotime($summaries_time);
                $summary_time_info = "$summary_delta_time<br /> $in4";
                $sum_max_time = 0;
                foreach ($round_summary_times as $summary_times) {
                    $i = 0;
                    $max_time = 0;
                    foreach ($summary_times as $summary_time) {
                        $summary_time_info .= "ID_$i: ".
                            number_format($summary_time, 6)."$indent";
                        $max_time = ($summary_time > $max_time) ?
                            $summary_time : $max_time;
                        $i++;
                    }
                    $summary_time_info .= "<br />\n$in4";
                    $sum_max_time += $max_time;
                }
                $net_overhead =  $summary_delta_time - $sum_max_time;
                $summary_time_info .=
                    "<br />$in3<i>Network Overhead Sub-Time</i>: ".
                    $net_overhead;
            } else {
                $summary_time_info = L\changeInMicrotime($summaries_time);
            }
            $this->query_info['QUERY'] .= "$in2<b>Get Summaries Time</b>: ".
                $summary_time_info."<br />";
        }
        $results['PAGES'] = $out_pages;
        $results['TIME'] = time();
        if (!empty($_SERVER["USE_CACHE"]) &&
            $save_timestamp_name == "") {
            self::$cache->set($summary_hash, $results);
        }
        return $results;
    }
    /**
     * Used to lookup summary info for the pages provided (using their)
     * self::SUMMARY_OFFSET field. If any of the lookup-ed summaries
     * are location's then looks these up in turn. This method handles robot
     * meta tags which might forbid indexing.
     *
     * @param array& $pages of page data without text summaries
     * @param array& $queue_servers array of queue server to find data on
     * @param int $raw only lookup locations if 0
     * @param bool $groups_with_docs whether to return only groups that
     *     contain at least one doc as opposed to a groups with only links
     * @return array pages with summaries added
     */
    public function getSummariesFromOffsets(&$pages, &$queue_servers, $raw,
        $groups_with_docs)
    {
        $lookups = [];
        $page_indexes = [];
        $index = 0;
        foreach ($pages as $page) {
            $key = $page[self::KEY];
            if (isset($page[self::SUMMARY_OFFSET])) {
                if (is_array($page[self::SUMMARY_OFFSET])) {
                    $lookups[$key] = $page[self::SUMMARY_OFFSET];
                } else {
                    $machine_id = (isset($page[self::MACHINE_ID])) ?
                        $page[self::MACHINE_ID] :$this->current_machine;
                    $lookups[$key][] =
                            [$machine_id, $key,
                                $page[self::CRAWL_TIME],
                                $page[self::GENERATION],
                                $page[self::SUMMARY_OFFSET]];
                }
                $page_indexes[$key] = $index;
            }
            $index++;
        }
        $lookup_queue_servers = $queue_servers;
        if ($queue_servers && !in_array(C\NAME_SERVER, $queue_servers)) {
            $lookup_queue_servers[] = C\NAME_SERVER;
                //name server might still have news
        }
        $summaries = $this->getCrawlItems($lookups, $lookup_queue_servers);
        $lookups = [];
        foreach ($summaries as $hash_url => $summary) {
            $lookup_url = false;
            if (isset($summaries[$hash_url][self::LOCATION]) &&
                $summaries[$hash_url][self::LOCATION] != []) {
                $tmp_url = explode(" => ",
                    $summaries[$hash_url][self::DESCRIPTION]);
                if (isset($tmp_url[1])) {
                    $lookup_url = trim($tmp_url[1]);
                }
            } elseif (isset($summaries[$hash_url][self::HASH])) {
                $hash_parts = explode('|', $summaries[$hash_url][self::HASH]);
                if (isset($hash_parts[3])) {
                    $lookup_url = $hash_parts[1];
                }
            }
            if ($lookup_url) {
                $crawl_time = $pages[$page_indexes[$hash_url]][
                    self::CRAWL_TIME];
                $lookups[$hash_url] = [$lookup_url, $crawl_time];
                unset($summaries[$hash_url]);
            }
        }
        $loc_summaries = $this->getCrawlItems($lookups, $lookup_queue_servers);
        if (is_array($loc_summaries)) {
            $summaries = array_merge($summaries, $loc_summaries);
        }
        $out_pages = [];
        $seen_hashes = [];
        foreach ($pages as $page) {
            $key = $page[self::KEY];
            if (isset($summaries[$key]) &&
                (!isset($summaries[$key][self::HASH]) ||
                !in_array($summaries[$key][self::HASH], $seen_hashes))) {
                $summary = & $summaries[$key];
                if (isset($summaries[$key][self::HASH])) {
                    $seen_hashes[] = $summaries[$key][self::HASH];
                }
                $pre_page = array_merge($page, $summary);
                if (isset($pre_page[self::ROBOT_METAS])) {
                    if (!in_array("NOINDEX", $pre_page[self::ROBOT_METAS])
                         &&
                        !in_array("NONE", $pre_page[self::ROBOT_METAS])) {
                        $out_pages[] = $pre_page;
                    }
                } else {
                    $out_pages[] = $pre_page;
                }
            }
        }
        $cnt = count($out_pages);
        if ($groups_with_docs) {
            for ($i = 0; $i < $cnt; $i++) {
                if (empty($out_pages[$i][self::IS_DOC]) ||
                    !empty($out_pages[$i][self::LOCATION])) {
                    unset($out_pages[$i]);
                }
            }
            $out_pages = array_values($out_pages);
        }
        return $out_pages;
    }
    /**
     * Using the supplied $word_structs, contructs an iterator for getting
     * results to a query
     *
     * @param array $word_structs an array of word_structs. Here a word_struct
     *     is an associative array with at least the following fields
     *     KEYS -- an array of word keys
     *     QUOTE_POSITIONS -- an array of positions of words that appreared in
     *         quotes (so need to be matched exactly)
     *     DISALLOW_PHRASES -- an array of words the document must not contain
     *     WEIGHT -- a weight to multiple scores returned from this iterator by
     *     INDEX_NAME -- an index timestamp to get results from
     * @param array& $filter an array of hashes of domains to filter from
     *     results
     *     and then potentially restored in cache
     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
     *     no grouping done on data also no summaries returned (only lookup
     *     info), $raw > 1 return summaries but no grouping
     * @param int $to_retrieve number of items to retrieve from location in
     *     in interator
     * @param array $queue_servers a list of urls of yioop machines which might
     *     be used during lookup
     * @param string $original_query if set, the orginal query that corresponds
     *     to $word_structs
     * @param string $save_timestamp_name if this timestamp is non empty, then
     *     when making iterator get sub-iterators to advance to gen doc_offset
     *     stored with respect to save_timestamp if exists.
     * @param bool $limit_feeds if true the number of feed shard items to
     *     allow in search results is limited to WordIterator::LIMIT_FEEDS_COUNT
     *
     * @return &object an iterator for iterating through results to the
     * query
     */
    public function getQueryIterator($word_structs, &$filter, $raw,
        &$to_retrieve, $queue_servers = [], $original_query = "",
        $save_timestamp_name = "", $limit_feeds = true)
    {
        $iterators = [];
        $total_iterators = 0;
        $network_flag = false;
        $min_group_flag = false;
        $min_group_override = false;
        if ($queue_servers != [] &&
            !$this->isSingleLocalhost($queue_servers)) {
            $network_flag = true;
            $total_iterators = 1;
            if (!in_array(C\NAME_SERVER, $queue_servers)) {
                $queue_servers[] = C\NAME_SERVER;
                    //name server might still have news
            }
            $num_servers = count($queue_servers);
            if ((!isset($this->index_name) || !$this->index_name) &&
                isset($word_structs[0]["INDEX_NAME"])) {
                $index_name = $word_structs[0]["INDEX_NAME"];
            } else {
                $index_name = $this->index_name;
            }
            $iterators[0] = new I\NetworkIterator($original_query,
                $queue_servers, $index_name, $filter, $save_timestamp_name);
        }
        if (!$network_flag) {
            $doc_iterate_hashes = [substr(L\crawlHashWord("site:any"), 0, 9),
                substr(L\crawlHash("site:any"), 0, 9),
                substr(L\crawlHashWord("site:doc"), 0, 9),
                substr(L\crawlHash("site:doc"), 0, 9)];
            if ($save_timestamp_name != "") {
                // used for archive crawls of crawl mixes
                $save_file = C\CRAWL_DIR.'/schedules/' . self::save_point .
                    $save_timestamp_name.".txt";
                if (file_exists($save_file)) {
                    $save_point =
                        unserialize(file_get_contents($save_file));
                }
                $save_count = 0;
            }
            foreach ($word_structs as $word_struct) {
                if (!is_array($word_struct)) {
                    continue;
                }
                $word_keys = $word_struct["KEYS"];
                $distinct_word_keys = [];
                $seen_keys = [];
                foreach ($word_keys as $wkey) {
                    if (is_string($wkey) || is_string($wkey[0])) {
                        $tmp_key = is_string($wkey) ? $wkey : $wkey[0];
                        if (!isset($seen_keys[$tmp_key])) {
                            $seen_keys[$tmp_key] = true;
                            $distinct_word_keys[] = $wkey;
                        }
                    } else {
                        $distinct_word_keys[] = $wkey;
                    }
                }
                $quote_positions = $word_struct["QUOTE_POSITIONS"];
                $disallow_keys = $word_struct["DISALLOW_KEYS"];
                $index_name = $word_struct["INDEX_NAME"];
                $weight = $word_struct["WEIGHT"];
                $num_word_keys = count($word_keys);
                $total_iterators = count($distinct_word_keys);
                $word_iterators = [];
                $word_iterator_map = [];
                if ($num_word_keys < 1) {
                    continue;
                }
                $sum = 0;
                $lookup_cutoff = max(C\MIN_RESULTS_TO_GROUP, $to_retrieve);
                for ($i = 0; $i < $total_iterators; $i++) {
                    $current_key = (is_string($distinct_word_keys[$i])) ?
                        $distinct_word_keys[$i] : (is_string(
                        $distinct_word_keys[$i][0]) ? $distinct_word_keys[$i][0]
                        : $distinct_word_keys[$i][0][0]);
                    if (!is_string($current_key)) {
                        $current_key = $current_key[0];
                    }
                    if (in_array(substr($current_key, 0, 9),
                        $doc_iterate_hashes)) {
                        $word_iterators[$i] = new I\DocIterator(
                            $index_name, $filter, $to_retrieve);
                        $min_group_override = true;
                    } else {
                        //can happen if exact phrase search suffix approach used
                        if (isset($distinct_word_keys[$i][0]) &&
                            is_array($distinct_word_keys[$i][0])) {
                            $distinct_keys = $distinct_word_keys[$i];
                        } else {
                            $distinct_keys = [$distinct_word_keys[$i]];
                        }
                        $sum = 0;
                        $tmp_word_iterators =[];
                        $m = 0;
                        foreach ($distinct_keys as $distinct_key) {
                            $shift = (isset($distinct_key[1])) ?
                                $distinct_key[1] : 0;
                            $distinct_key_id = L\unbase64Hash(
                                $distinct_key[0]);
                            $tmp_word_iterators[$m] =
                                new I\WordIterator($distinct_key_id, $shift,
                                $index_name, true, $filter, $to_retrieve,
                                $limit_feeds);
                            $sum += $tmp_word_iterators[$m]->num_docs;
                            if ($tmp_word_iterators[$m]->dictionary_info !=
                                [] ||
                                $tmp_word_iterators[$m]->feed_count > 0) {
                                $min_group_override = true;
                                $m++;
                            } else {
                                unset($tmp_word_iterators[$m]);
                            }
                            if ($sum > $lookup_cutoff) {
                                break;
                            }
                        }
                        if ($m == 1) {
                            $word_iterators[$i] = $tmp_word_iterators[0];
                        } else {
                            $word_iterators[$i] = new I\DisjointIterator(
                                $tmp_word_iterators);
                        }
                    }
                    foreach ($word_keys as $index => $key) {
                        if (isset($distinct_word_keys[$i]) &&
                            $key == $distinct_word_keys[$i]) {
                            $word_iterator_map[$index] = $i;
                        }
                    }
                }
                $num_disallow_keys = count($disallow_keys);
                if ($num_disallow_keys > 0) {
                    for ($i = 0; $i < $num_disallow_keys; $i++) {
                        /* notice for now shift always 0 - you can't disallow
                           phrases */
                        $disallow_iterator =
                            new I\WordIterator($disallow_keys[$i], 0,
                                $index_name, false, $filter);
                        $word_iterators[$num_word_keys + $i] =
                            new I\NegationIterator($disallow_iterator);
                    }
                }
                $num_word_keys += $num_disallow_keys;
                if ($num_word_keys == 1 && $weight == 1) {
                    $base_iterator = $word_iterators[0];
                } else {
                    $base_iterator = new I\IntersectIterator(
                        $word_iterators, $word_iterator_map, $quote_positions,
                        $weight);
                    $min_group_flag = true;
                    if ($save_timestamp_name == "") {
                        $base_iterator->sync_timer_on = true;
                    } else {
                        $base_iterator->sync_timer_on = false;
                    }
                }
                if ($save_timestamp_name != "") {
                    if (isset($save_point[$save_count]) &&
                        $save_point[$save_count] != -1) {
                        $base_iterator->advance($save_point[$save_count]);
                    }
                    $save_count++;
                }
                $iterators[] = $base_iterator;
            }
        }
        $num_iterators = count($iterators); //if network_flag should be 1
        if ($num_iterators < 1) {
            return null;
        } elseif ($num_iterators == 1) {
            $union_iterator = $iterators[0];
        } else {
            $union_iterator = new I\UnionIterator($iterators);
        }
        $raw = intval($raw);
        if ($raw > 0) {
            $group_iterator = $union_iterator;
        } else {
            $group_iterator =
                new I\GroupIterator($union_iterator, $total_iterators,
                    $this->current_machine, $network_flag);
        }
        if ($network_flag) {
            $union_iterator->results_per_block =
                ceil(C\SERVER_ALPHA *
                    $group_iterator->results_per_block/$num_servers);
        } elseif ($save_timestamp_name != "") {
            $group_iterator->save_iterators = $iterators;
        } elseif ($min_group_flag && !$min_group_override) {
            $group_iterator->results_per_block = max(C\MIN_RESULTS_TO_GROUP/20,
                1);
            $to_retrieve = -1;
        }
        return $group_iterator;
    }
}

ViewGit