<?php /** * SeekQuarry/Yioop -- * Open Source Pure PHP Search Engine, Crawler, and Indexer * * Copyright (C) 2009 - 2012 Chris Pollett chris@pollett.org * * LICENSE: * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * * END LICENSE * * @author Chris Pollett chris@pollett.org * @package seek_quarry * @subpackage model * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ * @copyright 2009 - 2012 * @filesource */ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** * logging is done during crawl not through web, * so it will not be used in the phrase model */ if(!defined("POST_PROCESSING")) { define("LOG_TO_FILES", false); } /** For crawlHash function */ require_once BASE_DIR."/lib/utility.php"; /** For extractPhrasesAndCount function */ require_once BASE_DIR."/lib/phrase_parser.php"; /** * Used to look up words and phrases in the inverted index * associated with a given crawl */ require_once BASE_DIR."/lib/index_archive_bundle.php"; /** * Load FileCache class in case used */ require_once(BASE_DIR."/lib/file_cache.php"); /** * Load iterators to get docs out of index archive */ foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php") as $filename) { require_once $filename; } /** * * This is class is used to handle * results for a given phrase search * * @author Chris Pollett * @package seek_quarry * @subpackage model */ class PhraseModel extends Model { /** used to hold the name of index archive to look summaries up in * @var string */ var $index_name; /** an associative array of additional meta words and * the max description length of results if such a meta word is used * this array is typically set in index.php * * @var array */ var $additional_meta_words; /** * Used to hold query statistics about the current query * @var array */ var $query_info; /** * Number of pages to cache in one go in memcache or filecache * Size chosen based on 1MB max object size for memcache or filecache */ const NUM_CACHE_PAGES = 10; /** * {@inheritdoc} */ function __construct($db_name = DB_NAME) { parent::__construct($db_name); } /** * Returns whether there is a index with the provide timestamp * * @param int $index_timestamp timestamp of the index to check if in cache * @return bool whether it exists or not */ function indexExists($index_time_stamp) { return file_exists(CRAWL_DIR.'/cache/IndexData'.$index_time_stamp); } /** * Rewrites a mix query so that it maps directly to a query about crawls * * @param string $query the original before a rewrite * @param object $mix a mix object saying how the mix is built out of crawls * * @return string a rewritten query in terms of crawls */ function rewriteMixQuery($query, $mix) { $disjunct_phrases = explode("|", $query); $rewrite = ""; if(isset($mix['GROUPS'])) { foreach($mix['GROUPS'] as $group) { $pipe = ""; foreach($disjunct_phrases as $disjunct) { $rewrite .= $pipe; $pipe = ' | '; $disjunct_string = $disjunct; $base_weight = 1; $pattern = "/(\s)(index:(\S)+)/"; preg_match_all($pattern, $query, $matches); if(isset($matches[2][0])) { $rewrite .= $disjunct; continue; } $pattern = "/(\s)(i:(\S)+)/"; preg_match_all($pattern, $query, $matches); if(isset($matches[2][0])) { $rewrite .= $disjunct; continue; } $pattern = "/(\s)(weight:(\S)+)/"; preg_match_all($pattern, $query, $matches); if(isset($matches[2][0])) { $base_weight = substr($matches[2][0],strlen("weight:")); $disjunct_string = preg_replace($pattern,"", $disjunct_string); } $pattern = "/(\s)(w:(\S)+)/"; preg_match_all($pattern, $query, $matches); if(isset($matches[2][0])) { $base_weight = substr($matches[2][0],strlen("w:")); $disjunct_string = preg_replace($pattern,"", $disjunct_string); } $pipe2 = ""; if(isset($group['COMPONENTS'])) { $start_disjunct_string = $disjunct_string; foreach($group['COMPONENTS'] as $component) { $disjunct_string = $start_disjunct_string; if(isset($component['KEYWORDS'])) { $disjunct_string .= " ".$component['KEYWORDS']; } $rewrite .= $pipe2.$disjunct_string." w:". ($component['WEIGHT']*$base_weight)." i:". $component['CRAWL_TIMESTAMP']; $pipe2 = ' | '; } } } $num_results = (isset($group['RESULT_BOUND']) && $group['RESULT_BOUND'] > 1) ? $group['RESULT_BOUND'] : 1; $rewrite .= " #$num_results# "; } } return $rewrite; } /** * Given a query phrase, returns formatted document summaries of the * documents that match the phrase. * * @param string $phrase the phrase to try to match * @param int $low return results beginning with the $low document * @param int $results_per_page how many results to return * @param bool $format whether to highlight in the returned summaries the * matched text * @param array $filter an array of hashes of domains to filter from * results * @param bool $use_cache_if_allowed if true and USE_CACHE is true then * an attempt will be made to look up the results in either * the file cache or memcache. Otherwise, items will be recomputed * and then potentially restored in cache * @param int $raw ($raw == 0) normal grouping, ($raw == 1) * no grouping but page look-up for links, ($raw == 2) * no grouping done on data' * @param array $queue_servers a list of urls of yioop machines which might * be used during lookup * @return array an array of summary data */ function getPhrasePageResults( $input_phrase, $low = 0, $results_per_page = NUM_RESULTS_PER_PAGE, $format = true, $filter = NULL, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array()) { if(QUERY_STATISTICS) { $indent= " "; $in2 = $indent . $indent; $in3 = $in2 . $indent; $prs_cnt = 0; $dis_cnt = 0; $this->query_info = array(); $this->query_info['QUERY'] = "<b>PHRASE QUERY</b>: ".$input_phrase."<br />"; $start_time = microtime(); } $results = NULL; $word_structs = array(); /* this is a quick and dirty parsing and will usually work, exceptions would be # or | in quotes or if someone tried to escape |. First we split into presentation elements then we split by disjuncts */ $presentation_parts = preg_split('/#(\d)+#/', $input_phrase, -1, PREG_SPLIT_DELIM_CAPTURE); $count = 0; $presentation_parts = array_chunk($presentation_parts, 2); $num_parts = count($presentation_parts); $query_parts = array(); $last_part = NULL; for($i = 0; $i < $num_parts ; $i++) { if(isset($presentation_parts[$i][0]) && ($trimmed = trim($presentation_parts[$i][0])) != "" ) { $to_return = (isset($presentation_parts[$i][1])) ? $presentation_parts[$i][1]: 1; $query_parts[$trimmed][] = array($count, $to_return); $last_part = $trimmed; if(isset($presentation_parts[$i][1])) { $count += $presentation_parts[$i][1]; } else { $count ++; } } } $results_high = $low + $results_per_page; $num_last_parts = count($query_parts[$last_part]); if($query_parts[$last_part][$num_last_parts - 1][0] + $query_parts[$last_part][$num_last_parts - 1][1] < $low) { $query_parts[$last_part][$num_last_parts - 1][1] = $results_high; } $num_phrases = count($query_parts); foreach($query_parts as $phrase => $pre_result_bounds) { $phrase_high = $pre_result_bounds[0][1]; $result_bounds = array(); $start_flag = false; $num_bounds = 0; foreach($pre_result_bounds as $bound) { if($bound[0] > $results_high) break; //rest of presentation after what we'll return so break $phrase_high = $bound[0] + $bound[1]; if($phrase_high < $low) continue; // this part of presentation is before what we'll return so skip $result_bounds[] = $bound; $num_bounds++; } if($num_bounds == 0) continue; if($phrase == $last_part && $result_bounds[$num_bounds - 1][0] + $result_bounds[$num_bounds - 1][1] < $results_high) { $result_bounds[$num_bounds - 1][1] = $results_high - $result_bounds[$num_bounds - 1][0]; } $phrase_num = max(min($phrase_high, $results_high), $results_high) - $low; $disjunct_phrases = explode("|", $phrase); $word_structs = array(); if(QUERY_STATISTICS) { $this->query_info['QUERY'] .= $indent . "<b>Presentation $prs_cnt:</b><br />"; $this->query_info['QUERY'] .= "$in2<i>Low</i>:". $result_bounds[0][0]."<br />"; $this->query_info['QUERY'] .= $in2 . "<i>High</i>: ".$result_bounds[0][1]."<br />"; $prs_cnt++; } foreach($disjunct_phrases as $disjunct) { if(QUERY_STATISTICS) { $this->query_info['QUERY'] .= "$in2<b>Disjunct $dis_cnt:" . "</b><br />"; $dis_cnt++; } list($word_struct, $format_words) = $this->parseWordStructConjunctiveQuery($disjunct); if($word_struct != NULL) { $word_structs[] = $word_struct; } } if(QUERY_STATISTICS) { $this->query_info['QUERY'] .= "$in2<b>Presentation Parse time</b>: " . changeInMicrotime($start_time)."<br />"; $summaries_time = microtime(); } $out_results = $this->getSummariesByHash($word_structs, $low, $phrase_num, $filter, $use_cache_if_allowed, $raw, $queue_servers, $phrase); if(isset($out_results['PAGES']) && count($out_results['PAGES']) != 0) { $out_count = 0; foreach($result_bounds as $bound) { for($i = $bound[0]; $i < min($bound[0] + $bound[1], $results_high); $i++) { if(isset($out_results['PAGES'][$out_count])) { $results['PAGES'][$i] = $out_results['PAGES'][$out_count]; $out_count++; } } } if($phrase == $last_part && isset($out_results['TOTAL_ROWS'])){ $total_rows = $out_results['TOTAL_ROWS']; } } if(QUERY_STATISTICS) { $this->query_info['QUERY'] .= "$in2<b>Get Summaries time</b>: ". changeInMicrotime($summaries_time)."<br />"; $format_time = microtime(); } } if(isset($results['PAGES'])){ ksort($results['PAGES']); $results["PAGES"] = array_values($results["PAGES"]); } if(count($results) == 0) { $results = NULL; } if($results == NULL) { $total_rows = 0; $results['TOTAL_ROWS'] = 0; } if(isset($total_rows)) { $results['TOTAL_ROWS'] = $total_rows; } else { $results['TOTAL_ROWS'] = count($results['PAGES']); } if($format) { if(count($format_words) == 0 ){ $format_words = NULL; } } else { $format_words = NULL; } $description_length = self::DEFAULT_DESCRIPTION_LENGTH; if(isset($this->additional_meta_words) && is_array($this->additional_meta_words)) { foreach($this->additional_meta_words as $meta_word => $length){ $pattern = "/$meta_word/"; if(preg_match($pattern, $input_phrase)) { $description_length = $length; break; // only match the first found } } } $output = $this->formatPageResults($results, $format_words, $description_length); if(QUERY_STATISTICS) { $this->query_info['QUERY'] .= "<b>Format time</b>: ". changeInMicrotime($format_time)."<br />"; $this->query_info['ELAPSED_TIME'] = changeInMicrotime($start_time); $this->db->total_time += $this->query_info['ELAPSED_TIME']; $this->db->query_log[] = $this->query_info; } return $output; } /** * Determines the offset into the summaries WebArchiveBundle of the * provided url so that the info:url summary can be retrieved. * This assumes of course that the info:url meta word has been stored. * * @param string $url what to lookup * @return array (offset, generation) into the web archive bundle */ function lookupSummaryOffsetGeneration($url) { $index_archive_name = self::index_data_base_name . $this->index_name; $index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'.$index_archive_name); $num_retrieved = 0; $pages = array(); $summary_offset = NULL; $num_generations = $index_archive->generation_info['ACTIVE']; $word_iterator = new WordIterator(crawlHash("info:$url"), $index_archive); if(is_array($next_docs = $word_iterator->nextDocsWithWord())) { foreach($next_docs as $doc_key => $doc_info) { $summary_offset = $doc_info[CrawlConstants::SUMMARY_OFFSET]; $generation = $doc_info[CrawlConstants::GENERATION]; $cache_partition = $doc_info[CrawlConstants::SUMMARY][ CrawlConstants::CACHE_PAGE_PARTITION]; $num_retrieved++; if($num_retrieved >= 1) { break; } } if($num_retrieved == 0) { return false; } } else { return false; } return array($summary_offset, $generation, $cache_partition); } /** * Parses from a string phrase representing a conjunctive query, a struct * consisting of the words keys searched for, the allowed and disallowed * phrases, the weight that should be put on these query results, and * which archive to use. * * @param string $phrase string to extract struct from * @return array struct representing the conjunctive query */ function parseWordStructConjunctiveQuery($phrase) { $indent= " "; $in2 = $indent . $indent; $in3 = $in2 . $indent; $in4 = $in2. $in2; $phrase = " ".$phrase; $phrase = $this->guessSemantics($phrase); $phrase = $this->parseIfConditions($phrase); $phrase_string = $phrase; $phrase_string = str_replace("&", "&", $phrase_string); $meta_words = array('link:', 'site:', 'version:', 'modified:', 'filetype:', 'info:', '\-', 'os:', 'server:', 'date:', 'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:', 'lang:', 'media:', 'elink:', 'location:'); if(isset($this->additional_meta_words)) { $meta_words = array_merge($meta_words, array_keys( $this->additional_meta_words)); } $index_name = $this->index_name; $weight = 1; $found_metas = array(); $disallow_phrases = array(); foreach($meta_words as $meta_word) { $pattern = "/(\s)($meta_word(\S)+)/"; preg_match_all($pattern, $phrase, $matches); if(!in_array($meta_word, array('i:', 'index:', 'w:', 'weight:', '\-') )) { $matches = $matches[2]; $found_metas = array_merge($found_metas, $matches); } else if($meta_word == '\-') { if(count($matches[0]) > 0) { $disallow_phrases = array_merge($disallow_phrases, array(substr($matches[2][0],1))); } } else if ($meta_word == 'i:' || $meta_word == 'index:') { if(isset($matches[2][0])) { $index_name = substr($matches[2][0],strlen($meta_word)); } } else if ($meta_word == 'w:' || $meta_word == 'weight:') { if(isset($matches[2][0])) { $weight = substr($matches[2][0],strlen($meta_word)); } } $phrase_string = preg_replace($pattern, "", $phrase_string); } $index_archive_name = self::index_data_base_name . $index_name; $index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'.$index_archive_name); $phrase_string = mb_ereg_replace(PUNCT, " ", $phrase_string); $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string); /* we search using the stemmed/char-grammed words, but we format snippets in the results by bolding either */ $query_words = explode(" ", $phrase_string); //not stemmed $base_words = PhraseParser::extractPhrases($phrase_string, MAX_PHRASE_LEN, getLocaleTag()); //stemmed, if have stemmer $words = array_merge($base_words, $found_metas); if(QUERY_STATISTICS) { $this->query_info['QUERY'] .= "$in3<i>Index</i>: ". $index_archive_name."<br />"; $this->query_info['QUERY'] .= "$in3<i>LocaleTag</i>: ". getLocaleTag()."<br />"; $this->query_info['QUERY'] .= "$in3<i>Stemmed/Char-grammed Words</i>:<br />"; foreach($base_words as $word){ $this->query_info['QUERY'] .= "$in4$word<br />"; } $this->query_info['QUERY'] .= "$in3<i>Meta Words</i>:<br />"; foreach($found_metas as $word){ $this->query_info['QUERY'] .= "$in4$word<br />"; } } if(isset($words) && count($words) == 1 && count($disallow_phrases) < 1) { $phrase_string = $words[0]; $phrase_hash = crawlHash($phrase_string); $word_struct = array("KEYS" => array($phrase_hash), "RESTRICT_PHRASES" => NULL, "DISALLOW_KEYS" => array(), "WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive ); } else { /* handle strings in quotes (we want an exact match on such quoted strings) */ $quoteds =array(); $hash_quoteds = array(); $num_quotes = preg_match_all('/\"((?:[^\"\\\]|\\\\.)*)\"/', $phrase,$quoteds); if(isset($quoteds[1])) { $quoteds = $quoteds[1]; } //get a raw list of words and their hashes $hashes = array(); $i = 0; foreach($words as $word) { $hashes[] = crawlHash($word); } $restrict_phrases = $quoteds; if(count($hashes) > 0) { $word_keys = array_slice($hashes, 0, MAX_QUERY_TERMS); } else { $word_keys = NULL; $word_struct = NULL; } $restrict_phrases = array_unique($restrict_phrases); $restrict_phrases = array_filter($restrict_phrases); $index_archive->setCurrentShard(0, true); $disallow_keys = array(); $num_disallow_keys = min(MAX_QUERY_TERMS, count($disallow_phrases)); for($i = 0; $i < $num_disallow_keys; $i++) { $disallow_stem=array_keys(PhraseParser::extractPhrasesAndCount( $disallow_phrases[$i], 2, getLocaleTag())); //stemmed $disallow_keys[] = crawlHash($disallow_stem[0]); } if($word_keys !== NULL) { $word_struct = array("KEYS" => $word_keys, "RESTRICT_PHRASES" => $restrict_phrases, "DISALLOW_KEYS" => $disallow_keys, "WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive ); } } $format_words = array_merge($query_words, $base_words); return array($word_struct, $format_words); } /** * The plan is code to tru to guess from the query what the user is * looking for will be called from here. For now, we are just guessing * when a query term is a url and rewriting it to the appropriate meta * meta word. * * @param string $phrase input query to guess semantics of * @return string a phrase that more closely matches the intentions of the * query. */ function guessSemantics($phrase) { $phrase .= " "; $cond_token = "(\.com|\.edu|\.org|\.gov|\.mil|.ca|\.uk|\.fr)"; $pattern = "/(\s)((\S)+$cond_token)(\s)/"; preg_match_all($pattern, $phrase, $matches); $matches = $matches[2]; $result_phrase = preg_replace($pattern, "", $phrase); foreach($matches as $match) { if(!strstr($match, ":")) { $result_phrase .= " site:".$match; } else { $result_phrase .= " ".$match; } } $phrase = $result_phrase; $cond_token = "www\."; $pattern = "/(\s)($cond_token(\S)+)/"; preg_match_all($pattern, $phrase, $matches); $matches = $matches[2]; $result_phrase = preg_replace($pattern, "", $phrase); foreach($matches as $match) { $result_phrase .= " site:".$match; } $phrase = $result_phrase; $cond_token = "http:"; $pattern = "/(\s)($cond_token(\S)+)/"; preg_match_all($pattern, $phrase, $matches); $matches = $matches[2]; $result_phrase = preg_replace($pattern, "", $phrase); foreach($matches as $match) { $result_phrase .= " site:".$match; } return $result_phrase; } /** * Evaluates any if: conditional meta-words in the query string to * caluclate a new query string. * * @param string $phrase original query string * @return string query string after if: meta words have been evaluated */ function parseIfConditions($phrase) { $cond_token = "if:"; $pattern = "/(\s)($cond_token(\S)+)/"; preg_match_all($pattern, $phrase, $matches); $matches = $matches[2]; $result_phrase = preg_replace($pattern, "", $phrase); foreach($matches as $match) { $match = substr($match, strlen($cond_token)); $match_parts = explode("!", $match); if(count($match_parts) < 2) continue; if(stristr($result_phrase, $match_parts[0]) !== false) { $result_phrase .= " ".str_replace("+", " ", $match_parts[1]); } else if(isset($match_parts[2])) { $result_phrase .= " ".str_replace("+", " ", $match_parts[2]); } } return $result_phrase; } /** * Given a page summary extract the words from it and try to find documents * which match the most relevant words. The algorithm for "relevant" is * pretty weak. For now we pick the $num many words which appear in the * fewest documents. * * @param string $crawl_item a page summary * @param int $num number of key phrase to return * @return array an array of most selective key phrases */ function getTopPhrases($crawl_item, $num) { $index_archive_name = self::index_data_base_name . $this->index_name; $index_archive = new IndexArchiveBundle(CRAWL_DIR.'/cache/'.$index_archive_name); $phrase_string = PhraseParser::extractWordStringPageSummary($crawl_item); $words = array_keys(PhraseParser::extractPhrasesAndCount($phrase_string)); $hashes = array(); $lookup = array(); foreach($words as $word) { $tmp = crawlHash($word); $hashes[] = $tmp; $lookup[$tmp] = $word; } $words_array = $index_archive->getSelectiveWords($hashes, $num, "greaterThan"); $word_keys = array_keys($words_array); $phrases = array(); foreach($word_keys as $word_key) { $phrases[] = $lookup[$word_key]; } return $phrases; } /** * Gets doc summaries of documents containing given words and meeting the * additional provided criteria * @param array $word_structs an array of word_structs. Here a word_struct * is an associative array with at least the following fields * KEYS -- an array of word keys * RESTRICT_PHRASES -- an array of phrases the document must contain * DISALLOW_PHRASES -- an array of words the document must not contain * WEIGHT -- a weight to multiple scores returned from this iterator by * INDEX_ARCHIVE -- an index_archive object to get results from * @param int $limit number of first document in order to return * @param int $num number of documents to return summaries of * @param array &$filter an array of hashes of domains to filter from * results * @param bool $use_cache_if_allowed if true and USE_CACHE is true then * an attempt will be made to look up the results in either * the file cache or memcache. Otherwise, items will be recomputed * and then potentially restored in cache * @param int $raw ($raw == 0) normal grouping, ($raw == 1) * no grouping but page look-up for links, ($raw == 2) * no grouping done on data * @param array $queue_servers a list of urls of yioop machines which might * be used during lookup * @param string $original_query if set, the original query that corresponds * to $word_structs * * @return array document summaries */ function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), $original_query = "") { global $CACHE; $pages = array(); $generation = 0; $to_retrieve = ceil(($limit+$num)/self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; $start_slice = floor(($limit)/self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; if(USE_CACHE) { $mem_tmp = ""; foreach($word_structs as $word_struct) { $mem_tmp .= serialize($word_struct["KEYS"]). serialize($word_struct["RESTRICT_PHRASES"]) . serialize($word_struct["DISALLOW_KEYS"]) . $word_struct["WEIGHT"] . $word_struct["INDEX_ARCHIVE"]->dir_name; } if($use_cache_if_allowed) { $cache_success = true; $results = array(); $results['PAGES'] = array(); for($i=$start_slice; $i<$to_retrieve;$i+=self::NUM_CACHE_PAGES){ $summary_hash = crawlHash($mem_tmp.":".$i); $slice = $CACHE->get($summary_hash); if($slice === false) { $cache_success = false; break; } $results['PAGES'] = array_merge($results['PAGES'], $slice['PAGES']); $results['TOTAL_ROWS'] = $slice['TOTAL_ROWS']; } if($cache_success) { $results['PAGES'] = array_slice($results['PAGES'], $limit - $start_slice, $num); return $results; } } } $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $queue_servers, $original_query); $num_retrieved = 0; $pages = array(); $isLocal = ($queue_servers == array()) || $this->isSingleLocalhost($queue_servers); while($num_retrieved < $to_retrieve && is_object($query_iterator) && is_array($next_docs = $query_iterator->nextDocsWithWord()) ) { foreach($next_docs as $doc_key => $doc_info) { if($isLocal) { $summary = & $doc_info[CrawlConstants::SUMMARY]; $tmp = unserialize($query_iterator->getIndex( $doc_key)->description); $doc_info[self::CRAWL_TIME] = $tmp[self::CRAWL_TIME]; unset($doc_info[CrawlConstants::SUMMARY]); if(is_array($summary)) { $pages[] = array_merge($doc_info, $summary); $num_retrieved++; } } else { $pages[] = $doc_info; $num_retrieved++; } } } $result_count = count($pages); // initialize scores for($i = 0; $i < $result_count; $i++) { $pages[$i][CrawlConstants::SCORE] = 0; } $subscore_fields = array(self::DOC_RANK, self::RELEVANCE, self::PROXIMITY); $num_fields = count($subscore_fields); // Compute Reciprocal Rank Fusion Score $alpha = 600/$num_fields; if(isset($pages[0])) { foreach($subscore_fields as $field) { orderCallback($pages[0], $pages[0], $field); usort($pages, "orderCallback"); $score = 0; for($i = 0; $i < $result_count; $i++) { if($i > 0) { if($pages[$i - 1][$field] != $pages[$i][$field]) { $score++; } } $pages[$i][CrawlConstants::SCORE] += $alpha/(60 + $score); } } orderCallback($pages[0], $pages[0], CrawlConstants::SCORE); } usort($pages, "orderCallback"); if($num_retrieved < $to_retrieve) { $results['TOTAL_ROWS'] = $num_retrieved; } else { $results['TOTAL_ROWS'] = $query_iterator->num_docs; //this is only an approximation } if(USE_CACHE) { for($i = 0; $i < $result_count; $i++){ unset($pages[$i][self::LINKS]); } for($i = 0;$i < $to_retrieve;$i+=self::NUM_CACHE_PAGES){ $summary_hash = crawlHash($mem_tmp.":".$i); $slice['PAGES'] = array_slice($pages, $i, self::NUM_CACHE_PAGES); $slice['TOTAL_ROWS'] = $results['TOTAL_ROWS']; $CACHE->set($summary_hash, $slice); } } $results['PAGES'] = & $pages; $results['PAGES'] = array_slice($results['PAGES'], $start_slice); $results['PAGES'] = array_slice($results['PAGES'], $limit - $start_slice, $num); return $results; } /** * Using the supplied $word_structs, contructs an iterator for getting * results to a query * * @param array $word_structs an array of word_structs. Here a word_struct * is an associative array with at least the following fields * KEYS -- an array of word keys * RESTRICT_PHRASES -- an array of phrases the document must contain * DISALLOW_PHRASES -- an array of words the document must not contain * WEIGHT -- a weight to multiple scores returned from this iterator by * INDEX_ARCHIVE -- an index_archive object to get results from * @param array &$filter an array of hashes of domains to filter from * results * and then potentially restored in cache * @param int $raw ($raw == 0) normal grouping, ($raw == 1) * no grouping but page look-up for links, ($raw == 2) * no grouping done on data * @param array $queue_servers a list of urls of yioop machines which might * be used during lookup * @param string $original_query if set, the orginal query that corresponds * to $word_structs * * @return &object an iterator for iterating through results to the * query */ function getQueryIterator($word_structs, &$filter, $raw = 0, $queue_servers = array(), $original_query = "") { $iterators = array(); $total_iterators = 0; $network_flag = false; if($queue_servers != array()) { if(!$this->isSingleLocalhost($queue_servers)) { $network_flag = true; $total_iterators = 1; $num_servers = count($queue_servers); $iterators[0] = new NetworkIterator($original_query, $queue_servers, $this->index_name); } } if(!$network_flag) { foreach($word_structs as $word_struct) { if(!is_array($word_struct)) { continue;} $word_keys = $word_struct["KEYS"]; $distinct_word_keys = array_unique($word_keys); $restrict_phrases = $word_struct["RESTRICT_PHRASES"]; $disallow_keys = $word_struct["DISALLOW_KEYS"]; $index_archive = $word_struct["INDEX_ARCHIVE"]; $weight = $word_struct["WEIGHT"]; $num_word_keys = count($word_keys); $total_iterators = count($distinct_word_keys); $word_iterators = array(); $word_iterator_map = array(); if($num_word_keys < 1) {continue;} for($i = 0; $i < $total_iterators; $i++) { $word_iterators[$i] = new WordIterator($distinct_word_keys[$i], $index_archive, false, $filter); foreach ($word_keys as $index => $key) { if($key == $distinct_word_keys[$i]){ $word_iterator_map[$index] = $i; } } } $num_disallow_keys = count($disallow_keys); if($num_disallow_keys > 0) { for($i = 0; $i < $num_disallow_keys; $i++) { $disallow_iterator = new WordIterator($disallow_keys[$i], $index_archive, false, $filter); $word_iterators[$num_word_keys + $i] = new NegationIterator($disallow_iterator); } } $num_word_keys += $num_disallow_keys; if($num_word_keys == 1) { $base_iterator = $word_iterators[0]; } else { $base_iterator = new IntersectIterator( $word_iterators,$word_iterator_map); } if($restrict_phrases == NULL && $disallow_keys == array() && $weight == 1) { $iterators[] = $base_iterator; } else { $iterators[] = new PhraseFilterIterator($base_iterator, $restrict_phrases, $weight); } } } $num_iterators = count($iterators); if( $num_iterators < 1) { return NULL; } else if($num_iterators == 1) { $union_iterator = $iterators[0]; } else { $union_iterator = new UnionIterator($iterators); } $raw = intval($raw); if ($raw == 2) { $group_iterator = $union_iterator; } else if ($raw == 1) { $group_iterator = new GroupIterator($union_iterator, $total_iterators, true); } else { $group_iterator = new GroupIterator($union_iterator, $total_iterators); } if($network_flag) { $union_iterator->results_per_block = 1.1* $group_iterator->results_per_block/$num_servers; } return $group_iterator; } } ?>