<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009, 2010, 2011  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage controller
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009, 2010, 2011
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/** Load base controller class, if needed. */
require_once BASE_DIR."/controllers/controller.php";
/** To extract words from the query */
require_once BASE_DIR."/lib/phrase_parser.php";
/** Get the crawlHash function */
require_once BASE_DIR."/lib/utility.php";
/** Loads common constants for web crawling */
require_once BASE_DIR."/lib/crawl_constants.php";

/**
 * Controller used to handle search requests to SeekQuarry
 * search site. Used to both get and display
 * search results.
 *
 * @author Chris Pollett
 * @package seek_quarry
 * @subpackage controller
 */
class SearchController extends Controller implements CrawlConstants
{
    /**
     * Says which models to load for this controller.
     * PhraseModel is used to extract words from the query; CrawlModel
     * is used for cached web page requests; SearchfiltersModel supplies
     * the filter applied to ordinary query results.
     * @var array
     */
    var $models = array("phrase", "crawl", "searchfilters");

    /**
     * Says which views to load for this controller.
     * The SearchView is used for displaying general search results as well
     * as the initial search screen; NocacheView
     * is used on a cached web page request that fails; RssView is used
     * to present search results according to the opensearch.org rss results
     * format.
     * @var array
     */
    var $views = array("search", "nocache", "rss");

    /**
     * Says which activities (roughly methods invoked from the web) this
     * controller will respond to
     * @var array
     */
    var $activities = array("query", "cache", "related", "signout");

    /**
     * This is the main entry point for handling a search request.
     *
     * processRequest determines the type of search request (normal request,
     * cache request, or related request), or whether a user is returning
     * from the admin panel via signout. It then calls the appropriate method
     * to handle the given activity. Finally, it draws the search screen
     * (a cache request outputs the cached page and returns early instead).
     */
    function processRequest()
    {
        $data = array();
        $view = "search";
        $start_time = microtime();

        if(isset($_REQUEST['f']) && $_REQUEST['f'] == 'rss' && RSS_ACCESS) {
            $view = "rss";
        } else if (!WEB_ACCESS) {
            return;
        }

        // NOTE(review): $raw is a boolean here whereas processQuery()
        // documents the values 0, 1 and 2 -- confirm how the phrase model
        // treats true before changing this.
        if(isset($_REQUEST['raw']) && $_REQUEST['raw'] == true) {
            $raw = true;
        } else {
            $raw = false;
        }

        if(isset($_SESSION['MAX_PAGES_TO_SHOW'])) {
            $results_per_page = $_SESSION['MAX_PAGES_TO_SHOW'];
        } else {
            $results_per_page = NUM_RESULTS_PER_PAGE;
        }

        // a logged in user whose CSRF token does not check out is treated
        // as an anonymous user identified by IP address
        if(isset($_SESSION['USER_ID'])) {
            $user = $_SESSION['USER_ID'];
            $token_okay = $this->checkCSRFToken('YIOOP_TOKEN', $user);
            if($token_okay === false) {
                unset($_SESSION['USER_ID']);
                $user = $_SERVER['REMOTE_ADDR'];
            }
        } else {
            $user = $_SERVER['REMOTE_ADDR'];
        }

        if(isset($_REQUEST['a'])) {
            if(in_array($_REQUEST['a'], $this->activities)) {
                $activity = $_REQUEST['a'];
                if($activity == "signout") {
                    unset($_SESSION['USER_ID']);
                    $user = $_SERVER['REMOTE_ADDR'];
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >".
                        tl('search_controller_logout_successful')."</h1>')";
                }
                if(isset($_REQUEST['arg'])) {
                    $arg = $_REQUEST['arg'];
                } else {
                    // an activity without an argument degrades to a query
                    $activity = "query";
                }
            } else {
                $activity = "query";
            }
        } else {
            $activity = "query";
        }

        if(isset($_REQUEST['its']) || isset($_SESSION['its'])) {
            $its = (isset($_REQUEST['its'])) ?
                $_REQUEST['its'] : $_SESSION['its'];
            $index_time_stamp = $this->clean($its, "int");
            if(!$this->phraseModel->indexExists($index_time_stamp) &&
                !$this->crawlModel->isCrawlMix($index_time_stamp)) {
                //use the default crawl index
                $index_time_stamp =
                    $this->crawlModel->getCurrentIndexDatabaseName();
            }
        } else {
            //use the default crawl index
            $index_time_stamp =
                $this->crawlModel->getCurrentIndexDatabaseName();
        }

        $index_info = NULL;
        if($this->phraseModel->indexExists($index_time_stamp) ||
            $this->crawlModel->isCrawlMix($index_time_stamp)) {
            $index_info =
                $this->crawlModel->getInfoTimestamp($index_time_stamp);
        }

        if((isset($_REQUEST['q']) && strlen($_REQUEST['q']) > 0)
            || $activity != "query") {
            if($activity == "query") {
                // the query text itself may carry an activity such as
                // cache:url or related:url
                $activity_array = $this->extractActivityQuery();
                $query = $activity_array[0]; // dirty
                $activity = $activity_array[1];
                $arg = $activity_array[2];
            }
            if($activity != "cache") {
                if(!isset($query)) {
                    $query = NULL;
                }
                if(isset($_REQUEST['limit'])) {
                    $limit = $this->clean($_REQUEST['limit'], "int");
                } else {
                    $limit = 0;
                }
                // calculate the results of a search if there is one
                $data = $this->processQuery($query, $activity, $arg,
                    $results_per_page, $limit, $index_time_stamp, $raw);
            } else {
                $highlight = true;
                if(!isset($query)) {
                    // cache activity came from the a= parameter, so q is
                    // optional here; guard against it being absent
                    $query = (isset($_REQUEST['q'])) ?
                        $_REQUEST['q'] : ""; //dirty
                    $query_activity_array = $this->extractActivityQuery();
                    $query_activity = $query_activity_array[1];
                    if($query_activity != "query") {
                        $highlight = false;
                    }
                }
                $this->cacheRequestAndOutput($arg, $highlight, $query,
                    $index_time_stamp);
                return;
            }
        }

        $data['its'] = (isset($index_time_stamp)) ? $index_time_stamp : 0;
        if($index_info !== NULL) {
            if(isset($index_info['IS_MIX'])) {
                $data['INDEX_INFO'] = tl('search_controller_mix_info',
                    $index_info['DESCRIPTION']);
            } else {
                $data['INDEX_INFO'] = tl('search_controller_crawl_info',
                    $index_info['DESCRIPTION'],
                    $index_info['VISITED_URLS_COUNT'],
                    $index_info['COUNT']);
            }
        } else {
            $data['INDEX_INFO'] = "";
        }

        $data['YIOOP_TOKEN'] = $this->generateCSRFToken($user);
        $data['ELAPSED_TIME'] = changeInMicrotime($start_time);
        $this->displayView($view, $data);
    }

    /**
     * Searches the database for the most relevant pages for the supplied
     * search terms and returns the data needed to render them.
     *
     * @param string $query a string containing the words to search on
     * @param string $activity besides a straight search for words query,
     *      one might have other searches, such as a search for related pages.
     *      this argument says what kind of search to do.
     * @param string $arg for a search other than a straight word query this
     *      argument provides auxiliary information on how to conduct the
     *      search. For instance on a related web page search, it might provide
     *      the url of the site with which to perform the related search.
     * @param int $results_per_page the maximum number of search results
     *      that can occur on a page
     * @param int $limit the first page of all the pages with the query terms
     *      to return. For instance, if 10 then the tenth highest ranking page
     *      for those query terms will be returned, then the eleventh, etc.
     * @param int $index_name the timestamp of an index to use, if 0 then
     *      default used
     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
     *      no grouping but page look-up for links, ($raw == 2)
     *      no grouping done on data
     *
     * @return array associative array with keys PAGES, TOTAL_ROWS, LIMIT,
     *      RESULTS_PER_PAGE, QUERY and PAGING_QUERY; or an array with only
     *      a SCRIPT key holding an error message when no index is available
     */
    function processQuery($query, $activity, $arg, $results_per_page,
        $limit = 0, $index_name = 0, $raw = 0)
    {
        $data = array();
        $no_index_given = false;
        if($index_name == 0) {
            $index_name = $this->crawlModel->getCurrentIndexDatabaseName();
            $no_index_given = true;
        }
        $is_mix = $this->crawlModel->isCrawlMix($index_name);
        if($no_index_given &&
            (!$this->phraseModel->indexExists($index_name) && !$is_mix)) {
            $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >".
                tl('search_controller_no_index_set'). "</h1>');";
            return $data;
        }

        $this->phraseModel->index_name = $index_name;
        // collect the extra meta words each indexing plugin knows about
        $this->phraseModel->additional_meta_words = array();
        foreach($this->indexing_plugins as $plugin) {
            $plugin_name = ucfirst($plugin)."Plugin";
            $plugin_obj = new $plugin_name();
            $tmp_meta_words = $plugin_obj->getAdditionalMetaWords();
            $this->phraseModel->additional_meta_words =
                array_merge($this->phraseModel->additional_meta_words,
                    $tmp_meta_words);
        }
        $this->crawlModel->index_name = $index_name;

        // a no:cache token in the query turns off use of cached results
        $original_query = $query;
        $query = preg_replace('/no:cache/', "", $query);
        $use_cache_if_possible = ($original_query == $query);

        switch($activity)
        {
            case "related":
                $data['QUERY'] = "related:$arg";
                $url = $arg;
                list($summary_offset, $generation) =
                    $this->phraseModel->lookupSummaryOffsetGeneration($url);
                $crawl_item = $this->crawlModel->getCrawlItem(
                    $summary_offset, $generation);

                // search on the top three phrases of the given page
                $top_phrases =
                    $this->phraseModel->getTopPhrases($crawl_item, 3);
                $top_query = implode(" ", $top_phrases);
                $phrase_results = $this->phraseModel->getPhrasePageResults(
                    $top_query, $limit, $results_per_page, false, NULL,
                    $use_cache_if_possible, $raw);
                $data['PAGING_QUERY'] = "index.php?c=search&a=related&arg=".
                    urlencode($url);
                $data['QUERY'] = urlencode(
                    $this->clean($data['QUERY'], "string"));
            break;

            case "query":
            default:
                if(trim($query) != "") {
                    // pull out (and remember the first) m: / mix: meta word
                    $mix_metas = array("m:", "mix:");
                    foreach($mix_metas as $mix_meta) {
                        $pattern = "/(\s)($mix_meta(\S)+)/";
                        preg_match_all($pattern, $query, $matches);
                        if(isset($matches[2][0]) && !isset($mix_name)) {
                            $mix_name = substr($matches[2][0],
                                strlen($mix_meta));
                            $mix_name = str_replace("+", " ", $mix_name);
                        }
                        $query = preg_replace($pattern, "", $query);
                    }
                    if(isset($mix_name)) {
                        $tmp = $this->crawlModel->getCrawlMixTimestamp(
                            $mix_name);
                        if($tmp != false) {
                            $index_name = $tmp;
                            $is_mix = true;
                        }
                    }
                    if($is_mix) {
                        $mix = $this->crawlModel->getCrawlMix($index_name);
                        $query = $this->phraseModel->rewriteMixQuery(
                            $query, $mix);
                    }
                    $filter = $this->searchfiltersModel->getFilter();
                    $phrase_results =
                        $this->phraseModel->getPhrasePageResults(
                            $query, $limit, $results_per_page, true, $filter,
                            $use_cache_if_possible, $raw);
                    // report back what the user typed, not the rewrite
                    $query = $original_query;
                }
                $data['PAGING_QUERY'] = "index.php?q=".urlencode($query);
                $data['QUERY'] = urlencode($this->clean($query, "string"));
            break;
        }

        $data['PAGES'] = (isset($phrase_results['PAGES'])) ?
            $phrase_results['PAGES'] : array();
        $data['TOTAL_ROWS'] = (isset($phrase_results['TOTAL_ROWS'])) ?
            $phrase_results['TOTAL_ROWS'] : 0;
        $data['LIMIT'] = $limit;
        $data['RESULTS_PER_PAGE'] = $results_per_page;

        return $data;
    }

    /**
     * This method is responsible for parsing out the kind of query
     * from the raw query string
     *
     * It reads $_REQUEST['q'] (treated as empty if absent), normalizes its
     * white space and looks at each space separated token for one that
     * begins with "activity:" for an activity in $this->activities. The
     * last such token determines the activity and its argument; the
     * returned query text is restarted at that token (which is itself
     * kept in the text).
     *
     * @return array triple (query text, activity name, activity argument)
     */
    function extractActivityQuery()
    {
        $raw_query = (isset($_REQUEST['q'])) ? $_REQUEST['q'] : "";
        $query = mb_ereg_replace("(\s)+", " ", $raw_query);
        // operate on the already normalized string, not the raw request
        $query = mb_ereg_replace("\s:\s", ":", $query);
        $query_parts = mb_split(" ", $query);
        $count = count($query_parts);

        $out_query = "";
        $activity = "query";
        $arg = "";
        $space = "";
        for($i = 0; $i < $count; $i++) {
            foreach($this->activities as $a_activity) {
                $in_pos = mb_strpos($query_parts[$i], "$a_activity:");
                if($in_pos === 0) {
                    $out_query = "";
                    $activity = $a_activity;
                    $arg = mb_substr($query_parts[$i],
                        strlen("$a_activity:"));
                    // no activity name is a prefix of another, so no
                    // later activity can also match this token
                    break;
                }
            }
            $out_query .= $space.$query_parts[$i];
            $space = " ";
        }

        return array($out_query, $activity, $arg);
    }

    /**
     * Computes the marker string put in front of occurrences of a word
     * in a cached page so they can later be wrapped in highlight spans.
     *
     * The marker is the crawlHash of the word with any case-insensitive
     * occurrences of the word itself removed, so that the marker is not
     * matched when searching for the word.
     *
     * @param string $word the search term to compute a marker for
     * @return string marker to prefix matches of $word with
     */
    private function getMarkPrefix($word)
    {
        $mark_prefix = crawlHash($word);
        if(stristr($mark_prefix, $word) !== false) {
            $mark_prefix = preg_replace(
                "/".preg_quote($word, "/")."/i", '', $mark_prefix);
        }
        return $mark_prefix;
    }

    /**
     * Used in rendering a cached web page to highlight the search terms.
     *
     * Recursively walks the children of $node and, in each text node,
     * prefixes every case-insensitive occurrence of a word of length at
     * least 2 with that word's marker (see getMarkPrefix).
     *
     * @param object $node DOM object to mark html elements of
     * @param array $words an array of words to be highlighted
     * @param object $dom a DOM object for the whole document
     * @return object the node modified to now have highlighting
     */
    function markChildren($node, $words, $dom)
    {
        if(!isset($node->childNodes->length)) {
            return $node;
        }
        // children are replaced one-for-one so the length does not change
        for($k = 0; $k < $node->childNodes->length; $k++) {
            $child = $node->childNodes->item($k);
            if(!$child) {
                break;
            }

            $clone = $child->cloneNode(true);

            if($clone->nodeType == XML_TEXT_NODE) {
                $text = $clone->textContent;

                foreach($words as $word) {
                    //only mark string of length at least 2
                    if(strlen($word) > 1) {
                        $mark_prefix = $this->getMarkPrefix($word);
                        // quote the word so it is matched literally
                        $text = preg_replace(
                            "/".preg_quote($word, "/")."/i",
                            $mark_prefix.'$0', $text);
                    }
                }
                $textNode = $dom->createTextNode($text);
                $node->replaceChild($textNode, $child);
            } else {
                $clone = $this->markChildren($clone, $words, $dom);
                $node->replaceChild($clone, $child);
            }
        }

        return $node;
    }

    //*********BEGIN SEARCH API *********
    /**
     * Part of Yioop! Search API. Performs a normal search query and returns
     * associative array of query results
     *
     * @param string $query this can be any query string that could be
     *      entered into the search bar on Yioop! (other than related: and
     *      cache: queries)
     * @param int $results_per_page number of results to return
     * @param int $limit first result to return from the ordered query results
     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
     *      no grouping but page look-up for links, ($raw == 2)
     *      no grouping done on data
     *
     * @return array associative array of results for the query performed,
     *      or NULL if API_ACCESS is off
     */
    public function queryRequest($query, $results_per_page, $limit = 0,
        $raw = 0)
    {
        // 0 selects the default index; $raw must go in processQuery's
        // seventh position, not the index position
        return (API_ACCESS) ?
            $this->processQuery($query, "query", "", $results_per_page,
                $limit, 0, $raw) : NULL;
    }

    /**
     * Part of Yioop! Search API. Performs a related to a given url
     * search query and returns associative array of query results
     *
     * @param string $url to find related documents for
     * @param int $results_per_page number of results to return
     * @param int $limit first result to return from the ordered query results
     * @param int $crawl_time timestamp of the index to search, if 0 then
     *      the default index is used
     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
     *      no grouping but page look-up for links, ($raw == 2)
     *      no grouping done on data
     *
     * @return array associative array of results for the query performed,
     *      or NULL if API_ACCESS is off
     */
    public function relatedRequest($url, $results_per_page, $limit = 0,
        $crawl_time = 0, $raw = 0)
    {
        return (API_ACCESS) ?
            $this->processQuery("", "related", $url, $results_per_page,
                $limit, $crawl_time, $raw) : NULL;
    }

    /**
     * Part of Yioop! Search API. Looks up the cached version of a url
     * and returns it as a string rather than outputting it.
     *
     * @param string $url to get cached page for
     * @param bool $highlight whether to put the search terms in the page
     *      in colored span tags.
     * @param string $terms space separated list of search terms
     * @param string $crawl_time timestamp of crawl to look for cached page in
     *
     * @return mixed string with contents of cached page, or false if
     *      API_ACCESS is off
     */
    public function cacheRequest($url, $highlight = true, $terms = "",
        $crawl_time = 0)
    {
        if(!API_ACCESS) {
            return false;
        }
        // capture what cacheRequestAndOutput echoes
        ob_start();
        $this->cacheRequestAndOutput($url, $highlight, $terms, $crawl_time);
        return ob_get_clean();
    }
    //*********END SEARCH API *********

    /**
     * Used to get and render a cached web page
     *
     * The rendered page is echoed. If no cached copy can be found the
     * nocache view is displayed instead (with the page summary when one
     * is available).
     *
     * @param string $url the url of the page to find the cached version of
     * @param bool $highlight whether or not to highlight the query terms in
     *      the cached page
     * @param string $terms the list of query terms
     * @param int $crawl_time the timestamp of the crawl to look up the cached
     *      page in
     */
    function cacheRequestAndOutput($url, $highlight = true, $terms = "",
        $crawl_time = 0)
    {
        global $CACHE;

        $hash_key = crawlHash(
            $terms.$url.serialize($highlight).serialize($crawl_time));
        if(USE_CACHE) {
            if($newDoc = $CACHE->get($hash_key)) {
                echo $newDoc;
                return;
            }
        }

        if($crawl_time == 0) {
            $crawl_time = $this->crawlModel->getCurrentIndexDatabaseName();
        }

        $this->phraseModel->index_name = $crawl_time;
        $this->crawlModel->index_name = $crawl_time;

        list($summary_offset, $generation, $cache_partition) =
            $this->phraseModel->lookupSummaryOffsetGeneration($url);

        $data = array();
        if(!$crawl_item = $this->crawlModel->getCrawlItem($summary_offset,
            $generation)) {
            $this->displayView("nocache", $data);
            return;
        }
        $summary_string =
            wordwrap($crawl_item[self::TITLE], 80, "\n")."\n\n" .
            wordwrap($crawl_item[self::DESCRIPTION], 80, "\n")."\n\n".
            wordwrap(print_r($crawl_item[self::LINKS], true), 80, "\n");

        // the robot table maps a crawler instance to the machine (and uri)
        // holding its cache files
        $robot_instance = $crawl_item[self::ROBOT_INSTANCE];
        $robot_table_name = CRAWL_DIR."/robot_table.txt";
        $robot_table = array();
        if(file_exists($robot_table_name)) {
            $robot_table = unserialize(file_get_contents($robot_table_name));
        }
        if(!isset($robot_table[$robot_instance])) {
            $data["SUMMARY_STRING"] = $summary_string;
            $this->displayView("nocache", $data);
            return;
        }
        $machine = $robot_table[$robot_instance][0];
        $machine_uri = $robot_table[$robot_instance][1];
        $offset = $crawl_item[self::OFFSET];
        $cache_item = $this->crawlModel->getCacheFile($machine,
            $machine_uri, $cache_partition, $offset, $crawl_time);
        if(!isset($cache_item[self::PAGE])) {
            $data["SUMMARY_STRING"] = $summary_string;
            $this->displayView("nocache", $data);
            return;
        }

        $cache_file = $cache_item[self::PAGE];

        if(!stristr($cache_item[self::TYPE], "image")) {
            // strip meta words and punctuation so only plain words remain
            $meta_words = array('link\:', 'site\:', 'version\:',
                'modified\:', 'filetype\:', 'info\:', '\-', 'os\:',
                'server\:', 'date\:', 'lang\:', 'elink\:',
                'index:', 'ip:', 'i:', 'weight:', 'w:', 'u:');
            foreach($meta_words as $meta_word) {
                $pattern = "/(\s)($meta_word(\S)+)/";
                $terms = preg_replace($pattern, "", $terms);
            }
            $terms = str_replace("'", " ", $terms);
            $terms = str_replace('"', " ", $terms);
            $terms = str_replace('\\', " ", $terms);
            $terms = str_replace('|', " ", $terms);
            $terms = $this->clean($terms, "string");

            $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $terms);
            $words = mb_split(" ", $phrase_string);
            if(!$highlight) {
                $words = array();
            }
        } else {
            // images are embedded in a wrapper page as a data: uri
            $type = $cache_item[self::TYPE];
            $cache_file = "<html><head><title>Yioop! Cache</title></head>".
                "<body><object data='data:$type;base64,".
                base64_encode($cache_file)."' type='$type' /></body></html>";
            $words = array();
        }

        $date = date("F d Y H:i:s", $cache_item[self::TIMESTAMP]);

        // the xml processing instruction forces UTF-8 parsing; loadHTML
        // warnings on malformed cached pages are deliberately suppressed
        $dom = new DOMDocument();
        @$dom->loadHTML('<?xml encoding="UTF-8">' . $cache_file);
        foreach($dom->childNodes as $item) {
            if($item->nodeType == XML_PI_NODE) {
                $dom->removeChild($item); // remove hack
            }
        }
        $dom->encoding = "UTF-8"; // insert proper

        $body = $dom->getElementsByTagName('body')->item(0);

        //make tags, if no body, treat the page as plain text
        if($body == false) {
            $cache_file = "<html><head><title>Yioop! Cache</title></head>".
                "<body>".htmlentities($cache_file)."</body></html>";
            $dom = new DOMDocument();
            @$dom->loadHTML($cache_file);
            $body = $dom->getElementsByTagName('body')->item(0);
        }
        $first_child = $body->firstChild;

        // hidden block holding the page summary, toggled by the link below
        $summaryNode = $dom->createElement('pre');
        $summaryNode = $body->insertBefore($summaryNode, $first_child);
        $summaryNode->setAttributeNS("","style", "border-color: black; ".
            "border-style:solid; border-width:3px; ".
            "padding: 5px; background-color: white; display:none;");
        $summaryNode->setAttributeNS("","id", "summary-page-id");

        $textNode = $dom->createTextNode($summary_string);
        $summaryNode->appendChild($textNode);

        $scriptNode = $dom->createElement('script');
        $scriptNode = $body->insertBefore($scriptNode, $summaryNode);
        $textNode = $dom->createTextNode("var summaryShow = 'none';");
        $scriptNode->appendChild($textNode);

        $preNode = $dom->createElement('pre');
        $preNode = $body->insertBefore($preNode, $summaryNode);
        $preNode->setAttributeNS("","style", "border-color: black; ".
            "border-style:solid; border-width:3px; ".
            "padding: 5px; background-color: white");

        $divNode = $dom->createElement('div');
        $divNode = $body->insertBefore($divNode, $preNode);
        $divNode->setAttributeNS("","style", "border-color: black; ".
            "border-style:solid; border-width:3px; ".
            "padding: 5px; background-color: white");

        $textNode = $dom->createTextNode(tl('search_controller_cached_version',
            "$url", $date));
        $divNode->appendChild($textNode);

        if(isset($cache_item[self::HEADER])) {
            $textNode = $dom->createTextNode($cache_item[self::HEADER]."\n");
        } else {
            $textNode = $dom->createTextNode("");
        }
        $preNode->appendChild($textNode);

        $aNode = $dom->createElement("a");
        $aTextNode = $dom->createTextNode(
            tl('search_controller_summary_data'));
        $aNode->setAttributeNS("","onclick", "javascript:".
            "summaryShow=(summaryShow!='block')?'block':'none';".
            "elt=document.getElementById('summary-page-id');".
            "elt.style.display=summaryShow;");
        $aNode->setAttributeNS("","style", "text-decoration: underline; ".
            "cursor: pointer");
        $aNode->appendChild($aTextNode);
        $aNode = $preNode->appendChild($aNode);

        // first pass: put a marker in front of every search term ...
        $body = $this->markChildren($body, $words, $dom);

        $newDoc = $dom->saveHTML();

        // ... second pass: turn marker+term into a colored span and drop
        // any left over markers
        $colors = array("yellow", "orange", "grey", "cyan");
        $color_count = count($colors);

        $i = 0;
        foreach($words as $word) {
            //only mark string of length at least 2
            if(strlen($word) > 1) {
                $mark_prefix = $this->getMarkPrefix($word);
                $match = preg_quote($mark_prefix.$word, "/");
                $newDoc = preg_replace("/$match/i",
                    '<span style="background-color:'.
                    $colors[$i].'">$0</span>', $newDoc);
                $i = ($i + 1) % $color_count;
                $newDoc = preg_replace(
                    "/".preg_quote($mark_prefix, "/")."/", "", $newDoc);
            }
        }

        if(USE_CACHE) {
            $CACHE->set($hash_key, $newDoc);
        }

        echo $newDoc;
        return;
    }
}
?>