Last commit for controllers/search_controller.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]
First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2012  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage controller
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2012
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/**Load base controller class, if needed. */
require_once BASE_DIR."/controllers/controller.php";
/** To extract words from the query*/
require_once BASE_DIR."/lib/phrase_parser.php";
/** Get the crawlHash function */
require_once BASE_DIR."/lib/utility.php";
/** Loads common constants for web crawling */
require_once BASE_DIR."/lib/crawl_constants.php";
/** For getting pages from a mirror if decide not to handle ourselves*/
require_once BASE_DIR."/lib/fetch_url.php";
/**
 * Controller used to handle search requests to SeekQuarry
 * search site. Used to both get and display
 * search results.
 *
 * @author Chris Pollett
 * @package seek_quarry
 * @subpackage controller
 */
class SearchController extends Controller implements CrawlConstants
{
    /**
     * Says which models to load for this controller.
     * PhraseModel is used to extract words from the query; CrawlModel
     * is used for cached web page requests
     * @var array
     */
    var $models = array("phrase", "crawl", "searchfilters", "machine",
        "source", "cron");
    /**
     * Says which views to load for this controller.
     * The SearchView is used for displaying general search results as well
     * as the initial search screen; NocacheView
     * is used on a cached web page request that fails; RssView is used
     * to present search results according to the opensearch.org rss results
     * format.
     * @var array
     */
    var $views = array("search",  "nocache", "rss");
    /**
     * Says which activities (roughly, methods invoked from the web) this
     * controller will respond to
     * @var array
     */
    var $activities = array("query", "cache", "related", "signout");

    /**
     * Name of the sub-search currently in use
     * @var string
     */
    var $subsearch_name = "";

    /**
     * The localization identifier for the current subsearch
     * @var string
     */
    var $subsearch_identifier = "";

    /**
     *  Number of seconds that must elapse after last call before doing
     *  news cron activities (mainly download most recent feeds)
     */
    const NEWS_UPDATE_INTERVAL = 3600;

    /**
     *  Number of seconds that must elapse after last call before culling
     *  all news items (to get rid of old ones)
     */
    const NEWS_DELETE_INTERVAL = 86400; //one day

    /**
     * This is the main entry point for handling a search request.
     *
     * processRequest first works out the output format (HTML search view,
     * rss, or serialized PHP), the subsearch in use, the number of results
     * per page, the current user and the index timestamp to search. It then
     * determines the type of search request (normal query, cache request,
     * or related request), or whether the user is returning from the admin
     * panel via signout, and calls the appropriate method to handle the
     * given activity. Finally, it draws the chosen view (or, for the serial
     * format, echoes the serialized data and exits).
     *
     * Input is read from $_REQUEST, $_SESSION and $_SERVER; nothing is
     * returned.
     */
    function processRequest()
    {
        $data = array();
        $view = "search";
        $web_flag = true;
        $start_time = microtime();

        /* Choose the output format. rss and serial output are only honored
           when RSS_ACCESS is on; any other request is a web request and is
           silently dropped when WEB_ACCESS is off.
         */
        if(isset($_REQUEST['f']) && $_REQUEST['f']=='rss' &&
            RSS_ACCESS) {
            $view = "rss";
            $web_flag = false;
        } else if(isset($_REQUEST['f']) && $_REQUEST['f']=='serial' &&
            RSS_ACCESS) {
            $view = "serial";
            $web_flag = false;
        } else if (!WEB_ACCESS) {
            return;
        }
        $subsearches = $this->sourceModel->getSubsearches();
        $no_query = false;
        /* If a subsearch folder name was supplied (s=...), it must match one
           of the known subsearches; otherwise the error page is shown.
         */
        if(isset($_REQUEST["s"])) {
            $search_found = false;
            foreach($subsearches as $search) {
                if($search["FOLDER_NAME"] == $_REQUEST["s"]) {
                    $search_found = true;
                    $this->subsearch_name = $_REQUEST["s"];
                    $this->subsearch_identifier = $search["INDEX_IDENTIFIER"];
                    // a subsearch may supply its own default page size
                    if(!isset($_REQUEST['num']) && isset($search["PER_PAGE"])) {
                        $_REQUEST['num']= $search["PER_PAGE"];
                    }
                    break;
                }
            }
            if(!$search_found) {
                $pathinfo = pathinfo($_SERVER['SCRIPT_FILENAME']);
                include($pathinfo["dirname"]."/error.php");
                exit();
            }
            /* An empty query on the news subsearch is turned into a
               lang:xx query built from the primary part of the locale tag;
               $no_query remembers that the user did not type this query.
             */
            if($this->subsearch_name == "news" &&
                (!isset($_REQUEST['q']) || $_REQUEST['q']=="")) {
                $lang = getLocaleTag();
                $lang_parts = explode("-", $lang);
                if(isset($lang_parts[0])){
                    $lang = $lang_parts[0];
                }
                $_REQUEST['q'] = "lang:".$lang;
                $no_query = true;
            }
        }
        // page size: request value, then session value, then site default
        if(isset($_REQUEST['num'])) {
            $results_per_page = $this->clean($_REQUEST['num'], "int");
        } else if(isset($_SESSION['MAX_PAGES_TO_SHOW']) ) {
            $results_per_page = $_SESSION['MAX_PAGES_TO_SHOW'];
        } else {
            $results_per_page = NUM_RESULTS_PER_PAGE;
        }

        /* $user is the session user id when the CSRF token checks out;
           otherwise the session user is dropped and the remote address is
           used as the user identifier.
         */
        if(isset($_SESSION['USER_ID'])) {
            $user = $_SESSION['USER_ID'];
            $token_okay = $this->checkCSRFToken('YIOOP_TOKEN', $user);
            if($token_okay === false) {
                unset($_SESSION['USER_ID']);
                $user = $_SERVER['REMOTE_ADDR'];
            }
        } else {
            $user = $_SERVER['REMOTE_ADDR'];
        }
        // queries coming from listed robots are replaced by the empty query
        if(isset($_REQUEST['q'])) {
            $_REQUEST['q'] = $this->restrictQueryByUserAgent($_REQUEST['q']);
        }
        if(isset($_REQUEST['raw'])){
            $raw = max($this->clean($_REQUEST['raw'], "int"), 0);
        } else {
            $raw = 0;
        }
        /* Determine the activity. Anything not listed in $this->activities,
           or a listed activity that arrives without an 'arg' value, is
           treated as a plain "query".
         */
        if(isset($_REQUEST['a'])) {
            if(in_array($_REQUEST['a'], $this->activities)) {

                $activity = $_REQUEST['a'];

                if($activity == "signout") {
                    unset($_SESSION['USER_ID']);
                    $user = $_SERVER['REMOTE_ADDR'];
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >".
                        tl('search_controller_logout_successful')."</h1>')";
                }

                if(isset($_REQUEST['arg'])) {
                    $arg = $_REQUEST['arg'];
                } else {
                    $activity = "query";
                }
            } else {
                $activity = "query";
            }
        } else {
            $activity = "query";
        }

        // plain queries may be answered by a mirror instead of this machine
        if($activity == "query" && $this->checkMirrorHandle()) {return; }

        $machine_urls = $this->machineModel->getQueueServerUrls();

        if(isset($_REQUEST['machine'])) {
            $current_machine = $this->clean($_REQUEST['machine'], 'int');
        } else {
            $current_machine = 0;
        }
        $this->phraseModel->current_machine = $current_machine;
        $this->crawlModel->current_machine = $current_machine;
        $current_its = $this->crawlModel->getCurrentIndexDatabaseName();

        /* Work out which index timestamp ("its") to search: the request
           value wins over the session value; 0 or nothing supplied means
           use the current default index.
         */
        if(isset($_REQUEST['its']) || isset($_SESSION['its'])) {
            $its = (isset($_REQUEST['its'])) ? $_REQUEST['its'] :
                $_SESSION['its'];
            $index_time_stamp = $this->clean($its, "int");
            if($index_time_stamp != 0 ) {
                //validate timestamp against list
                //(some crawlers replay deleted crawls)
                $crawls = $this->crawlModel->getCrawlList(false,true,
                    $machine_urls,true);
                $is_mix = false;
                if($this->crawlModel->isCrawlMix($index_time_stamp)) {
                    $is_mix = true;
                }
                $found_crawl = false;
                foreach($crawls as $crawl) {
                    if($index_time_stamp == $crawl['CRAWL_TIME']) {
                        $found_crawl = true;
                        break;
                    }
                }
                /* Unknown, non-mix timestamp: show the error page if an
                   actual search was requested, otherwise quietly fall back
                   to the default index.
                 */
                if(!$is_mix && ( !$found_crawl && (isset($_REQUEST['q']) ||
                    isset($_REQUEST['arg'])))) {
                    unset($_SESSION['its']);
                    include(BASE_DIR."/error.php");
                    exit();
                } else if(!$found_crawl) {
                    unset($_SESSION['its']);
                    $index_time_stamp = $current_its;
                }
            } else {
                $index_time_stamp = $current_its;
                    //use the default crawl index
            }
        } else {
            $index_time_stamp = $current_its;
                //use the default crawl index
        }
        /* For web requests, look up the description/counts of the chosen
           index; if it has no info or is empty, retry with the default
           index.
           NOTE(review): when $web_flag is false and $index_time_stamp is
           nonzero, $index_info is never assigned; its later use is guarded
           by $web_flag.
         */
        if($web_flag && $index_time_stamp != 0 ) {
            $index_info =  $this->crawlModel->getInfoTimestamp(
                $index_time_stamp, $machine_urls);
            if($index_info == array() || !isset($index_info["COUNT"]) ||
                $index_info["COUNT"] == 0) {
                if($index_time_stamp != $current_its) {
                    $index_time_stamp = $current_its;
                    $index_info =  $this->crawlModel->getInfoTimestamp(
                        $index_time_stamp, $machine_urls);
                    if($index_info == array()) { $index_info = NULL; }
                }
            }
        } else if ($index_time_stamp == 0) {
            $index_info = NULL;
        }

        // only do search work if there is a query or a non-query activity
        if(isset($_REQUEST['q']) && strlen($_REQUEST['q']) > 0
            || $activity != "query") {
            if($activity == "query") {
                /* the query text itself may name an activity, for example
                   related:some_url
                 */
                $activity_array = $this->extractActivityQuery();
                $query = $activity_array[0]; // dirty
                $activity = $activity_array[1];
                $arg = $activity_array[2];
            }

            if($activity != "cache") {
                if(!isset($query)) {
                    $query = NULL;
                }
                if(isset($_REQUEST['limit'])) {
                    $limit = $this->clean($_REQUEST['limit'], "int");
                } else {
                    $limit = 0;
                }
                $data =
                    $this->processQuery(
                        $query, $activity, $arg,
                        $results_per_page, $limit, $index_time_stamp, $raw);
                        // calculate the results of a search if there is one
            } else {
                /* Cache requests output the cached page directly and end
                   request handling here. Highlighting is switched off when
                   the accompanying query is itself an activity query.
                 */
                $highlight = true;
                if(!isset($query)) {
                    $query = $_REQUEST['q']; //dirty
                    list(,$query_activity,) = $this->extractActivityQuery();
                    if($query_activity != "query") {$highlight = false;}
                }
                $this->cacheRequestAndOutput($arg,
                    $highlight, $query, $index_time_stamp);
                return;
            }
        }

        $data['its'] = (isset($index_time_stamp)) ? $index_time_stamp : 0;
        // human readable blurb about the index searched (web view only)
        if($web_flag && $index_info !== NULL) {
            if(isset($index_info['IS_MIX'])) {
                $data['INDEX_INFO'] = tl('search_controller_mix_info',
                    $index_info['DESCRIPTION']);
            } else {
                if(isset($index_info['DESCRIPTION']) &&
                    isset($index_info['VISITED_URLS_COUNT']) &&
                    isset($index_info['COUNT']) ) {
                    $data['INDEX_INFO'] = tl('search_controller_crawl_info',
                        $index_info['DESCRIPTION'],
                        $index_info['VISITED_URLS_COUNT'],
                        $index_info['COUNT']);
                } else {
                    $data['INDEX_INFO'] = "";
                }
            }
        } else {
            $data['INDEX_INFO'] = "";
        }

        $data['ELAPSED_TIME'] = changeInMicrotime($start_time);
        /* Serial output: drop OUT_SCORE, round the score fields to three
           decimals (stored as strings), echo the serialized data and stop.
         */
        if ($view == "serial") {
            if(isset($data["PAGES"])) {
                $count = count($data["PAGES"]);
                for($i = 0; $i < $count; $i++) {
                    unset($data["PAGES"][$i]["OUT_SCORE"]);
                    $data["PAGES"][$i][self::SCORE]= "".
                        round($data["PAGES"][$i][self::SCORE], 3);
                    $data["PAGES"][$i][self::DOC_RANK]= "".
                        round($data["PAGES"][$i][self::DOC_RANK], 3);
                    $data["PAGES"][$i][self::RELEVANCE]= "".
                        round($data["PAGES"][$i][self::RELEVANCE], 3);
                }
            }
            echo serialize($data);
            exit();
        }
        $stats_file = CRAWL_DIR."/cache/".self::statistics_base_name.
                $data['its'].".txt";
        $data["SUBSEARCHES"] = $subsearches;
        if($this->subsearch_name != "" && $this->subsearch_identifier != "") {
            $data["SUBSEARCH"] = $this->subsearch_name;
        }
        $data["HAS_STATISTICS"] = file_exists($stats_file);
        $data['YIOOP_TOKEN'] = $this->generateCSRFToken($user);
        // thumbnails and feed items are only grouped for the grouped web view
        if($view == "search" && $raw == 0 && isset($data['PAGES'])) {
            $data['PAGES'] = $this->makeMediaGroups($data['PAGES']);
        }
        $data['INCLUDE_SCRIPTS'] = array("suggest");
        /* NOTE(review): $data['PAGING_QUERY'] is only set by processQuery,
           so it may be unset here (e.g. no_query given without a query);
           also this appends a raw & while processQuery uses &amp; - confirm.
         */
        if($no_query || isset($_REQUEST['no_query'])) {
            $data['NO_QUERY'] = true;
            $data['PAGING_QUERY'] .= "&no_query=true";
        }
        $this->displayView($view, $data);
    }

    /**
     * Sometimes robots disobey the statistics page nofollow meta tag
     * and need to be stopped before they query the whole index. This
     * method blanks out the query of any request whose user agent is
     * missing or contains the name of one of a short list of known bots.
     *
     * @param string $query the search request string
     * @return string the search request string if not a bot; "" otherwise
     */
    function restrictQueryByUserAgent($query)
    {
        // a request without any user agent is treated like a robot
        if(!isset($_SERVER["HTTP_USER_AGENT"])) {
            return "";
        }
        $agent = $_SERVER["HTTP_USER_AGENT"];
        foreach(array("googlebot", "baidu", "naver", "sogou") as $bot_name) {
            // case-insensitive substring test against the user agent
            if(stristr($agent, $bot_name)) {
                return "";
            }
        }
        return $query;
    }


    /**
     * Used to check if there are any mirrors of the current server.
     * If so, it tries to distribute the query requests randomly amongst
     * the mirrors (with the current machine itself being one of the
     * possible choices). When a mirror is chosen, the page it returns is
     * echoed as the response.
     *
     * @return bool whether or not a mirror of the current site handled it
     */
    function checkMirrorHandle()
    {
        $table_path = CRAWL_DIR."/".self::mirror_table_name;
        if(!file_exists($table_path)) {
            return false;
        }
        $table = unserialize(file_get_contents($table_path));
        $now = time();
        $live_mirrors = array();
        foreach($table['machines'] as $machine) {
            // skip mirrors not heard from within two notify periods
            if($now - $machine[3] >= 2 * MIRROR_NOTIFY_FREQUENCY) {
                continue;
            }
            // an IPv6 loopback address needs brackets inside a url
            $host = ($machine[0] == "::1") ? "[::1]" : $machine[0];
            $live_mirrors[] = "http://".$host.$machine[1];
        }
        $num_mirrors = count($live_mirrors);
        if($num_mirrors == 0) {
            return false;
        }
        mt_srand();
        $choice = mt_rand(0, $num_mirrors);
        // $choice == $num_mirrors means let the current machine handle it
        if($choice >= $num_mirrors) {
            return false;
        }
        $mirror_request = $live_mirrors[$choice]."?".
            $_SERVER["QUERY_STRING"];
        echo FetchUrl::getPage($mirror_request);
        return true;
    }

    /**
     * Searches the index for the most relevant pages for the supplied search
     * terms and returns the data needed by a view to render them. As a side
     * effect it also triggers the periodic news feed delete/update work
     * when enough time has passed since these were last run.
     *
     * @param string $query a string containing the words to search on
     * @param string $activity besides a straight search for words query,
     *      one might have other searches, such as a search for related pages.
     *      this argument says what kind of search to do.
     * @param string $arg for a search other than a straight word query this
     *      argument provides auxiliary information on how to conduct the
     *      search. For instance on a related web page search, it might provide
     *      the url of the site with which to perform the related search.
     * @param int $results_per_page the maximum number of search results
     *      that can occur on a page
     * @param int $limit the first page of all the pages with the query terms
     *      to return. For instance, if 10 then the tenth highest ranking page
     *      for those query terms will be returned, then the eleventh, etc.
     * @param int $index_name the timestamp of an index to use, if 0 then
     *      default used
     * @param int $raw ($raw == 0) normal grouping, $raw > 0
     *      no grouping done on data. If $raw == 1 no summary returned (used
     *      with f=serial, end user probably does not want)
     *      In this case, will get offset, generation, etc so could later lookup
     * @return array view data with fields such as PAGES (at most
     *      results_per_page many search results), TOTAL_ROWS, LIMIT,
     *      RESULTS_PER_PAGE, QUERY and PAGING_QUERY; or just ERROR and SCRIPT
     *      fields if no default index has been set
     */
    function processQuery($query, $activity, $arg, $results_per_page,
        $limit = 0, $index_name = 0, $raw = 0)
    {
        // index 0 means "use whatever the current default index is"
        $no_index_given = false;
        if($index_name == 0) {
            $index_name = $this->crawlModel->getCurrentIndexDatabaseName();
            $no_index_given = true;
        }
        $is_mix = $this->crawlModel->isCrawlMix($index_name);
        // bail out with a message if the default index does not exist
        if($no_index_given && (!$this->phraseModel->indexExists($index_name)
            && !$is_mix)) {
            $data["ERROR"] = tl('search_controller_no_index_set');
            $data['SCRIPT'] =
                    "doMessage('<h1 class=\"red\" >".
                    tl('search_controller_no_index_set').
                    "</h1>');";
            return $data;
        }

        $this->phraseModel->index_name = $index_name;
        // gather the extra meta words each indexing plugin says it provides
        $this->phraseModel->additional_meta_words = array();
        foreach($this->indexing_plugins as $plugin) {
            $plugin_name = ucfirst($plugin)."Plugin";
            $plugin_obj = new $plugin_name();
            $tmp_meta_words = $plugin_obj->getAdditionalMetaWords();
            $this->phraseModel->additional_meta_words =
                array_merge($this->phraseModel->additional_meta_words,
                    $tmp_meta_words);
        }

        $this->crawlModel->index_name = $index_name;

        /* strip control words (mix:, raw:, no:...) out of the query, keeping
           the untouched query for building paging links later
         */
        $original_query = $query;
        list($query, $raw, $use_network, $use_cache_if_possible,
            $guess_semantics) =
                $this->calculateControlWords($query, $raw, $is_mix);
        $index_archive_name= self::index_data_base_name.$index_name;
        if(file_exists( CRAWL_DIR."/cache/$index_archive_name/no_network.txt")){
            $_REQUEST['network'] = false;
            //if default index says no network queries then no network queries
        }
        // an empty queue server list means the query is answered locally
        if($use_network &&
            (!isset($_REQUEST['network']) || $_REQUEST['network'] == "true")) {
            $queue_servers = $this->machineModel->getQueueServerUrls();
        } else {
            $queue_servers = array();
        }
        if(isset($_REQUEST['guess']) &&  $_REQUEST['guess'] == "false") {
            $guess_semantics = false;
        }
        switch($activity)
        {
            case "related":
                /* related search: look up the summary of the page at url
                   $arg, take its three most selective phrases and search
                   on those instead of on $query
                 */
                $data['QUERY'] = "related:$arg";
                $url = $arg;
                $crawl_item = $this->crawlModel->getCrawlItem($url,
                    $queue_servers);
                $top_phrases  =
                    $this->getTopPhrases($crawl_item, 3, $index_name);
                $top_query = implode(" ", $top_phrases);
                $filter = $this->searchfiltersModel->getFilter();
                $this->phraseModel->editedPageSummaries =
                    $this->searchfiltersModel->getEditedPageSummaries();
                $phrase_results = $this->phraseModel->getPhrasePageResults(
                    $top_query, $limit, $results_per_page, false, $filter,
                    $use_cache_if_possible, $raw, $queue_servers,
                    $guess_semantics);
                $data['PAGING_QUERY'] = "?c=search&amp;".
                    "a=related&amp;arg=".urlencode($url);
                if(isset($this->subsearch_name) && $this->subsearch_name !="") {
                    $data['PAGING_QUERY'] .= "&amp;s=".
                        $this->subsearch_name;
                }

                $data['QUERY'] = urlencode($this->clean($data['QUERY'],
                    "string"));
            break;

            case "query":
            default:
                // ordinary word search; skipped entirely for a blank query
                if(trim($query) != "") {
                    $filter = $this->searchfiltersModel->getFilter();
                    $this->phraseModel->editedPageSummaries =
                        $this->searchfiltersModel->getEditedPageSummaries();
                    $phrase_results = $this->phraseModel->getPhrasePageResults(
                        $query, $limit, $results_per_page, true, $filter,
                        $use_cache_if_possible, $raw, $queue_servers,
                        $guess_semantics);
                    // links and the search box show what the user typed
                    $query = $original_query;
                }
                $data['PAGING_QUERY'] = "?q=".urlencode($query);
                if(isset($this->subsearch_name) && $this->subsearch_name !="") {
                    $data['PAGING_QUERY'] .= "&amp;s=".
                        $this->subsearch_name;
                }
                $data['QUERY'] = urlencode($this->clean($query,"string"));

            break;
        }
        /* Poor man's cron for news feeds, piggy-backed on search requests:
           old feed items are deleted once per NEWS_DELETE_INTERVAL and feeds
           are re-downloaded once per NEWS_UPDATE_INTERVAL.
           NOTE(review): the $delta == 0 cases presumably cover a cron time
           that has never been recorded - confirm against CronModel.
         */
        $time = time();
        $cron_time = $this->cronModel->getCronTime("news_delete");
        $delta = $time - $cron_time;
        if($delta == 0) {
            $this->cronModel->updateCronTime("news_delete");
        }
        if($delta > self::NEWS_DELETE_INTERVAL) {
            $this->cronModel->updateCronTime("news_delete");
            $this->sourceModel->deleteFeedItems(self::NEWS_DELETE_INTERVAL);
        }
        $cron_time = $this->cronModel->getCronTime("news_update");
        $delta = $time - $cron_time;
        if($delta > self::NEWS_UPDATE_INTERVAL || $delta == 0) {
            $this->cronModel->updateCronTime("news_update");
            $this->sourceModel->updateFeedItems();
        }
        $data['VIDEO_SOURCES'] = $this->sourceModel->getMediaSources("video");
        // $phrase_results is unset when no search was actually carried out
        $data['PAGES'] = (isset($phrase_results['PAGES'])) ?
             $phrase_results['PAGES']: array();

        $data['TOTAL_ROWS'] = (isset($phrase_results['TOTAL_ROWS'])) ?
            $phrase_results['TOTAL_ROWS'] : 0;
        $data['LIMIT'] = $limit;
        $data['RESULTS_PER_PAGE'] = $results_per_page;
        return $data;

    }

    /**
     *  Extracts from the query string any control words:
     *  mix:, m:, raw:, no:cache, no:network, no:guess and returns an array
     *  consisting of the query with these words removed, and then variables
     *  for their values. If a subsearch is in use, its index identifier is
     *  first appended to each disjunct of the query.
     *
     *  @param string $query original query string
     *  @param int $raw the $_REQUEST['raw'] value
     *  @param bool $is_mix if the current index name is that of a crawl mix
     *  @param int $index_name timestamp of the index (or crawl mix) being
     *      searched. If NULL (the default) the index name previously set
     *      on the crawl model is used. Only consulted when $is_mix is true
     *      and the query itself does not name a mix.
     *
     *  @return array ($query, $raw, $use_network,
     *      $use_cache_if_possible, $guess_semantics)
     */
    function calculateControlWords($query, $raw, $is_mix, $index_name = NULL)
    {
        /* Previously $index_name was undefined here unless the query
           contained a m:/mix: word, so a search on an index that was itself
           a crawl mix looked up a mix with an undefined timestamp.
         */
        if($index_name === NULL) {
            $index_name = (isset($this->crawlModel->index_name)) ?
                $this->crawlModel->index_name : 0;
        }
        if(trim($query) != "") {
            if($this->subsearch_identifier != "") {
                // restrict every disjunct of the query to the subsearch
                $replace = " {$this->subsearch_identifier}";
                $query = preg_replace('/\|/', "$replace |", $query);
                $query .= " $replace";
            }
        }
        // leading space lets the \s-anchored patterns match a first word
        $query = " $query";
        $mix_metas = array("m:", "mix:");
        foreach($mix_metas as $mix_meta) {
            $pattern = "/(\s)($mix_meta(\S)+)/";
            preg_match_all($pattern, $query, $matches);
            if(isset($matches[2][0])) {
                $mix_name = substr($matches[2][0],
                    strlen($mix_meta));
                $mix_name = str_replace("+", " ", $mix_name);
                break; // only one mix and can't be nested
            }
        }
        // $pattern is that of the meta word matched (or of mix: if none)
        $query = preg_replace($pattern, "", $query);
        if(isset($mix_name)) {
            if(is_numeric($mix_name)) {
                // a numeric mix name is taken to be the mix's timestamp
                $is_mix = true;
                $index_name = $mix_name;
            } else {
                $tmp = $this->crawlModel->getCrawlMixTimestamp(
                    $mix_name);
                if($tmp != false) {
                    $index_name = $tmp;
                    $is_mix = true;
                }
            }
        }
        if($is_mix) {
            $mix = $this->crawlModel->getCrawlMix($index_name);
            $query =
                $this->phraseModel->rewriteMixQuery($query, $mix);
        }

        $pattern = "/(\s)(raw:(\S)+)/";
        preg_match_all($pattern, $query, $matches);
        if(isset($matches[2][0])) {
            // any positive raw: value means ungrouped results with summaries
            $raw = (intval(substr($matches[2][0], 4)) > 0) ? 2 : 0;
        }
        $query = preg_replace($pattern, "", $query);

        /* Each no: flag is detected by checking whether stripping it changed
           the query. For no:cache the comparison used to be made against the
           untouched original query, which always differed because of the
           space prepended above, so the cache was never used.
         */
        $cache_query = $query;
        $query = preg_replace('/no:cache/', "", $query);
        $use_cache_if_possible = ($cache_query == $query);
        $network_work_query = $query;
        $query = preg_replace('/no:network/', "", $query);
        $use_network = ($network_work_query == $query);
        $guess_query = $query;
        $query = preg_replace('/no:guess/', "", $query);
        $guess_semantics = ($guess_query == $query);

        return array($query, $raw, $use_network,
            $use_cache_if_possible, $guess_semantics);
    }

    /**
     * Collects search results that have thumbnails into a single IMAGES
     * group and results that are feed items into a single FEEDS group.
     * Each group is stored at the output position where its first member
     * was encountered; all other results keep their relative order.
     *
     * @param array $pages an array of search result pages to group those
     *      pages with thumbs (or which are feed items) within
     * @return array $pages after the grouping has been done
     */
    function makeMediaGroups($pages)
    {
        // output index of each group; -1 until the group has been started
        $group_slots = array('IMAGES' => -1, 'FEEDS' => -1);
        $grouped = array();
        foreach($pages as $result) {
            if(isset($result[self::THUMB]) && $result[self::THUMB] != 'NULL') {
                $group = 'IMAGES';
            } else if(isset($result[self::IS_FEED]) &&
                $result[self::IS_FEED]) {
                $group = 'FEEDS';
            } else {
                // ordinary result, passed through untouched
                $grouped[] = $result;
                continue;
            }
            if($group_slots[$group] == -1) {
                // first member seen: reserve the next output position
                $group_slots[$group] = count($grouped);
                $grouped[$group_slots[$group]][$group] = array();
            }
            $grouped[$group_slots[$group]][$group][] = $result;
        }
        return $grouped;
    }


    /**
     * Given a page summary extract the words from it and pick out the ones
     * that best characterize the page, so they can be used to find related
     * documents. The notion of "relevant" is pretty weak: for each word the
     * ratio (number of occurrences in the crawl item) / (count of the word
     * reported by the crawl model) is computed and the $num words with the
     * largest ratio are kept. Words whose ratio is exactly 1 are discarded.
     *
     * @param array $crawl_item a page summary
     * @param int $num number of key phrases to return
     * @param int $crawl_time the timestamp of an index to use, if 0 then
     *      default used
     * @return array an array of most selective key phrases
     */
    function getTopPhrases($crawl_item, $num, $crawl_time = 0)
    {
        $queue_servers = $this->machineModel->getQueueServerUrls();
        if($crawl_time == 0) {
            $crawl_time = $this->crawlModel->getCurrentIndexDatabaseName();
        }
        $this->phraseModel->index_name = $crawl_time;
        $this->crawlModel->index_name = $crawl_time;

        $summary_text =
            PhraseParser::extractWordStringPageSummary($crawl_item);
        // fall back to the site's default locale if the summary has none
        $lang = DEFAULT_LOCALE;
        if(isset($crawl_item[self::LANG])) {
            $lang = $crawl_item[self::LANG];
        }
        $page_word_counts =
            PhraseParser::extractPhrasesAndCount($summary_text, $lang);

        $word_counts = $this->crawlModel->countWords(
            array_keys($page_word_counts), $queue_servers);

        $word_ratios = array();
        foreach($page_word_counts as $word => $count) {
            $ratio = 0;
            if(isset($word_counts[$word]) && $word_counts[$word] > 0) {
                $ratio = $count/$word_counts[$word];
            }
            /* discard cases where word only occurs in one doc as want
               to find related relevant documents */
            if($ratio == 1) {
                $ratio = 0;
            }
            $word_ratios[$word] = $ratio;
        }

        uasort($word_ratios, "greaterThan");

        return array_slice(array_keys($word_ratios), 0, $num);
    }

    /**
     * This method is responsible for parsing out the kind of query
     * from the raw query string
     *
     * The query in $_REQUEST['q'] is whitespace-normalized and split into
     * words. Any word that begins with the name of one of this controller's
     * activities followed by a colon (for example, related:some_url)
     * selects that activity and supplies its argument; the last such word
     * wins.
     *
     * @return array a triple (query string, activity name, activity
     *      argument); activity is "query" and argument "" if no activity
     *      word was found
     */
    function extractActivityQuery() {

        $raw_query = (isset($_REQUEST['q'])) ? $_REQUEST['q'] : "";
        $query = mb_ereg_replace("(\s)+", " ", $raw_query);
        /* operate on the already normalized $query; re-reading
           $_REQUEST['q'] here used to throw away the whitespace collapsing
           done on the previous line
         */
        $query = mb_ereg_replace("\s:\s", ":", $query);

        $query_parts = mb_split(" ", $query);
        $count = count($query_parts);

        $out_query = "";
        $activity = "query";
        $arg = "";
        $space = "";
        for($i = 0; $i < $count; $i++) {
            foreach($this->activities as $a_activity) {
                $in_pos = mb_strpos($query_parts[$i], "$a_activity:");

                if($in_pos !== false &&  $in_pos == 0) {
                    // words before an activity word are dropped
                    $out_query = "";
                    $activity = $a_activity;
                    $arg = mb_substr($query_parts[$i], strlen("$a_activity:"));
                    /* only moves on to the next activity name; the
                       activity word itself is still appended below
                     */
                    continue;
                }
            }
            $out_query .= $space.$query_parts[$i];
            $space = " ";
        }

        $activity_array = array($out_query, $activity, $arg);

        return $activity_array;
    }

    /**
     *  Used in rendering a cached web page to highlight the search terms.
     *  Recursively walks the children of $node and, in every text node,
     *  prefixes each whole-word occurrence of a search word with a marker
     *  string derived from the hash of that word.
     *
     *  @param object $node DOM object to mark html elements of
     *  @param array $words an array of words to be highlighted
     *  @param object $dom a DOM object for the whole document
     *  @return object the node modified to now have highlighting
     */
    function markChildren($node, $words, $dom)
    {

        if(!isset($node->childNodes->length)) {
            return $node;
        }
        /* children are replaced one-for-one, so the child count does not
           change while looping
         */
        for($k = 0; $k < $node->childNodes->length; $k++)  {
            $child = $node->childNodes->item($k);
            if(!$child) { break; }

            $clone = $child->cloneNode(true);

            if($clone->nodeType == XML_TEXT_NODE) {
                $text = $clone->textContent;

                foreach($words as $word) {
                    //only mark string of length at least 2
                    if(mb_strlen($word) > 1) {
                        /* words come from the user's query, so escape any
                           regex metacharacters (including the delimiter)
                         */
                        $word_pattern = "/\b".preg_quote($word, "/")."\b/i";
                        $mark_prefix = crawlHash($word);
                        // keep the marker itself from containing the word
                        if(stristr($mark_prefix, $word) !== false) {
                            $mark_prefix = preg_replace(
                                $word_pattern, '', $mark_prefix);
                        }
                        $marked_text = preg_replace(
                            $word_pattern, $mark_prefix.'$0', $text);
                        // preg_replace gives NULL on error; keep old text
                        if($marked_text !== NULL) {
                            $text = $marked_text;
                        }
                    }
                }

                $textNode =  $dom->createTextNode($text);
                $node->replaceChild($textNode, $child);
            } else {
                $clone = $this->markChildren($clone, $words, $dom);

                $node->replaceChild($clone, $child);

            }
        }

        return $node;
    }

    /**
     * Rewrites the link attributes found beneath a DOM node by passing them
     * through UrlParser::canonicalLink together with $url.
     *
     * For "a" and "link" children that have an href attribute the href is
     * rewritten and the child is then processed recursively. For "img",
     * "object" and "script" children that have a src attribute the src is
     * rewritten. Any other element child is processed recursively. Children
     * without a tag name (for example text) are left as they are. Changed
     * children are swapped in as modified copies of the originals.
     *
     * @param object $node dom node to fix links for
     * @param string $url url to use to canonicalize links
     * @return object updated dom node
     */
    function canonicalizeLinks($node, $url)
    {
        if(!isset($node->childNodes->length)) {
            return $node;
        }
        $href_tags = array("a", "link");
        $src_tags = array("img", "object", "script");
        for($index = 0; $node->childNodes->length; $index++) {
            $original = $node->childNodes->item($index);
            if(!$original) {
                break;
            }
            $copy = $original->cloneNode(true);
            if(!isset($copy->tagName)) {
                // nothing to rewrite on nodes that carry no tag name
                continue;
            }
            $tag = $copy->tagName;

            if(in_array($tag, $href_tags)) {
                if(!$copy->hasAttribute("href")) {
                    continue;
                }
                $copy->setAttribute("href", UrlParser::canonicalLink(
                    $copy->getAttribute("href"), $url, false));
                //an anchor might have an img tag within it so recurse
                $copy = $this->canonicalizeLinks($copy, $url);
                $node->replaceChild($copy, $original);
                continue;
            }

            if(in_array($tag, $src_tags)) {
                if($copy->hasAttribute("src")) {
                    $copy->setAttribute("src", UrlParser::canonicalLink(
                        $copy->getAttribute("src"), $url, false));
                    $node->replaceChild($copy, $original);
                }
                continue;
            }

            // every other element: descend and swap in the processed copy
            $copy = $this->canonicalizeLinks($copy, $url);
            if(is_object($copy)) {
                $node->replaceChild($copy, $original);
            }
        }
        return $node;
    }

    //*********BEGIN SEARCH API *********
    /**
     * Part of Yioop! Search API. Runs an ordinary search query through
     * processQuery() and hands back whatever that method returns.
     *
     * @param string $query this can be any query string that could be
     *      entered into the search bar on Yioop! (other than related: and
     *      cache: queries)
     * @param int $results_per_page number of results to return
     * @param int $limit first result to return from the ordered query results
     * @param int $grouping ($grouping == 0) normal grouping of links
     *      with associated document, ($grouping > 0)
     *      no grouping done on data. Any positive value is forwarded as 2,
     *      everything else as 0.
     *
     * @return array associative array of results for the query performed,
     *      or NULL if API_ACCESS is not enabled
     */
    public function queryRequest($query, $results_per_page, $limit = 0,
        $grouping = 0)
    {
        if(!API_ACCESS) {
            return NULL;
        }
        $group_mode = ($grouping > 0) ? 2 : 0;
        return $this->processQuery($query, "query", "", $results_per_page,
            $limit, $group_mode);
    }

    /**
     * Part of Yioop! Search API. Performs a related to a given url
     * search query and returns associative array of query results
     *
     * @param string $url to find related documents for
     * @param int $results_per_page number of results to return
     * @param int $limit first result to return from the ordered query results
     * @param int $crawl_time forwarded unchanged to processQuery();
     *      presumably the timestamp of the crawl to search, with 0 meaning
     *      the current one -- confirm against processQuery()
     * @param int $grouping ($grouping == 0) normal grouping of links
     *      with associated document, ($grouping > 0)
     *      no grouping done on data. Any positive value is forwarded as 2,
     *      everything else as 0.
     *
     * @return array associative array of results for the query performed,
     *      or NULL if API_ACCESS is not enabled
     */
    public function relatedRequest($url, $results_per_page, $limit = 0,
        $crawl_time = 0, $grouping = 0)
    {
        if(!API_ACCESS) {
            return NULL;
        }
        $grouping = ($grouping > 0 ) ? 2 : 0;
        /* the last argument used to be an undefined variable, so the
           caller's grouping choice was silently dropped */
        return $this->processQuery("", "related", $url, $results_per_page,
            $limit, $crawl_time, $grouping);
    }

    /**
     * Part of Yioop! Search API. Renders the cached version of a page and
     * returns it as a string instead of sending it to the client.
     *
     * The work is done by cacheRequestAndOutput(); its output is captured
     * with an output buffer.
     *
     * @param string $url to get cached page for
     * @param bool $highlight whether to put the search terms in the page
     *      in colored span tags.
     * @param string $terms space separated list of search terms
     * @param string $crawl_time timestamp of crawl to look for cached page in
     *
     * @return string with contents of cached page, or false if API_ACCESS
     *      is not enabled
     */
    public function cacheRequest($url, $highlight=true, $terms ="",
        $crawl_time = 0)
    {
        if(!API_ACCESS) {
            return false;
        }
        ob_start();
        $this->cacheRequestAndOutput($url, $highlight, $terms, $crawl_time);
        // fetch what was echoed and discard the buffer in one step
        return ob_get_clean();
    }
    //*********END SEARCH API *********

    /**
     * Used to get and render a cached web page
     *
     * Looks up the crawl item for $url, fetches the stored page, adds a
     * banner and a hidden block with the extracted summary data to it,
     * optionally highlights the query terms, and echoes the resulting HTML.
     * If USE_CACHE is set the rendered document is looked up in, and later
     * stored to, the global $CACHE under a key built from the arguments.
     * When the crawl item, the robot table entry, or the stored page cannot
     * be found the "nocache" view is displayed instead.
     *
     * @param string $url the url of the page to find the cached version of
     * @param bool $highlight whether or not to highlight the query terms in
     *      the cached page
     * @param string $terms the list of query terms
     * @param int $crawl_time the timestamp of the crawl to look up the cached
     *      page in; if 0 the value of
     *      crawlModel->getCurrentIndexDatabaseName() is used
     */
    function cacheRequestAndOutput($url, $highlight=true, $terms ="",
        $crawl_time = 0)
    {
        global $CACHE;

        $hash_key = crawlHash(
            $terms.$url.serialize($highlight).serialize($crawl_time));
        if(USE_CACHE) {
            if($newDoc = $CACHE->get($hash_key)) {
                echo $newDoc;
                return;
            }
        }
        $queue_servers = $this->machineModel->getQueueServerUrls();
        if($crawl_time == 0) {
            $crawl_time = $this->crawlModel->getCurrentIndexDatabaseName();
        }
        $this->phraseModel->index_name = $crawl_time;
        $this->crawlModel->index_name = $crawl_time;

        $data = array();

        $crawl_item = $this->crawlModel->getCrawlItem($url, $queue_servers);

        if(!$crawl_item ) {
            $this->displayView("nocache", $data);
            return;
        }
        $in_url = "";
        $image_flag = false;
        // an item with a thumbnail is rendered as an image, see below
        if(isset($crawl_item[self::THUMB])) {
            $image_flag = true;
            $inlinks = $this->phraseModel->getPhrasePageResults(
                "link:$url", 0,
                1, true, NULL, false, 0, $queue_servers);
            $in_url = isset($inlinks["PAGES"][0][self::URL]) ?
                $inlinks["PAGES"][0][self::URL] : "";
        }
        $check_fields = array(self::TITLE, self::DESCRIPTION, self::LINKS);
        foreach($check_fields as $field) {
            $crawl_item[$field] = (isset($crawl_item[$field])) ?
                $crawl_item[$field] : "";
        }
        $summary_string =
            tl('search_controller_extracted_title')."\n\n".
            wordwrap($crawl_item[self::TITLE], 80, "\n")."\n\n" .
            tl('search_controller_extracted_description')."\n\n".
            wordwrap($crawl_item[self::DESCRIPTION], 80, "\n")."\n\n".
            tl('search_controller_extracted_links')."\n\n".
            wordwrap(print_r($crawl_item[self::LINKS], true), 80, "\n");
        if(isset($crawl_item[self::ROBOT_PATHS])) {
            /* NOTE(review): the two assignments below replace the summary
               rather than append to it, so when both path lists exist only
               the disallowed one is shown -- confirm this is intended */
            if(isset($crawl_item[self::ROBOT_PATHS][self::ALLOWED_SITES])) {
                $summary_string =
                    tl('search_controller_extracted_allow_paths')."\n\n".
                    wordwrap(print_r($crawl_item[self::ROBOT_PATHS][
                        self::ALLOWED_SITES], true),  80, "\n");
            }
            if(isset($crawl_item[self::ROBOT_PATHS][self::DISALLOWED_SITES])) {
                $summary_string =
                    tl('search_controller_extracted_disallow_paths')."\n\n".
                    wordwrap(print_r($crawl_item[self::ROBOT_PATHS][
                        self::DISALLOWED_SITES], true),  80, "\n");
            }
            if(isset($crawl_item[self::CRAWL_DELAY])) {
                $summary_string =
                    tl('search_controller_crawl_delay')."\n\n".
                    wordwrap(print_r($crawl_item[self::CRAWL_DELAY], true),
                        80, "\n") ."\n\n". $summary_string;
            }
        }
        $robot_instance = $crawl_item[self::ROBOT_INSTANCE];
        $robot_table_name = CRAWL_DIR."/".self::robot_table_name;
        $robot_table = array();
        if(file_exists($robot_table_name)) {
            $robot_table = unserialize(file_get_contents($robot_table_name));
        }
        if(!isset($robot_table[$robot_instance])) {
            $data["SUMMARY_STRING"] = $summary_string;
            $this->displayView("nocache", $data);
            return;
        }

        // an instance name of the form "number-rest" carries an instance number
        $instance_parts = explode("-", $robot_instance);
        $instance_num = false;
        if(count($instance_parts) > 1) {
            $instance_num = intval($instance_parts[0]);
        }
        $machine = $robot_table[$robot_instance][0];
        $machine_uri = $robot_table[$robot_instance][1];
        $offset = $crawl_item[self::OFFSET];
        $cache_partition = $crawl_item[self::CACHE_PAGE_PARTITION];
        $cache_item = $this->crawlModel->getCacheFile($machine,
            $machine_uri, $cache_partition, $offset,  $crawl_time,
            $instance_num);
        if(!isset($cache_item[self::PAGE])) {
            $data["SUMMARY_STRING"] = $summary_string;
            $this->displayView("nocache", $data);
            return;
        }
        if( isset($crawl_item[self::ROBOT_METAS]) &&
                (in_array("NOARCHIVE", $crawl_item[self::ROBOT_METAS]) ||
                in_array("NONE", $crawl_item[self::ROBOT_METAS])) ) {
            // show a translated notice in place of the stored page
            $cache_file = "<div>".
                tl('search_controller_no_archive_page')."</div>";
        } else {
            $cache_file = $cache_item[self::PAGE];
        }
        if(!$image_flag) {
            // reduce the query to the plain words that may be highlighted
            $meta_words = $this->phraseModel->meta_words_list;
            foreach($meta_words as $meta_word) {
                $pattern = "/(\s)($meta_word(\S)+)/";
                $terms = preg_replace($pattern, "", $terms);
            }
            $terms = str_replace("'", " ", $terms);
            $terms = str_replace('"', " ", $terms);
            $terms = str_replace('\\', " ", $terms);
            $terms = str_replace('|', " ", $terms);
            $terms = $this->clean($terms, "string");

            $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $terms);
            $words = mb_split(" ",$phrase_string);
            if(!$highlight) {
                $words = array();
            }
        } else {
            // wrap the stored bytes in a small page using a data: url
            $type = $cache_item[self::TYPE];
            $loc_url = ($in_url == "") ? $url : $in_url;
            $cache_file = "<html><head><title>Yioop! Cache</title></head>".
                "<body><object onclick=\"document.location='$loc_url'\"".
                " data='data:$type;base64,".
                base64_encode($cache_file)."' type='$type' />";
            if($loc_url != $url) {
                $cache_file .= "<p>".tl('search_controller_original_page').
                    "<br /><a href='$loc_url'>$loc_url</a></p>";
            }
            $cache_file .= "</body></html>";
            $words = array();
        }
        $date = date ("F d Y H:i:s", $cache_item[self::TIMESTAMP]);

        $dom = new DOMDocument();

        // the xml declaration is a hack to make loadHTML read the page as UTF-8
        @$dom->loadHTML('<?xml encoding="UTF-8">' . $cache_file);
        foreach ($dom->childNodes as $item) {
            if ($item->nodeType == XML_PI_NODE) {
                $dom->removeChild($item); // remove hack
            }
        }
        $dom->encoding = "UTF-8"; // insert proper

        $head = $dom->getElementsByTagName('head')->item(0);
        if(is_object($head)) {
            // add a noindex nofollow robot directive to page
            $head_first_child = $head->firstChild;
            $robotNode = $dom->createElement('meta');
            $robotNode = $head->insertBefore($robotNode, $head_first_child);
            $robotNode->setAttribute("name", "ROBOTS");
            $robotNode->setAttribute("content", "NOINDEX,NOFOLLOW");
            $comment = $dom->createComment(
                tl('search_controller_cache_comment'));
            $comment = $head->insertBefore($comment, $robotNode);
            // make link and script links absolute
            $head = $this->canonicalizeLinks($head, $url);
        }
        $body =  $dom->getElementsByTagName('body')->item(0);
        if($body == false) {
            /* no body element was found: keep only a fixed set of tags and
               wrap what is left in a minimal document */
            $body_tags = "<frameset><frame><noscript><img><span><b><i><em>".
                "<strong><h1><h2><h3><h4><h5><h6><p><div>".
                "<a><table><tr><td><th><dt><dir><dl><dd>";
            $cache_file = strip_tags($cache_file, $body_tags);
            $cache_file = "<html><head><title>Yioop! Cache</title></head>".
                "<body>".$cache_file."</body></html>";
            $dom = new DOMDocument();
            @$dom->loadHTML($cache_file);
            $body =  $dom->getElementsByTagName('body')->item(0);
        }
        //make tags in body absolute
        $body = $this->canonicalizeLinks($body, $url);
        $first_child = $body->firstChild;

        // add information about what was extracted from page
        $text_align = (getLocaleDirection() == 'ltr') ? "left" : "right";
        $summaryNode = $dom->createElement('pre');
        $summaryNode = $body->insertBefore($summaryNode, $first_child);
        $summaryNode->setAttributeNS("","style", "border-color: black; ".
            "border-style:solid; border-width:3px; text-align:$text_align;".
            "padding: 5px; background-color: white; display:none;");
        $summaryNode->setAttributeNS("","id", "summary-page-id");


        if(isset($cache_item[self::HEADER])) {
            $summary_string = $cache_item[self::HEADER]."\n". $summary_string;
        }
        $textNode = $dom->createTextNode($summary_string);
        $summaryNode->appendChild($textNode);

        // script holding the visibility state toggled by the link below
        $scriptNode = $dom->createElement('script');
        $scriptNode = $body->insertBefore($scriptNode, $summaryNode);
        $textNode = $dom->createTextNode("var summaryShow = 'none';");
        $scriptNode->appendChild($textNode);

        $aDivNode = $dom->createElement('div');
        $aDivNode = $body->insertBefore($aDivNode, $summaryNode);
        $aDivNode->setAttributeNS("","style", "border-color: black; ".
            "border-style:solid; border-width:3px; margin-bottom:10px;".
            "padding: 5px; background-color: white; text-align:$text_align;");
        $divNode = $dom->createElement('div');

        $divNode = $body->insertBefore($divNode, $aDivNode);
        $divNode->setAttributeNS("","style", "border-color: black; ".
            "border-style:solid; border-width:3px;margin-bottom:10px;".
            "padding: 5px; background-color: white; text-align:$text_align;");

        // Z@url@Z is a placeholder swapped for a real link after saveHTML()
        $textNode = $dom->createTextNode(tl('search_controller_cached_version',
            "Z@url@Z", $date));
        $divNode->appendChild($textNode);

        $aNode = $dom->createElement("a");
        $aTextNode = $dom->createTextNode(
            tl('search_controller_summary_data'));
        $aNode->setAttributeNS("","onclick", "javascript:".
            "summaryShow=(summaryShow!='block')?'block':'none';".
            "elt=document.getElementById('summary-page-id');".
            "elt.style.display=summaryShow;");
        $aNode->setAttributeNS("","style", "text-decoration: underline; ".
            "cursor: pointer");

        $aNode->appendChild($aTextNode);

        $aNode = $aDivNode->appendChild($aNode);

        // put a text marker in front of every word occurrence to highlight
        $body = $this->markChildren($body, $words, $dom);

        $newDoc = $dom->saveHTML();
        /* NOTE(review): $url is placed into the markup without escaping;
           confirm callers pass an already sanitized value */
        $url = "<a href='$url'>$url</a>";
        $newDoc = str_replace("Z@url@Z", $url, $newDoc);
        $colors = array("yellow", "orange", "gray", "cyan");
        $color_count = count($colors);

        // turn the markers left by markChildren() into colored spans
        $i = 0;
        foreach($words as $word) {
            //only mark string of length at least 2
            if(mb_strlen($word) > 1) {
                $quoted_word = preg_quote($word, "/");
                $mark_prefix = crawlHash($word);
                if(stristr($mark_prefix, $word) !== false) {
                    /* the marker has to be derived exactly the way
                       markChildren() derives it (word boundary match),
                       otherwise it is not found again and stays in the page */
                    $mark_prefix = preg_replace(
                        "/\b$quoted_word\b/i", '', $mark_prefix);
                }
                $quoted_prefix = preg_quote($mark_prefix, "/");
                $newDoc = preg_replace("/$quoted_prefix$quoted_word/i",
                    '<span style="background-color:'.
                    $colors[$i].'">$0</span>', $newDoc);
                $i = ($i + 1) % $color_count;
                // the marker itself must not be visible in the output
                $newDoc = preg_replace("/$quoted_prefix/", "", $newDoc);
            }
        }

        if(USE_CACHE) {
            $CACHE->set($hash_key, $newDoc);
        }

        echo $newDoc;
        return;
    }

}
?>
ViewGit