Last commit for models/source_model.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]
First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2014  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage model
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2014
 * @filesource
 */
// Refuse to run unless BASE_DIR has already been defined (presumably by the
// script that includes this file); a direct request just gets "BAD REQUEST".
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/** Loads the base class */
require_once BASE_DIR."/models/model.php";
/** IndexShards used to store feed indexes*/
require_once BASE_DIR."/lib/index_shard.php";
/** For text manipulation of feeds*/
require_once BASE_DIR."/lib/phrase_parser.php";
/**
 * Used to manage data related to video, news, and other search sources
 * Also, used to manage data about available subsearches seen in SearchView
 *
 * @author Chris Pollett
 * @package seek_quarry
 * @subpackage model
 */
class SourceModel extends Model
{
    /** Maximum number of feeds to download in one try */
    const MAX_FEEDS_ONE_GO = 100;
    /**
     *  Selects the database table that a getRows call on this model should
     *  read. SourceModel serves both media sources and subsearches, so the
     *  table is either MEDIA_SOURCE or SUBSEARCH; $args says which one.
     *
     *  @param string $args pass "SUBSEARCH" to select the SUBSEARCH table;
     *      any other string (or the default NULL) selects MEDIA_SOURCE
     *  @return string name of the table to use
     */
    function fromCallback($args = NULL)
    {
        return ($args == "SUBSEARCH") ? "SUBSEARCH" : "MEDIA_SOURCE";
    }
    /**
     *  Returns the rows of the MEDIA_SOURCE table (video sites, rss feeds,
     *  etc), optionally restricted by type and/or to those sources that
     *  have no stored feed items.
     *
     *  @param string $source_type the particular kind of media source to
     *      return, for example, video; the empty string means any type
     *  @param bool $has_no_feed_items if true returns only those sources
     *      for which no FEED_ITEM row has SOURCE_NAME equal to the source's
     *      NAME
     *  @return array list of associative arrays, one per matching
     *      MEDIA_SOURCE row; empty if nothing matched or the query failed
     */
    function getMediaSources($source_type = "", $has_no_feed_items = false)
    {
        $db = $this->db;
        $conditions = array();
        $params = array();
        if($source_type != "") {
            $conditions[] = "M.TYPE = :type";
            $params[":type"] = $source_type;
        }
        if($has_no_feed_items) {
            $conditions[] = "NOT EXISTS
                (SELECT * FROM FEED_ITEM F
                WHERE F.SOURCE_NAME = M.NAME)";
        }
        $sql = "SELECT M.* FROM MEDIA_SOURCE M";
        if($conditions) {
            // collecting the conditions first avoids hand-tracking whether
            // WHERE or AND is the right connective
            $sql .= " WHERE " . implode(" AND ", $conditions);
        }
        $sources = array();
        $result = $db->execute($sql, $params);
        if(!$result) {
            // a failed query must not be handed to fetchArray
            return $sources;
        }
        while($row = $db->fetchArray($result)) {
            $sources[] = $row;
        }
        return $sources;
    }
    /**
     *  Looks up a single media source by the value of its TIMESTAMP column
     *
     *  @param string $timestamp TIMESTAMP value of the MEDIA_SOURCE row to
     *      fetch
     *  @return mixed false if the query could not be executed; otherwise,
     *      whatever the database layer's fetchArray yields for the first
     *      matching MEDIA_SOURCE row
     */
    function getMediaSource($timestamp)
    {
        $query = "SELECT * FROM MEDIA_SOURCE WHERE TIMESTAMP = ?";
        $statement = $this->db->execute($query, array($timestamp));
        return ($statement) ? $this->db->fetchArray($statement) : false;
    }
    /**
     *  Inserts a new video, rss, or other media source row into the
     *  MEDIA_SOURCE table. The current UNIX time is stored as the row's
     *  first column.
     *
     *  @param string $name name of the media source
     *  @param string $source_type whether video, rss, etc
     *  @param string $source_url url pattern of the resource (video) or the
     *      actual resource (rss). For a pattern, put {} at the location in
     *      the url where the name of the particular video should go, as in
     *      http://www.youtube.com/watch?v={}&
     *      (anything after & is ignored, so what is between = and & will be
     *      matched as the name of a video)
     *  @param string $thumb_url pattern saying where to get thumbnails for
     *      videos based on the match of $source_url, for example,
     *      http://img.youtube.com/vi/{}/2.jpg
     *  @param string $language the locale tag for the media source (rss)
     */
    function addMediaSource($name, $source_type, $source_url, $thumb_url,
        $language = DEFAULT_LOCALE)
    {
        $row = array(time(), $name, $source_type, $source_url, $thumb_url,
            $language);
        $this->db->execute("INSERT INTO MEDIA_SOURCE VALUES (?,?,?,?,?,?)",
            $row);
    }
    /**
     *  Used to update the fields stored in a MEDIA_SOURCE row according to
     *  an array holding new values. The row is selected by the array's
     *  TIMESTAMP entry; the TIMESTAMP and NAME entries are never written.
     *
     *  Nothing is executed if TIMESTAMP is missing or if no updatable
     *  fields remain. Keys that are not plain column identifiers (letters,
     *  digits, underscore) are skipped, because column names cannot be
     *  bound as query parameters and have to be written into the SQL text.
     *
     *  @param array $source_info updated values for a MEDIA_SOURCE row as
     *      column name => new value pairs, including TIMESTAMP
     */
    function updateMediaSource($source_info)
    {
        if(!isset($source_info['TIMESTAMP'])) {
            return;
        }
        $timestamp = $source_info['TIMESTAMP'];
        unset($source_info['TIMESTAMP'], $source_info['NAME']);
        $assignments = array();
        $params = array();
        foreach($source_info as $field => $value) {
            if(!preg_match('/^[A-Za-z_][A-Za-z0-9_]*$/', $field)) {
                continue;
            }
            $assignments[] = "$field=?";
            $params[] = $value;
        }
        if(!$assignments) {
            // "UPDATE ... SET  WHERE ..." would be malformed SQL
            return;
        }
        $sql = "UPDATE MEDIA_SOURCE SET " . implode(", ", $assignments) .
            " WHERE TIMESTAMP=?";
        $params[] = $timestamp;
        $this->db->execute($sql, $params);
    }
    /**
     * Deletes the media source whose id is the given timestamp. If the
     * source is of type rss and has a non-empty NAME, the FEED_ITEM rows
     * whose SOURCE_NAME equals that NAME are deleted first.
     *
     * @param int $timestamp of media source to be deleted
     */
    function deleteMediaSource($timestamp)
    {
        $db = $this->db;
        // bind $timestamp rather than splicing it into the SQL text
        $sql = "SELECT * FROM MEDIA_SOURCE WHERE TIMESTAMP=?";
        $result = $db->execute($sql, array($timestamp));
        if($result) {
            $row = $db->fetchArray($result);
            if(isset($row['TYPE']) && $row['TYPE'] == "rss" &&
                isset($row['NAME']) && $row['NAME'] != "") {
                $sql = "DELETE FROM FEED_ITEM WHERE SOURCE_NAME=?";
                $db->execute($sql, array($row['NAME']));
            }
        }
        $sql = "DELETE FROM MEDIA_SOURCE WHERE TIMESTAMP=?";
        $db->execute($sql, array($timestamp));
    }
    /**
     * Returns a list of the subsearches used by the current Yioop instance
     * including their names translated to the current locale. When no
     * translation exists for the current locale, the name comes from
     * translateDb called with DEFAULT_LOCALE.
     *
     * @return array list of associative arrays, one per subsearch, with
     *     keys LOCALE_STRING, FOLDER_NAME, PER_PAGE, INDEX_IDENTIFIER,
     *     TRANSLATION_ID, and SUBSEARCH_NAME (the name in the locale);
     *     empty if the subsearch query failed
     */
    function getSubsearches()
    {
        $subsearches = array();
        $db = $this->db;
        $locale_tag = getLocaleTag();
        $sql = "SELECT LOCALE_ID FROM LOCALE ".
            "WHERE LOCALE_TAG = ? " . $db->limitOffset(1);
        $result = $db->execute($sql, array($locale_tag));
        $row = ($result) ? $db->fetchArray($result) : false;
        // NULL means the current locale tag has no row in LOCALE
        $locale_id = (isset($row['LOCALE_ID'])) ? $row['LOCALE_ID'] : NULL;
        $sql = "SELECT S.LOCALE_STRING AS LOCALE_STRING, ".
            "S.FOLDER_NAME AS FOLDER_NAME, ".
            " S.PER_PAGE AS PER_PAGE, ".
            " S.INDEX_IDENTIFIER AS INDEX_IDENTIFIER, ".
            " T.TRANSLATION_ID AS TRANSLATION_ID FROM ".
            " SUBSEARCH S, TRANSLATION T WHERE  ".
            " T.IDENTIFIER_STRING = S.LOCALE_STRING";
        $result = $db->execute($sql);
        if(!$result) {
            return $subsearches;
        }
        $sub_sql = "SELECT TRANSLATION AS SUBSEARCH_NAME ".
            "FROM TRANSLATION_LOCALE ".
            " WHERE TRANSLATION_ID=? AND LOCALE_ID=? " . $db->limitOffset(1);
            // maybe do left join at some point
        while($subsearch = $db->fetchArray($result)) {
            $translate = false;
            if($locale_id !== NULL) {
                // without a locale id the lookup could not match anything
                $result_sub = $db->execute($sub_sql,
                    array($subsearch["TRANSLATION_ID"], $locale_id));
                if($result_sub) {
                    $translate = $db->fetchArray($result_sub);
                }
            }
            if($translate) {
                $subsearch['SUBSEARCH_NAME'] = $translate['SUBSEARCH_NAME'];
            } else {
                $subsearch['SUBSEARCH_NAME'] = $this->translateDb(
                    $subsearch['LOCALE_STRING'], DEFAULT_LOCALE);
            }
            $subsearches[] = $subsearch;
        }
        return $subsearches;
    }
    /**
     *  Looks up a single subsearch by its folder name
     *
     *  @param string $folder_name FOLDER_NAME value of the SUBSEARCH row to
     *      fetch
     *  @return mixed false if the query could not be executed; otherwise,
     *      whatever the database layer's fetchArray yields for the first
     *      matching SUBSEARCH row
     */
    function getSubsearch($folder_name)
    {
        $query = "SELECT * FROM SUBSEARCH WHERE FOLDER_NAME = ?";
        $statement = $this->db->execute($query, array($folder_name));
        return ($statement) ? $this->db->fetchArray($statement) : false;
    }
    /**
     * Adds a new subsearch to the list of subsearches. These are displayed
     * at the top of the Yioop search pages. Besides the SUBSEARCH row, a
     * TRANSLATION row is inserted for the string
     * "db_subsearch_" . $folder_name, using the current UNIX time as its
     * first column.
     *
     * @param string $folder_name name of subsearch in terms of urls
     *      (not translated name that appears in the subsearch bar)
     * @param string $index_identifier timestamp of crawl or mix to be
     *      used for results of subsearch
     * @param int $per_page number of search results per page when this
     *      subsearch is used
     */
    function addSubsearch($folder_name, $index_identifier, $per_page)
    {
        $identifier = "db_subsearch_".$folder_name;
        $subsearch_row = array($identifier, $folder_name, $index_identifier,
            $per_page);
        $this->db->execute("INSERT INTO SUBSEARCH VALUES (?, ?, ?, ?)",
            $subsearch_row);
        $translation_row = array(time(), $identifier);
        $this->db->execute("INSERT INTO TRANSLATION VALUES (?, ?)",
            $translation_row);
    }
    /**
     *  Used to update the fields stored in a SUBSEARCH row according to
     *  an array holding new values. The row is selected by the array's
     *  FOLDER_NAME entry, which itself is never written.
     *
     *  Nothing is executed if FOLDER_NAME is missing or if no updatable
     *  fields remain. Keys that are not plain column identifiers (letters,
     *  digits, underscore) are skipped, because column names cannot be
     *  bound as query parameters and have to be written into the SQL text.
     *
     *  @param array $search_info updated values for a SUBSEARCH row as
     *      column name => new value pairs, including FOLDER_NAME
     */
    function updateSubsearch($search_info)
    {
        if(!isset($search_info['FOLDER_NAME'])) {
            return;
        }
        $folder_name = $search_info['FOLDER_NAME'];
        unset($search_info['FOLDER_NAME']);
        $assignments = array();
        $params = array();
        foreach($search_info as $field => $value) {
            if(!preg_match('/^[A-Za-z_][A-Za-z0-9_]*$/', $field)) {
                continue;
            }
            $assignments[] = "$field=?";
            $params[] = $value;
        }
        if(!$assignments) {
            // "UPDATE ... SET  WHERE ..." would be malformed SQL
            return;
        }
        $sql = "UPDATE SUBSEARCH SET " . implode(", ", $assignments) .
            " WHERE FOLDER_NAME=?";
        $params[] = $folder_name;
        $this->db->execute($sql, $params);
    }
    /**
     * Deletes a subsearch from the subsearch table and removes its
     * associated translations: the TRANSLATION_LOCALE rows for its
     * translation id (when one is found) and the TRANSLATION row for the
     * string "db_subsearch_" . $folder_name.
     *
     * @param string $folder_name of subsearch to delete
     */
    function deleteSubsearch($folder_name)
    {
        $db = $this->db;
        $locale_string = "db_subsearch_".$folder_name;
        $sql = "SELECT * FROM TRANSLATION WHERE IDENTIFIER_STRING = ?";
        $result = $db->execute($sql, array($locale_string));
        // test truthiness: isset() is also true for a false result
        if($result) {
            $row = $db->fetchArray($result);
            if(isset($row["TRANSLATION_ID"])) {
                $sql = "DELETE FROM TRANSLATION_LOCALE WHERE ".
                    "TRANSLATION_ID=?";
                $db->execute($sql, array($row["TRANSLATION_ID"]));
            }
        }
        $sql = "DELETE FROM SUBSEARCH WHERE FOLDER_NAME=?";
        $db->execute($sql, array($folder_name));

        $sql = "DELETE FROM TRANSLATION WHERE IDENTIFIER_STRING = ?";
        $db->execute($sql, array($locale_string));
    }
    /**
     *  Downloads a batch of rss feed sources, checks which of their items
     *  are not in the database, and adds them. This method does not update
     *  the inverted index shard.
     *
     *  At most MAX_FEEDS_ONE_GO feeds are handled per call. The feeds are
     *  split into bins of that size and the current hour of the day picks
     *  the bin to process. If a feed has no LANGUAGE stored, the feed's
     *  first language element (if any) is used and written back to
     *  MEDIA_SOURCE. Feeds without item elements are read as Atom
     *  (entry elements).
     *
     *  @param int $age how many seconds old records should be ignored
     *  @return bool whether feed item update was successful (currently
     *      always true)
     */
    function updateFeedItems($age = self::ONE_WEEK)
    {
        $db = $this->db;
        $time = time();
        $feeds_one_go = self::MAX_FEEDS_ONE_GO;
        $sql = "SELECT COUNT(*) AS CNT FROM MEDIA_SOURCE WHERE TYPE='rss'";
        $result = $db->execute($sql);
        $row = ($result) ? $db->fetchArray($result) : false;
        $num_feeds = (isset($row['CNT'])) ? $row['CNT'] : 0;
        /* ceil rather than floor + 1 so that an exact multiple of
           $feeds_one_go does not create an extra, always empty bin */
        $num_bins = max(1, (int)ceil($num_feeds/$feeds_one_go));
        $hour = date('H', $time);
        $current_bin = $hour % $num_bins;
        $limit = $db->limitOffset($current_bin * $feeds_one_go,
            $feeds_one_go);
        $sql = "SELECT * FROM MEDIA_SOURCE WHERE TYPE='rss' $limit";
        $result = $db->execute($sql);
        $feeds = array();
        if($result) {
            while($feed_row = $db->fetchArray($result)) {
                $feeds[] = $feed_row;
            }
        }
        $feeds = FetchUrl::getPages($feeds, false, 0, NULL, "SOURCE_URL",
            CrawlConstants::PAGE, true, NULL, true);
        $sql = "UPDATE MEDIA_SOURCE SET LANGUAGE=? WHERE TIMESTAMP=?";
        foreach($feeds as $feed) {
            $dom = new DOMDocument();
            $page = (isset($feed[CrawlConstants::PAGE])) ?
                $feed[CrawlConstants::PAGE] : "";
            if($page != "") {
                // nothing to parse for an empty download; the feed then
                // just reports 0 items below
                @$dom->loadXML($page);
            }
            $lang = "";
            if(isset($feed["LANGUAGE"]) && $feed["LANGUAGE"] != "") {
                $lang = $feed["LANGUAGE"];
            } else {
                $languages = $dom->getElementsByTagName('language');
                if($languages && is_object($languages) &&
                    is_object($languages->item(0))) {
                    $lang = $languages->item(0)->textContent;
                    $db->execute($sql, array($lang, $feed['TIMESTAMP']));
                }
            }

            $nodes = $dom->getElementsByTagName('item');
            $rss_elements = array("title" => "title",
                "description" => "description", "link" =>"link",
                "guid" => "guid", "pubDate" => "pubDate");
            if($nodes->length == 0) {
                // maybe we're dealing with atom rather than rss
                $nodes = $dom->getElementsByTagName('entry');
                $rss_elements = array(
                    "title" => "title", "description" => "summary",
                    "link" => "link", "guid" => "id", "pubDate" => "updated");
            }
            crawlLog("Updating {$feed['NAME']}...");
            $num_added = 0;
            foreach($nodes as $node) {
                $item = array();
                foreach($rss_elements as $db_element => $feed_element) {
                    crawlTimeoutLog("..still adding feed items to index.");
                    $tag_node = $node->getElementsByTagName(
                            $feed_element)->item(0);
                    $element_text = "";
                    if(is_object($tag_node)) {
                        $element_text = $tag_node->nodeValue;
                        /* a link with no text content keeps its url in
                           the href attribute; only ask for it when the
                           element actually exists */
                        if($feed_element == "link" && $element_text == "") {
                            $element_text = $tag_node->getAttribute("href");
                        }
                    }
                    $item[$db_element] = strip_tags($element_text);
                }
                $did_add = $this->addFeedItemIfNew($item, $feed['NAME'], $lang,
                    $age);
                if($did_add) {
                    $num_added++;
                }
            }
            crawlLog("...added $num_added news items.");
        }
        return true;
    }
    /**
     * Copies all feed items newer than $age to a new shard, replaces the
     * old index shard with it, and then deletes database entries older
     * than $age.
     *
     * NOTE(review): an earlier description of this method said it returns
     * false when the job would take more than max_execution_time/2, so
     * that an additional job can be scheduled. No such timing check exists
     * in this method.
     *
     * @param int $age how many seconds old records should be deleted
     * @return bool false if there are no rss media sources (nothing is
     *     rebuilt or deleted in that case); true once the shard has been
     *     rebuilt and the old records deleted
     */
    function rebuildFeedShard($age)
    {
        $time = time();
        $feed_shard_name = WORK_DIRECTORY."/feeds/index";
        $prune_shard_name = WORK_DIRECTORY."/feeds/prune_index";
        @unlink($prune_shard_name);
        $prune_shard =  new IndexShard($prune_shard_name);
        $too_old = $time - $age;
        $pre_feeds = $this->getMediaSources("rss");
        if(!$pre_feeds) { return false; }
        // index the feeds by name so an item's language can be looked up
        $feeds = array();
        foreach($pre_feeds as $pre_feed) {
            if(!isset($pre_feed['NAME'])) continue;
            $feeds[$pre_feed['NAME']] = $pre_feed;
        }
        $db = $this->db;
        // we now rebuild the inverted index with the remaining items
        $sql = "SELECT * FROM FEED_ITEM ".
            "WHERE PUBDATE >= ? ".
            "ORDER BY PUBDATE DESC";
        $result = $db->execute($sql, array($too_old));
        if($result) {
            crawlLog("..still deleting. Making new index of non-pruned items.");
            $i = 0;
            while($item = $db->fetchArray($result)) {
                crawlTimeoutLog("..have added %s non-pruned items to index.",
                    $i);
                $i++;
                if(!isset($item['SOURCE_NAME'])) { continue; }
                $source_name = $item['SOURCE_NAME'];
                if(isset($feeds[$source_name])) {
                    $lang = $feeds[$source_name]['LANGUAGE'];
                } else {
                    $lang = "";
                }
                $phrase_string = $item["TITLE"] . " ". $item["DESCRIPTION"];
                $word_lists = PhraseParser::extractPhrasesInLists(
                    $phrase_string, $lang);
                $raw_guid = unbase64Hash($item["GUID"]);
                $doc_keys = crawlHash($item["LINK"], true) .
                    $raw_guid."d". substr(crawlHash(
                    UrlParser::getHost($item["LINK"])."/",true), 1);
                $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
                    $source_name, $item["GUID"]);

                $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'],
                    $word_lists, $meta_ids, PhraseParser::$materialized_metas,
                    true, false);
            }
        }
        $prune_shard->save();
        // swap the freshly built shard in place of the active one
        @chmod($prune_shard_name, 0777);
        @chmod($feed_shard_name, 0777);
        @rename($prune_shard_name, $feed_shard_name);
        @chmod($feed_shard_name, 0777);
        $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
        $db->execute($sql, array($too_old));
        return true;
    }
    /**
     * Adds $item to the FEED_ITEM table in db if it isn't already there.
     * Items are identified by the crawlHash of their guid, or of their
     * link when they carry no guid.
     *
     * @param array $item data from a single news feed item; must have
     *  link, title, and description entries, and may have guid and
     *  pubDate entries. A missing, empty, or unparsable pubDate is treated
     *  as the current time
     * @param string $source_name string name of the news feed $item was found
     *  on
     * @param string $lang locale-tag of the news feed (not used by this
     *  method)
     * @param int $age how many seconds old records should be ignored
     * @return bool whether an item was added; false if the item is
     *  incomplete, too old, already stored, or a database query failed
     */
    function addFeedItemIfNew($item, $source_name, $lang, $age)
    {
        if(!isset($item["link"]) || !isset($item["title"]) ||
            !isset($item["description"])) {
            return false;
        }
        $now = time();
        $pub_date = false;
        if(isset($item["pubDate"]) && $item["pubDate"] != "") {
            $pub_date = strtotime($item["pubDate"]);
        }
        /* strtotime gives false for a date it cannot parse; fall back to
           now instead of letting false make the item look decades old */
        $item["pubDate"] = ($pub_date === false) ? $now : $pub_date;
        if($now - $item["pubDate"] > $age) {
            return false;
        }
        if(!isset($item["guid"]) || $item["guid"] == "") {
            $item["guid"] = crawlHash($item["link"]);
        } else {
            $item["guid"] = crawlHash($item["guid"]);
        }
        $db = $this->db;
        $sql = "SELECT COUNT(*) AS NUMBER FROM FEED_ITEM WHERE GUID = ?";
        $result = $db->execute($sql, array($item["guid"]));
        if(!$result) {
            // nothing was inserted, so do not report the item as added
            return false;
        }
        $row = $db->fetchArray($result);
        if(isset($row["NUMBER"]) && $row["NUMBER"] > 0) {
            return false;
        }
        $sql = "INSERT INTO FEED_ITEM VALUES (?, ?, ?, ?, ?, ?)";
        $result = $db->execute($sql, array($item['guid'], $item['title'],
            $item['link'], $item['description'], $item['pubDate'],
            $source_name));
        return ($result) ? true : false;
    }
    /**
     *  Builds the list of meta words for an RSS feed item: the media:news
     *  markers, the item's guid, its publication date at every granularity
     *  from year down to second, and its language.
     *
     *  @param string $lang the locale_tag of the feed item; if empty no
     *      lang: meta words are produced
     *  @param int $pubdate UNIX timestamp publication date of item
     *  @param string $source_name the name of the news feed
     *  @param string $guid the guid of the news item
     *
     *  @return array $meta_ids meta words found
     */
    function calculateMetas($lang, $pubdate, $source_name, $guid)
    {
        $meta_ids = array(
            "media:news",
            "media:news:".urlencode($source_name),
            "guid:".strtolower($guid)
        );
        // one date: meta word per granularity, coarsest first
        $date_formats = array('Y', 'Y-m', 'Y-m-d', 'Y-m-d-H', 'Y-m-d-H-i',
            'Y-m-d-H-i-s');
        foreach($date_formats as $date_format) {
            $meta_ids[] = 'date:'.date($date_format, $pubdate);
        }
        if($lang == "") {
            return $meta_ids;
        }
        $pieces = explode("-", $lang);
        $meta_ids[] = 'lang:'.$pieces[0];
        if(count($pieces) > 1) {
            // tag had a region part, so also record the full tag
            $meta_ids[] = 'lang:'.$lang;
        }
        return $meta_ids;
    }
}
 ?>
ViewGit