Last commit for src/library/media_jobs/DescriptionUpdateJob.php: 76157bf30dcae2fbb12c780e275e953642e2ebee

Tweaks to DescriptionUpdateJob

Chris Pollett [2024-04-21 00:Apr:st]
Tweaks to DescriptionUpdateJob
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2022  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Parth Patel (modified to better use xpaths by Chris Pollett)
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2022
 * @filesource
 */
namespace seekquarry\yioop\library\media_jobs;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\FetchUrl;
use seekquarry\yioop\library\UrlParser;

/**
 * A media job to periodically update descriptions of Wiki resources
 * using Description Search Sources
 */
class DescriptionUpdateJob extends MediaJob
{
    /**
     * Time in current epoch when description last updated
     * @var int
     */
    public $update_time;
    /**
     * Datasource object used to run db queries that fetch the description
     * search sources
     * @var object
     */
    public $db;
    /**
     * Thumb folder paths read from NEEDS_DESCRIPTION_FILE that still have
     * to be processed by the current run of doTasks(). Declared here because
     * doTasks() assigns it and undeclared (dynamic) properties are deprecated
     * as of PHP 8.2
     * @var array
     */
    public $thumb_folder_paths = [];
    /**
     * File to tell DescriptionUpdateJob that a wiki resource needs a
     * description
     */
    const NEEDS_DESCRIPTION_FILE = C\APP_DIR .
        "/resources/needs_descriptions.txt";
    /**
     * Initializes the last update time to far in the past so, description will
     * get immediately updated. Sets up connection to DB to fetch description
     * search sources
     */
    public function init()
    {
        $this->update_time = 0;
        $this->name_server_does_client_tasks = true;
        $this->name_server_does_client_tasks_only = true;
        $db_class = C\NS_DATASOURCES . ucfirst(C\DBMS). "Manager";
        $this->db = new $db_class();
        $this->db->connect();
        C\nsconddefine("DESCRIPTION_UPDATE_INTERVAL", C\ONE_HOUR);
    }
    /**
     * Only update if more than C\DESCRIPTION_UPDATE_INTERVAL seconds have
     * passed since the last update and the NEEDS_DESCRIPTION_FILE exists and
     * is non-empty. When both conditions hold the last update time is set
     * to the current time.
     *
     * @return bool whether a description update should be performed now
     */
    public function checkPrerequisites()
    {
        $time = time();
        $delta = $time - $this->update_time;
        if ($delta <= C\DESCRIPTION_UPDATE_INTERVAL) {
            L\crawlLog("---- Time since last update not exceeded, " .
                "skipping description update ----");
            return false;
        }
        /* this job can live in a long running process, so make sure the
           size we look at is not a cached stat from an earlier check
         */
        clearstatcache(true, self::NEEDS_DESCRIPTION_FILE);
        if (!file_exists(self::NEEDS_DESCRIPTION_FILE) ||
            filesize(self::NEEDS_DESCRIPTION_FILE) <= 0) {
            L\crawlLog("---- No resources need a description, " .
                "skipping description update ----");
            return false;
        }
        $this->update_time = $time;
        L\crawlLog("---- Performing resources description update ----");
        return true;
    }
    /**
     * Get the description search sources from the local database and use
     * those to run the same task as in the distributed setting. If the
     * query for sources fails nothing is done, so that the list of resources
     * needing descriptions is left intact for a later attempt.
     */
    public function nondistributedTasks()
    {
        $db = $this->db;
        $sql = "SELECT * FROM MEDIA_SOURCE WHERE TYPE='description_source'";
        $result = $db->execute($sql);
        if (!$result) {
            L\crawlLog("---- Could not read description sources, " .
                "skipping description update ----");
            return;
        }
        $sources = [];
        while ($source = $db->fetchArray($result)) {
            $this->parseDescriptionAuxInfo($source);
            $sources[] = $source;
        }
        $this->tasks = $sources;
        $this->doTasks($sources);
    }
    /**
     * Parses out the components of the auxiliary field of a description
     * source. The AUX_INFO field is entity decoded and split on "###";
     * part 0 replaces AUX_INFO, parts 1, 2, 3 become ITEM_XPATH, TITLE_XPATH,
     * URL_XPATH and part 7 becomes TEST_DATA. Missing parts are set to
     * the empty string rather than triggering undefined offset warnings.
     *
     * @param array &$source associative array of data about one particular
     *  description source
     */
    public static function parseDescriptionAuxInfo(&$source)
    {
        $aux_info = isset($source['AUX_INFO']) ? $source['AUX_INFO'] : "";
        $aux_parts = array_pad(explode("###", html_entity_decode(
            (string)$aux_info, ENT_QUOTES)), 8, "");
        list($source['AUX_INFO'], $source['ITEM_XPATH'],
            $source['TITLE_XPATH'], $source['URL_XPATH'],
            , , , $source['TEST_DATA']) = $aux_parts;
    }
    /**
     * For each resource requiring description update, use the description
     * search sources to find information. Folders are processed in the order
     * listed in NEEDS_DESCRIPTION_FILE until either all are done or the
     * runtime limit (C\ONE_HOUR since the last update time) is reached.
     * Whatever is still pending is written back to NEEDS_DESCRIPTION_FILE.
     *
     * @param array $tasks array of description sources
     */
    public function doTasks($tasks)
    {
        $this->thumb_folder_paths = array_values(array_unique(
            $this->readListFile(self::NEEDS_DESCRIPTION_FILE)));
        if (!is_array($tasks)) {
            L\crawlLog(
                "---- This media updater is NOT responsible for " .
                "any description update! ----");
            return;
        }
        L\crawlLog("---- This media updater is responsible for " .
            "the description updates ----");
        $thumb_folder_paths = $this->thumb_folder_paths;
        foreach ($thumb_folder_paths as $thumb_folder_path) {
            if ($this->runtimeLimitExceeded()) {
                L\crawlLog("---- Runtime limit exceeded, saving the current " .
                    "state and yielding the processor ----");
                break;
            }
            $this->updateResourcesDescription($tasks, $thumb_folder_path);
            array_shift($this->thumb_folder_paths);
            /* updateResourcesDescription() stops early when the runtime
               limit is hit and leaves the unprocessed resources in the
               folder's needs_description.txt. Such a folder must stay
               queued or its remaining resources would never be revisited.
             */
            if ($this->folderHasPendingResources($thumb_folder_path)) {
                $this->thumb_folder_paths[] = $thumb_folder_path;
            }
        }
        file_put_contents(self::NEEDS_DESCRIPTION_FILE, implode(PHP_EOL,
            $this->thumb_folder_paths));
    }
    /**
     * Updates/finds descriptions for resources listed in a
     * needs_description.txt in a wiki pages thumb subfolder.
     * It does this by iterating over all configured description search sources
     * whose CATEGORY matches the resource's mime type until details are found.
     * It then saves the description in a file in the given resource thumb
     * folder path. In test mode the resources are instead taken from the
     * TEST_DATA of the first source, nothing is written to disk, and an HTML
     * log of what happened is returned.
     *
     * @param array $sources associative array containing details of all search
     *      sources
     * @param string $thumb_folder_path path to sub-folders
     *  needs_description.txt file
     * @param boolean $test_mode used to return string in test mode
     * @return mixed string of log messages if $test_mode true; otherwise,
     *  false if $thumb_folder_path does not exist and true if it does
     */
    public function updateResourcesDescription($sources,
        $thumb_folder_path = "", $test_mode = false)
    {
        if (!$test_mode && !file_exists($thumb_folder_path)) {
            return false;
        }
        if (!is_array($sources)) {
            $sources = [];
        }
        $test_results = "";
        $log_function = $this->descriptionLogger($test_results, $test_mode);
        $file_path = $thumb_folder_path . "/needs_description.txt";
        if ($test_mode) {
            $test_data = isset($sources[0]['TEST_DATA']) ?
                $sources[0]['TEST_DATA'] : "";
            $resources_detail = $this->splitNonBlankLines($test_data);
        } else {
            $log_function("---- Processing file $file_path ----");
            $resources_detail = $this->readListFile($file_path);
        }
        $resources_detail_copy = $resources_detail;
        $i = 1;
        foreach ($resources_detail as $resource_detail) {
            if (!$test_mode && $this->runtimeLimitExceeded()) {
                // save what is left so a later run can continue from here
                file_put_contents($file_path, implode(PHP_EOL,
                    $resources_detail_copy));
                return true;
            }
            $log_function("Processing $i - " .
                $this->escapeLogText($resource_detail, $test_mode), "h3");
            array_shift($resources_detail_copy);
            $resource_name = trim(preg_replace('/\s+/', ' ',
                $resource_detail));
            $mime_type = explode("/", L\mimeType($resource_name, true))[0];
            $resource_name = pathinfo($resource_name)['filename'];
            $resource_name = preg_replace('/\s+/', '_', $resource_name);
            foreach ($sources as $source) {
                if ($source['CATEGORY'] != $mime_type) {
                    continue;
                }
                $details = $this->findDetailsUsingSource($source,
                    $resource_name, $test_results, $test_mode);
                if (empty($details)) {
                    continue;
                }
                if ($test_mode) {
                    $log_function("*** Found below details ***", "p");
                    $log_function($this->escapeLogText($details,
                        $test_mode), "pre");
                } else {
                    file_put_contents($thumb_folder_path .
                        "/$resource_detail.txt", $details);
                }
                break;
            }
            $i++;
        }
        if (!$test_mode) {
            file_put_contents($file_path, "");
        }
        return $test_mode ? $test_results : true;
    }
    /**
     * Processes $item, a DOMElement representing a search result for
     * a description for the wiki resource $name, extracting a title and
     * url. From the title a match score with $name is obtained. This score
     * and url as well as in test mode log messages are returned.
     *
     * @param $item DOMNode representing one possible description search result
     * @param $name the wiki resource name we are trying to get a description
     *  of
     * @param $source the source associative array with information about how
     *  to extract description from the current dom document and dom  node.
     * @param $dom DOMDocument of whole document node is from, used in
     *  creating DOMXpath object for quering $item.
     * @param $test_mode boolean whether log messages should be collected
     *  as html in the returned array rather than written to the crawl log
     * @return array $score, $url, $test_results $score of $item as a likely
     *  source for a description for the wiki resource $name, $url
     *  that $item point to with more information, $test_results log messages
     *  if in test mode. If no title or url could be extracted
     *  $score is 0 and $url is null.
     */
    public function processItem($item, $name, $source, $dom, $test_mode = false)
    {
        if (!$item->hasChildNodes()) {
            return [0, null, ""];
        }
        $test_results = "";
        $log_function = $this->descriptionLogger($test_results, $test_mode);
        $log_function("*** Processing item ***", "p");
        $dom_xpath = new \DOMXPath($dom);
        $title = "";
        $score = 0;
        $title_nodes = $this->evaluateXPathNodes($dom_xpath,
            $this->relativeXPath($source, 'TITLE_XPATH'), $item);
        if (!empty($title_nodes)) {
            $title = trim(mb_strtolower($title_nodes[0]->textContent));
            /* the title is lower cased, so lower case the name as well;
               otherwise, case differences alone would reduce the score
             */
            similar_text(mb_strtolower($name), $title, $score);
        }
        $url = "";
        $url_nodes = $this->evaluateXPathNodes($dom_xpath,
            $this->relativeXPath($source, 'URL_XPATH'), $item);
        if (!empty($url_nodes)) {
            $url = UrlParser::canonicalLink($url_nodes[0]->textContent,
                $this->sourceBaseUrl($source['SOURCE_URL']));
        }
        if (!empty($title) && !empty($url)) {
            $log_function(" <b>Title:</b> " .
                $this->escapeLogText($title, $test_mode), "pre");
            $log_function(" <b>URL:</b> " .
                $this->escapeLogText($url, $test_mode), "pre");
            $log_function(" <b>Title Match Percentage:</b> $score", "pre");
            return [$score, $url, $test_results];
        }
        return [0, null, $test_results];
    }
    /**
     * Fetches the details on the url page using the xpaths
     * values configured in search source. Each non-blank line of the
     * source's AUX_INFO is expected to look like name|xpath. For every
     * line whose xpath matches nodes with text, a heading made of
     * the name underlined with dashes followed by the matched text is
     * appended to the details.
     *
     * @param $page string the html string of the details page
     * @param $source array search source details
     * @param $test_mode boolean whether log messages should be collected
     *  as html in the returned array rather than written to the crawl log
     * @return array pair [$details, $test_results] where $details is the
     *  word wrapped string of details found using xpaths (empty if
     *  nothing found or the page could not be parsed) and $test_results are
     *  log messages if in test mode
     */
    public function getDetails($page, $source, $test_mode = false)
    {
        $test_results = "";
        $log_function = $this->descriptionLogger($test_results, $test_mode);
        set_error_handler(null);
        $dom = L\getDomFromString($page);
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        if (!($dom instanceof \DOMDocument)) {
            $log_function("<span class='red'>Error creating DOM</span>",
                "pre");
            // same shape as the success return so callers can destructure it
            return ["", $test_results];
        }
        $dom_xpath = new \DOMXPath($dom);
        $aux_info = isset($source['AUX_INFO']) ? $source['AUX_INFO'] : "";
        $details = "";
        foreach ($this->splitNonBlankLines($aux_info) as $detail_item) {
            $detail_parts = array_pad(explode("|", $detail_item), 2, "");
            $detail_name = trim($detail_parts[0]);
            $detail_xpath = trim($detail_parts[1]);
            $sub_details = "";
            $detail_nodes = $this->evaluateXPathNodes($dom_xpath,
                $detail_xpath);
            foreach ($detail_nodes as $detail_node) {
                $value = trim(preg_replace('/\s+/', ' ',
                    (string)$detail_node->nodeValue));
                if ($value !== "") {
                    $sub_details .= $value . "\n";
                }
            }
            if ($sub_details === "") {
                $log_function("<span class='red'>Could not fetch value".
                    " for <b>" . $this->escapeLogText($detail_name,
                    $test_mode) . "</b></span>", "pre");
            } else {
                // mb_strlen so the underline matches multi-byte names
                $details .= "\n$detail_name\n".
                    str_repeat("-", mb_strlen($detail_name)) . "\n" .
                    $sub_details;
            }
        }
        return [wordwrap($details), $test_results];
    }
    /**
     * Uses a single description search source to try to find details for
     * a resource: fetches the source's search page for the resource, scores
     * each result item using processItem(), then fetches the best scoring
     * item's details page and extracts details from it with getDetails().
     * The best score and url are tracked per source so a url found by one
     * source is never parsed with another source's xpaths.
     *
     * @param array $source associative array of one description search source
     * @param string $resource_name name of resource to search for
     * @param string &$test_results in test mode, html log messages are
     *  appended to this string
     * @param boolean $test_mode whether to collect log messages as html
     *  rather than write them to the crawl log
     * @return string details found, or empty string if none were found
     */
    private function findDetailsUsingSource($source, $resource_name,
        &$test_results, $test_mode)
    {
        $log_function = $this->descriptionLogger($test_results, $test_mode);
        $source_name = $this->escapeLogText($source['NAME'], $test_mode);
        $log_function("*** Using search source <b>$source_name" .
            "</b> to find description ***", "p");
        $search_page_url = $source['SOURCE_URL'] . $resource_name;
        $log_function(" Search Page URL - " .
            $this->escapeLogText($search_page_url, $test_mode), "pre");
        $search_page = FetchUrl::getPage($search_page_url);
        if (empty($search_page)) {
            $log_function("<span class='red'>No search results ".
                "found for " . $this->escapeLogText($resource_name,
                $test_mode) . "</span>", "p");
            return "";
        }
        $items = [];
        set_error_handler(null);
        try {
            $dom = L\getDomFromString($search_page);
            if ($dom instanceof \DOMDocument) {
                $items = $this->evaluateXPathNodes(new \DOMXPath($dom),
                    isset($source['ITEM_XPATH']) ?
                    $source['ITEM_XPATH'] : "");
            }
        } finally {
            set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        }
        $max_score = 0;
        $details_page_url = "";
        foreach ($items as $item) {
            $processed_result = $this->processItem($item,
                $resource_name, $source, $dom, $test_mode);
            $test_results .= $processed_result[2];
            if ($processed_result[0] > $max_score) {
                $max_score = $processed_result[0];
                $details_page_url = $processed_result[1];
            }
        }
        if (empty($details_page_url)) {
            return "";
        }
        $log_function("<b>Selected Details Page URL - " .
            $this->escapeLogText($details_page_url, $test_mode) . "</b>",
            "pre");
        $details_page = FetchUrl::getPage($details_page_url);
        if (empty($details_page)) {
            $log_function("<span class='red'>Details page".
                " not available</span>", "p");
            return "";
        }
        list($details, $test_info) = $this->getDetails(
            $details_page, $source, $test_mode);
        $test_results .= $test_info;
        return $details;
    }
    /**
     * Makes the logging callback shared by the methods of this class.
     * In test mode the callback appends $msg wrapped in the html tag
     * $log_tag to $test_results; otherwise, it writes $msg to the crawl log.
     *
     * @param string &$test_results string html log messages get appended to
     * @param boolean $test_mode whether to append to $test_results or
     *  to write to the crawl log
     * @return callable function ($msg, $log_tag) used to log a message
     */
    private function descriptionLogger(&$test_results, $test_mode)
    {
        return function ($msg, $log_tag = "div class='source-test'")
            use (&$test_results, $test_mode) {
            // closing tag is the tag name without any attributes
            $close_tag = preg_split("/\s+/", $log_tag)[0];
            if ($test_mode) {
                $test_results .=
                    "<$log_tag style='overflow-x: scroll;'>$msg</$close_tag>\n";
            } else {
                L\crawlLog($msg);
            }
        };
    }
    /**
     * Prepares text that did not originate in this class (scraped page
     * text, configured names, urls) for inclusion in a log message. In test
     * mode messages are returned as html, so the text is html escaped;
     * otherwise, it is returned unchanged.
     *
     * @param string $text text to include in a log message
     * @param boolean $test_mode whether log messages are collected as html
     * @return string text safe to embed in the log message
     */
    private function escapeLogText($text, $test_mode)
    {
        if (!$test_mode) {
            return $text;
        }
        return htmlspecialchars((string)$text, ENT_QUOTES | ENT_SUBSTITUTE,
            "UTF-8");
    }
    /**
     * Splits text into lines on any newline sequence (so no stray carriage
     * returns are left when the text was written with "\r\n") and
     * drops lines that are empty or only whitespace.
     *
     * @param string $text text to split
     * @return array list of non-blank lines
     */
    private function splitNonBlankLines($text)
    {
        $lines = preg_split('/\R/', (string)$text);
        if (!is_array($lines)) {
            return [];
        }
        return array_values(array_filter($lines, function ($line) {
            return trim($line) !== "";
        }));
    }
    /**
     * Reads a file containing one entry per line
     *
     * @param string $file_path path of file to read
     * @return array list of non-blank lines of the file, empty if the file
     *  does not exist or cannot be read
     */
    private function readListFile($file_path)
    {
        if (!is_file($file_path) || !is_readable($file_path)) {
            return [];
        }
        $contents = file_get_contents($file_path);
        return ($contents === false) ? [] :
            $this->splitNonBlankLines($contents);
    }
    /**
     * Checks if a thumb folder's needs_description.txt still lists resources
     *
     * @param string $thumb_folder_path folder to check
     * @return bool true if the folder's needs_description.txt exists and is
     *  non-empty
     */
    private function folderHasPendingResources($thumb_folder_path)
    {
        $file_path = $thumb_folder_path . "/needs_description.txt";
        // the file was just rewritten, so do not trust a cached size
        clearstatcache(true, $file_path);
        return is_file($file_path) && filesize($file_path) > 0;
    }
    /**
     * Checks if the current update has run for at least C\ONE_HOUR
     *
     * @return bool whether C\ONE_HOUR or more seconds have passed since
     *  the last update time
     */
    private function runtimeLimitExceeded()
    {
        return time() - $this->update_time >= C\ONE_HOUR;
    }
    /**
     * Evaluates an xpath expression, returning only results that are a list
     * of DOM nodes. An empty or invalid expression, or one which evaluates
     * to a scalar, yields an empty array rather than a value which cannot
     * be iterated over.
     *
     * @param object $dom_xpath DOMXPath to evaluate expression with
     * @param string $expression xpath expression to evaluate
     * @param object $context_node optional DOMNode to evaluate relative to
     * @return array list of DOMNode's matching the expression
     */
    private function evaluateXPathNodes($dom_xpath, $expression,
        $context_node = null)
    {
        if (!is_string($expression) || trim($expression) === "") {
            return [];
        }
        $result = ($context_node === null) ?
            @$dom_xpath->evaluate($expression) :
            @$dom_xpath->evaluate($expression, $context_node);
        if (!($result instanceof \DOMNodeList)) {
            return [];
        }
        $nodes = [];
        foreach ($result as $node) {
            if ($node instanceof \DOMNode) {
                $nodes[] = $node;
            }
        }
        return $nodes;
    }
    /**
     * Gets an xpath field of a description source in a form suitable for
     * evaluating relative to a search result item: a path beginning
     * with / is prefixed with . so that it is resolved from the item node
     * rather than from the document root.
     *
     * @param array $source associative array of one description search source
     * @param string $field name of the xpath field of $source to get
     * @return string the xpath, or empty string if the field is not set
     */
    private function relativeXPath($source, $field)
    {
        $xpath = isset($source[$field]) ? trim((string)$source[$field]) : "";
        if ($xpath !== "" && $xpath[0] == "/") {
            $xpath = "." . $xpath;
        }
        return $xpath;
    }
    /**
     * Computes the scheme://host[:port] prefix of a source's url, used as
     * the base when canonicalizing links found in its search results.
     *
     * @param string $source_url url of a description search source
     * @return string base url; if $source_url has no scheme or host it is
     *  returned unchanged (NOTE(review): confirm this fallback suits
     *  UrlParser::canonicalLink)
     */
    private function sourceBaseUrl($source_url)
    {
        $url_parts = parse_url((string)$source_url);
        if (!is_array($url_parts) || empty($url_parts['scheme']) ||
            empty($url_parts['host'])) {
            return (string)$source_url;
        }
        $base_url = $url_parts['scheme'] . "://" . $url_parts['host'];
        if (!empty($url_parts['port'])) {
            // keep non-default ports so relative links resolve correctly
            $base_url .= ":" . $url_parts['port'];
        }
        return $base_url;
    }
}
ViewGit