Last commit for src/library/processors/RobotProcessor.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]

Adjust copyrights years

<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\processors;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\UrlParser;

/**
 * Processor class used to extract information from robots.txt files
 *
 * @author Chris Pollett
 */
class RobotProcessor extends PageProcessor
{
    /**
     * Sets up any indexing plugins associated with this page processor and
     * registers text/robot as a mime type handled by this class
     *
     * @param array $plugins an array of indexing plugins which might
     *     do further processing on the data handled by this page
     *     processor
     * @param int $max_description_len maximal length of a page summary
     * @param int $max_links_to_extract maximum number of links to extract
     *      from a single document
     * @param string $summarizer_option CRAWL_CONSTANT specifying what kind
     *      of summarizer to use self::BASIC_SUMMARIZER,
     *      self::GRAPH_BASED_SUMMARIZER or self::CENTROID_SUMMARIZER
     */
    public function __construct($plugins = [], $max_description_len = null,
        $max_links_to_extract = null,
        $summarizer_option = self::BASIC_SUMMARIZER)
    {
        parent::__construct($plugins, $max_description_len,
            $max_links_to_extract, $summarizer_option);
        /** Register File Types We Handle
         */
        self::$mime_processor["text/robot"] = "RobotProcessor";
    }
    /**
     * Parses the contents of a robots.txt page extracting allowed,
     * disallowed paths, crawl-delay, and sitemaps. We also extract a
     * list of all user agent strings seen.
     *
     * Consecutive User-agent lines are treated as the header of a single
     * group of rules. A group whose header contains exactly
     * C\USER_AGENT_SHORT takes precedence over groups that only match as a
     * wildcard pattern: once such a group is seen, paths and crawl-delay
     * collected from less specific groups are discarded and later less
     * specific groups are ignored.
     *
     * @param string $page text of the robots.txt file
     * @param string $url location the robots.txt file came from; its host
     *     is used to make sitemap links canonical
     *
     * @return array a summary with fields self::TITLE, self::DESCRIPTION,
     *     self::LANG, self::ROBOT_PATHS (self::ALLOWED_SITES and
     *     self::DISALLOWED_SITES path lists), self::AGENT_LIST,
     *     self::LINKS (sitemap urls), self::PAGE, and, if a delay within
     *     C\MAXIMUM_CRAWL_DELAY applied to this crawler, self::CRAWL_DELAY
     */
    public function process($page, $url)
    {
        $summary = [];
        $summary[self::TITLE] = "";
        $summary[self::DESCRIPTION] = "";
        $summary[self::LANG] = null;
        $summary[self::ROBOT_PATHS] = [self::ALLOWED_SITES => [],
            self::DISALLOWED_SITES => []];
        $summary[self::AGENT_LIST] = [];
        $summary[self::LINKS] = [];
        $host_url = UrlParser::getHost($url);
        $lines = explode("\n", $page);
        /* most specific match level among groups whose rules are being
           kept; must persist across lines for precedence to work
         */
        $specificness = 0;
        // best match level among User-agent lines of the current group
        $group_specificness = -1;
        // true while reading consecutive User-agent lines of one group
        $in_agent_header = false;
        $add_rule_state = false;
        $delay_flag = false;
        $delay = 0;
        $rule_fields = ["allow", "disallow", "crawl-delay"];
        foreach ($lines as $pre_line) {
            // drop anything after a # as a comment
            $pre_line_parts = explode("#", $pre_line);
            $line = $pre_line_parts[0];
            $line_parts = explode(":", $line);
            if (!isset($line_parts[1])) {
                continue;
            }
            //notice we lower case field, so switch below is case insensitive
            $field = strtolower(trim(array_shift($line_parts)));
            // values such as sitemap urls may themselves contain colons
            $value = trim(implode(":", $line_parts));
            if (in_array($field, $rule_fields, true)) {
                /* a rule line, even one with an empty value, ends the
                   User-agent header, so the next User-agent line starts
                   a new group
                 */
                $in_agent_header = false;
            }
            if (strlen($value) == 0) {
                continue;
            }
            switch ($field) {
                case "user-agent":
                    //we allow * in user agent string
                    $summary[self::AGENT_LIST][] = $value;
                    if (!$in_agent_header) {
                        $in_agent_header = true;
                        $group_specificness = -1;
                    }
                    $group_specificness = max($group_specificness,
                        $this->agentSpecificness($value));
                    if ($group_specificness > $specificness) {
                        /* Give precedence to exact match on agent string:
                           forget what less specific groups contributed
                         */
                        $specificness = $group_specificness;
                        $summary[self::ROBOT_PATHS] =
                            [self::ALLOWED_SITES => [],
                            self::DISALLOWED_SITES => []];
                        $delay = 0;
                        $delay_flag = false;
                    }
                    $add_rule_state =
                        ($group_specificness == $specificness);
                break;
                case "sitemap":
                    $tmp_url = UrlParser::canonicalLink($value, $host_url);
                    if (!UrlParser::checkRecursiveUrl($tmp_url)
                        && strlen($tmp_url) < C\MAX_URL_LEN) {
                        /*
                          Sometimes sitemap files don't end in xml
                          so add a fragment to flag as a sitemap when
                          determining queue tier
                         */
                        $summary[self::LINKS][] = $tmp_url . "#sitemap";
                    }
                break;
                case "allow":
                    if ($add_rule_state) {
                        $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] =
                            $this->makeCanonicalRobotPath($value);
                    }
                break;
                case "disallow":
                    if ($add_rule_state) {
                        $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] =
                            $this->makeCanonicalRobotPath($value);
                    }
                break;
                case "crawl-delay":
                    if ($add_rule_state) {
                        $delay_flag = true;
                        // keep the largest delay requested of this crawler
                        $delay = max($delay, intval($value));
                    }
                break;
            }
        }
        if ($delay_flag) {
            if ($delay > C\MAXIMUM_CRAWL_DELAY)  {
                // requested delay is too long, treat whole site as off limits
                $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
            } else {
                $summary[self::CRAWL_DELAY] = $delay;
            }
        }
        $summary[self::PAGE] = "<html><body><pre>".
                strip_tags($page)."</pre></body></html>";
        return $summary;
    }
    /**
     * Rates how specifically the value of a User-agent line in a robots.txt
     * file refers to this crawler
     *
     * @param string $agent_pattern value of a User-agent line, it may
     *     contain * wildcards
     * @return int 1 if $agent_pattern is exactly C\USER_AGENT_SHORT, 0 if
     *     its non-wildcard parts occur in order, case-insensitively and
     *     without overlapping, in C\USER_AGENT_SHORT, and -1 otherwise
     */
    private function agentSpecificness($agent_pattern)
    {
        if (strcmp($agent_pattern, C\USER_AGENT_SHORT) == 0) {
            return 1;
        }
        $offset = 0;
        foreach (explode("*", $agent_pattern) as $part) {
            if ($part === "") {
                continue;
            }
            $found_at = stripos(C\USER_AGENT_SHORT, $part, $offset);
            if ($found_at === false) {
                return -1;
            }
            // resume after this part so later parts cannot overlap it
            $offset = $found_at + strlen($part);
        }
        return 0;
    }
    /**
     *  Converts a path in a robots.txt file into a standard form usable by
     *  Yioop
     *  For robot paths
     *    foo
     *  is treated the same as
     *    /foo
     *  Path might contain urlencoded characters. These are all decoded
     *  except for %2F which corresponds to a / (this is as per
     *  http://www.robotstxt.org/norobots-rfc.txt)
     *  An empty path is converted to /
     *
     * @param string $path to convert
     * @return string Yioop canonical path
     */
    public function makeCanonicalRobotPath($path)
    {
        // check for empty first, $path[0] on "" raises a warning
        if ($path === "" || $path[0] != "/") {
            $path = "/$path";
        }
        /* escape the % of %2F so that urldecode turns it back into a
           literal %2F rather than into a /
         */
        return urldecode(preg_replace("/\%2F/i", "%252F", $path));
    }
}

ViewGit