viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\processors;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\UrlParser;

/**
 * Processor class used to extract information from robots.txt files
 *
 * @author Chris Pollett
 */
class RobotProcessor extends PageProcessor
{
    /**
     * Sets up any indexing plugins associated with this page
     * processor
     *
     * @param array $plugins an array of indexing plugins which might
     *     do further processing on the data handled by this page
     *     processor
     * @param int $max_description_len maximal length of a page summary
     * @param int $max_links_to_extract maximum number of links to extract
     *     from a single document
     * @param string $summarizer_option CRAWL_CONSTANT specifying what kind
     *     of summarizer to use self::BASIC_SUMMARIZER,
     *     self::GRAPH_BASED_SUMMARIZER and self::CENTROID_SUMMARIZER
     */
    public function __construct($plugins = [], $max_description_len = null,
        $max_links_to_extract = null,
        $summarizer_option = self::BASIC_SUMMARIZER)
    {
        parent::__construct($plugins, $max_description_len,
            $max_links_to_extract, $summarizer_option);
        /** Register File Types We Handle */
        self::$mime_processor["text/robot"] = "RobotProcessor";
    }
    /**
     * Parses the contents of a robots.txt page extracting allowed,
     * disallowed paths, crawl-delay, and sitemaps. We also extract a
     * list of all user agent strings seen.
     *
     * Rules are only collected from groups whose User-agent line applies
     * to C\USER_AGENT_SHORT. A group naming our agent exactly takes
     * precedence: the first time one is seen, rules and crawl-delay
     * gathered from wildcard groups are discarded, and wildcard groups
     * appearing later in the file are ignored.
     *
     * @param string $page text string of a robots.txt document
     * @param string $url location the document came from; its host is
     *     used to canonicalize the urls given in Sitemap lines
     *
     * @return array a summary of (title, description, links, and content) of
     *     the information in $page
     */
    public function process($page, $url)
    {
        $summary = [
            self::TITLE => "",
            self::DESCRIPTION => "",
            self::LANG => null,
            self::ROBOT_PATHS => [self::ALLOWED_SITES => [],
                self::DISALLOWED_SITES => []],
            self::AGENT_LIST => [],
            self::LINKS => [],
        ];
        $host_url = UrlParser::getHost($url);
        $add_rule_state = false;
        $delay_flag = false;
        $delay = 0;
        /* 0 = only wildcard/substring agent matches seen so far,
           1 = a group naming our agent exactly has been seen. Must live
           outside the loop so the precedence persists between lines.
         */
        $specificness = 0;
        foreach (explode("\n", $page) as $pre_line) {
            // drop any trailing comment
            $line = explode("#", $pre_line)[0];
            $line_parts = explode(":", $line);
            if (!isset($line_parts[1])) {
                continue;
            }
            //notice we lower case field, so switch below is case insensitive
            $field = strtolower(trim(array_shift($line_parts)));
            // values (for example, sitemap urls) may themselves contain ':'
            $value = trim(implode(":", $line_parts));
            if ($value === "") {
                continue;
            }
            switch ($field) {
                case "user-agent":
                    //we allow * in user agent string
                    $summary[self::AGENT_LIST][] = $value;
                    // agent names are compared case insensitively
                    $current_specificness =
                        (strcasecmp($value, C\USER_AGENT_SHORT) == 0) ? 1 : 0;
                    if ($current_specificness < $specificness) {
                        /* a less specific group than one already matched:
                           its rules must not be mixed into ours
                         */
                        $add_rule_state = false;
                        break;
                    }
                    if ($specificness < $current_specificness) {
                        //Give precedence to exact match on agent string
                        $specificness = $current_specificness;
                        $add_rule_state = true;
                        $summary[self::ROBOT_PATHS] = [
                            self::ALLOWED_SITES => [],
                            self::DISALLOWED_SITES => []];
                        // a delay from a superseded group no longer applies
                        $delay_flag = false;
                        $delay = 0;
                        break;
                    }
                    $add_rule_state = $this->matchesAgentPattern($value);
                    break;
                case "sitemap":
                    $tmp_url = UrlParser::canonicalLink($value, $host_url);
                    if (!UrlParser::checkRecursiveUrl($tmp_url)
                        && strlen($tmp_url) < C\MAX_URL_LEN) {
                        /* Sometimes sitemap files don't end in xml so add
                           a fragment to flag as a sitemap when determining
                           queue tier
                         */
                        $summary[self::LINKS][] = $tmp_url . "#sitemap";
                    }
                    break;
                case "allow":
                    if ($add_rule_state) {
                        $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] =
                            $this->makeCanonicalRobotPath($value);
                    }
                    break;
                case "disallow":
                    if ($add_rule_state) {
                        $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] =
                            $this->makeCanonicalRobotPath($value);
                    }
                    break;
                case "crawl-delay":
                    if ($add_rule_state) {
                        $delay_flag = true;
                        $delay = max($delay, intval($value));
                    }
                    break;
            }
        }
        if ($delay_flag) {
            if ($delay > C\MAXIMUM_CRAWL_DELAY) {
                // requested delay is too long, so disallow the whole site
                $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
            } else {
                $summary[self::CRAWL_DELAY] = $delay;
            }
        }
        $summary[self::PAGE] = "<html><body><pre>" .
            strip_tags($page) . "</pre></body></html>";
        return $summary;
    }
    /**
     * Checks whether a User-agent value from a robots.txt file, which may
     * contain * wildcards, matches C\USER_AGENT_SHORT. The non-empty pieces
     * between the *'s must each occur, case insensitively and in order
     * without overlapping, somewhere in the agent name.
     *
     * @param string $pattern User-agent value to test
     * @return bool whether the pattern applies to this crawler
     */
    private function matchesAgentPattern($pattern)
    {
        $offset = 0;
        foreach (explode("*", $pattern) as $part) {
            if ($part === "") {
                continue;
            }
            $found = stripos(C\USER_AGENT_SHORT, $part, $offset);
            if ($found === false) {
                return false;
            }
            // resume after this piece so later pieces cannot reuse it
            $offset = $found + strlen($part);
        }
        return true;
    }
    /**
     * Converts a path in a robots.txt file into a standard form usable by
     * Yioop
     * For robot paths
     * foo
     * is treated the same as
     * /foo
     * Path might contain urlencoded characters. These are all decoded
     * except for %2F which corresponds to a / (this is as per
     * http://www.robotstxt.org/norobots-rfc.txt)
     *
     * @param string $path to convert
     * @return string Yioop canonical path
     */
    public function makeCanonicalRobotPath($path)
    {
        // substr rather than $path[0] so an empty path does not raise an
        // "Uninitialized string offset" error; it canonicalizes to "/"
        if (substr($path, 0, 1) !== "/") {
            $path = "/$path";
        }
        /* escape the % of each %2F first so that urldecode leaves a
           literal %2F in the result rather than turning it into a /
         */
        return urldecode(preg_replace("/\%2F/i", "%252F", $path));
    }
}