viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]

Last commit for src/library/processors/HtmlProcessor.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 * Copyright (C) 2009 - 2024  Chris Pollett
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 * @author Chris Pollett
 * @license GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2024
 * @filesource
namespace seekquarry\yioop\library\processors;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\UrlParser;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\ScraperManager;
/**
 * Used to create crawl summary information
 * for HTML files
 *
 * @author Chris Pollett
 */
// Page processor for HTML documents; inherits from TextProcessor.
// NOTE(review): this listing shows no braces or comment delimiters, so the
// " * ..." lines below are the bodies of the original docblocks.
class HtmlProcessor extends TextProcessor
     * An array of scrapers to be used by this HtmlProcessor
     * @var array
    // Passed to ScraperManager::getScraper() by process().
    public $scrapers = [];
     * Whether we are using this processor in the Page Options activity
     * @var boolean
    // When non-empty, relCanonical() returns false without inspecting the DOM.
    public static $page_options_testing = false;
     * Maximum number of characters in a title
    // Used by title() as the length argument of substr(), which counts bytes.
    const MAX_TITLE_LEN = 100;
    /**
     * Sets up any indexing plugins associated with this page
     * processor
     *
     * @param array $plugins an array of indexing plugins which might
     *     do further processing on the data handled by this page
     *     processor
     * @param int $max_description_len maximal length of a page summary
     * @param int $max_links_to_extract maximum number of links to extract
     *      from a single document
     * @param string $summarizer_option CRAWL_CONSTANT specifying what kind
     *      of summarizer to use self::BASIC_SUMMARIZER,
     *      self::CENTROID_SUMMARIZER
     */
    public function __construct($plugins = [], $max_description_len = null,
        $max_links_to_extract = C\MAX_LINKS_TO_EXTRACT,
        $summarizer_option = self::BASIC_SUMMARIZER)
    {
        parent::__construct($plugins, $max_description_len,
            $max_links_to_extract, $summarizer_option);
        // File extensions this processor registers itself for. They are
        // appended to the static list shared through the class hierarchy.
        $add_extensions = ["asp", "aspx", "cgi", "cfm", "cfml", "do", "htm",
            "html", "jsp", "php", "pl", "py", "shtml"];
        self::$indexed_file_types = array_merge(self::$indexed_file_types,
            $add_extensions);
        // Mime types that should be routed to this processor.
        self::$mime_processor["text/html"] = "HtmlProcessor";
        self::$mime_processor["text/asp"] = "HtmlProcessor";
        self::$mime_processor["application/xhtml+xml"] = "HtmlProcessor";
    }
    /**
     * Used to extract the title, description and links from
     * a string consisting of webpage data.
     *
     * @param string $page web-page contents
     * @param string $url the url where the page contents came from,
     *    used to canonicalize relative links
     * @return array a summary of the contents of the page
     */
    // Builds the summary array for one HTML page: scraper label, robots
    // metas, title, language, description, favicon, links and any
    // redirect/canonical target. Each fallback that fires appends a tag to
    // $summary[self::FALLBACK_PROCESSOR].
    // NOTE(review): this listing contains no block delimiters, and several
    // statements below end in "(" , "," or "=" with their continuation
    // missing. Those spots are flagged individually; restore them from the
    // repository copy before relying on this text.
    public function process($page, $url)
        $summary = [];
        $page ??= "";
        $url ??= "";
        if (is_string($page)) {
            /*check to see if we have a page from a CMS we recognize
              if so we will extract the important content and only
               summarize the important content */
            if ($scraper =
                ScraperManager::getScraper($page, $this->scrapers)) {
                $summary[self::SCRAPER_LABEL] = $scraper['NAME'];
            // Keep the untouched markup; it is stored under self::PAGE below.
            $original_page = $page;
            // Replace a few common entities with spaces, then drop <script>
            // and <style> elements, before handing the markup to the parser.
            $page = preg_replace('/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
                ' ', $page) ?? "";
            $page =
                preg_replace('@<script[^>]*?>[\s\S]*?</script\s*>@si', ' ',
                $page) ?? "";
            $dom_page = preg_replace('@<style[^>]*?>[\s\S]*?</style>@si', ' ',
                $page) ?? "";
            $dom = self::dom($dom_page);
            $summary[self::FALLBACK_PROCESSOR] = "";
            if ($dom !== false) {
                $summary[self::ROBOT_METAS] = self::getMetaRobots($dom);
                $summary[self::TITLE] = self::title($dom);
                // No title found through the DOM: fall back to a tag scan.
                if ($summary[self::TITLE] == "") {
                    $summary[self::TITLE] = self::crudeTitle($dom_page);
                    $summary[self::FALLBACK_PROCESSOR] .= "CRUDE_TITLE ";
                // NOTE(review): final argument of this call is not shown;
                // lang() declares a third parameter $url.
                $summary[self::LANG] = self::lang($dom, strip_tags($dom_page),
                $description_dom = $dom;
                // A matching scraper may narrow the markup used for the
                // description and contribute extra summary fields.
                if (!empty($scraper)) {
                    $scrape_results = ScraperManager::applyScraperRules(
                        $dom_page, $scraper);
                    if (!empty($scrape_results)) {
                        list($scrape_fields, $dom_page) = $scrape_results;
                        $description_dom = self::dom($dom_page);
                        $summary = array_merge($summary, $scrape_fields);
                // NOTE(review): final argument of getSummary() is not shown.
                list($summary[self::DESCRIPTION], $summary[self::WORD_CLOUD],
                    $summary[self::DESCRIPTION_SCORES]) =
                    $this->summarizer->getSummary($description_dom, $dom_page,
                // $crude records that the summarizer produced no text; it
                // later suppresses the rel=canonical check.
                $crude = false;
                if (trim($summary[self::DESCRIPTION]) == "") {
                    // NOTE(review): argument of crudeDescription() is not
                    // shown.
                    $summary[self::DESCRIPTION] = self::crudeDescription(
                    $summary[self::FALLBACK_PROCESSOR] .= "CRUDE_DESCRIPTION ";
                    L\crawlLog("..No text extracted. ".
                        "Invoked crude description fallback.");
                    $crude = true;
                $summary[self::FAVICON_URL] = self::favicon($dom, $url);
                // NOTE(review): final argument of links() is not shown;
                // links() declares a third parameter $lang.
                $summary[self::LINKS] = self::links($dom, $url,
                if ($summary[self::LINKS] == []) {
                    // NOTE(review): argument of extractHttpHttpsUrls() is
                    // not shown.
                    $summary[self::LINKS] = parent::extractHttpHttpsUrls(
                    $summary[self::FALLBACK_PROCESSOR] .= "PARENT_LINKS ";
                // A meta refresh/location target is recorded as a link whose
                // text is "location:<this url>" and noted in the description.
                $location = self::location($dom, $url);
                if ($location) {
                    $summary[self::LINKS][$location] = "location:" . $url;
                    $summary[self::LOCATION] = true;
                    $summary[self::DESCRIPTION] .= $url . " => " . $location;
                    if (!$summary[self::TITLE]) {
                        $summary[self::TITLE] = $url;
                // rel=canonical is only consulted when real text was
                // extracted and no meta redirect was found. Unlike the
                // redirect case it replaces every other link.
                if (!$crude && !$location) {
                    $location = self::relCanonical($dom, $url);
                    if ($location) {
                        $summary[self::LINKS] = [];
                        $summary[self::LINKS][$location] = "location:" . $url;
                        $summary[self::LOCATION] = true;
                        if (!$summary[self::DESCRIPTION]) {
                            $summary[self::DESCRIPTION] .=
                                $url." => ".$location;
                        if (!$summary[self::TITLE]) {
                            $summary[self::TITLE] = $url;
                $summary[self::PAGE] = $original_page;
                // Nothing usable was extracted: reprocess the tag-stripped
                // text with the parent processor and copy over any field
                // that is still empty.
                // NOTE(review): the block comment opened on the next line
                // has no visible terminator in this listing.
                if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
                    == 0 && count($summary[self::LINKS]) == 0 && !$location) {
                    /* maybe not html? treat as text with messed up tags
                        still try to get urls
                    $summary_text = parent::process(strip_tags($page), $url);
                    foreach ($summary as $field => $value) {
                        if (empty($value) && isset($summary_text[$field])) {
                            $summary[$field] = $summary_text[$field];
                    $summary[self::FALLBACK_PROCESSOR] .= "PARENT_PROCESS ";
                // Only when $url is a bare host (equal to its own host part
                // once a trailing "/" is removed).
                // NOTE(review): right-hand side of the assignment is not
                // shown.
                if (UrlParser::getHost($url) == rtrim($url, "/")) {
                    $summary[self::TOP_LEVEL_LINKS] =
                // NOTE(review): the second operand of this condition and the
                // value assigned below are both incomplete in this listing.
                if (!empty($summary[self::FAVICON_URL]) &&
                    array_keys($summary[self::LINKS]))) {
                    $summary[self::LINKS][$summary[self::FAVICON_URL]] =
            // DOM construction failed: use the parent processor's summary.
            // NOTE(review): $summary is replaced by the parent's result just
            // before ".=" is applied to FALLBACK_PROCESSOR; confirm the
            // parent always sets that key.
            } else if ($dom == false) {
                $summary = parent::process($page, $url);
                $summary[self::FALLBACK_PROCESSOR] .= "PARENT_PROCESS ";
        return $summary;
    /**
     * Used to compute the favicon url for a web page.
     *
     * @param object $dom document object model of the web page trying to
     *   compute the favicon url for
     * @param string $url of web page that $dom corresponds to. Used to
     *   help compute favicon url if link to icon relative in $dom or
     *   if non-present and guessing using hostname.
     * @return string url of favicon for web page (falls back to
     *   favicon.ico on the host of $url if no usable icon link is found)
     */
    public static function favicon($dom, $url)
    {
        $xpath = new \DOMXPath($dom);
        // A <base href> changes what relative icon links resolve against.
        $base_refs = $xpath->evaluate("/html//base");
        if ($base_refs->item(0)) {
            $tmp_url = $base_refs->item(0)->getAttribute('href');
            if (strlen($tmp_url ?? "") > 0) {
                $url = UrlParser::canonicalLink($tmp_url, $url);
            }
        }
        $hrefs = $xpath->evaluate("/html/head/link[contains(@rel, 'icon')]");
        $favicon_url = "";
        foreach ($hrefs as $href) {
            $favicon_url = UrlParser::canonicalLink(
                $href->getAttribute('href'), $url) ?? "";
            $len = strlen($favicon_url);
            /* Reject a candidate that is recursive, too long or too short.
               This is the negation of the acceptance test used in links().
               The previous form joined the three tests with && and so
               could never reject anything.
             */
            if (UrlParser::checkRecursiveUrl($favicon_url) ||
                $len >= C\MAX_URL_LEN || $len <= 4) {
                $favicon_url = "";
            }
            // first acceptable icon link wins
            if (!empty($favicon_url)) {
                break;
            }
        }
        if (empty($favicon_url)) {
            // no usable <link rel="...icon..."> so guess the conventional
            // location on the page's host
            $host = UrlParser::getHost($url);
            $favicon_url = "$host/favicon.ico";
        }
        return $favicon_url;
    }
    /**
     * For a url which consists of just a hostname, computes the top level
     * links within its web page. These links will eventually be displayed
     * underneath the main link in the search results
     *
     * @param string $url of website that is currently being processed
     * @param array $links associative array of $link_url => $link_text pairs
     * @return array of important links for the url
     */
    // Scores each candidate link of a host page and returns a map
    // link url => display title for the links selected as "top level".
    // NOTE(review): this listing has no block delimiters and has lost
    // single-token lines; several if-statements show no body and several
    // calls show no closing argument. Flagged individually below.
    public static function computeTopLevelLinks($url, $links)
        $top_level_links = [];
        $links_scores = [];
        $out_links = [];
        foreach ($links as $link_url => $link_text) {
            // Avoid redirects in top-level links
            // NOTE(review): body of this if is not shown (presumably skips
            // the link -- confirm).
            if (preg_match("/^(Location|http)/i", $link_text)) {
            // Candidate must mention the page's company level domain and
            // must not be the page itself. Body of the if is not shown.
            $cld = UrlParser::getCompanyLevelDomain($url);
            if (stristr($link_url, $cld) === false ||
                trim($link_url, "/") == trim($url, "/")) {
            // NOTE(review): second argument of this call is not shown.
            $mime_type = UrlParser::guessMimeTypeFromFileName($link_url,
            // Only links guessed to be html are kept; body not shown.
            if ($mime_type != "text/html") {
            // every surviving link starts from a score of 1.0
            $links_scores[$link_url] ??= 1.0;
            // NOTE(review): body of this if is not shown.
            if (stristr($link_url, $url) !== false) {
            // $last_part is the url-decoded final path segment of the link.
            $link_parts = explode("/", $link_url);
            $last_part = urldecode($link_parts[count($link_parts) - 1]);
            // NOTE(review): subject argument of this preg_replace is not
            // shown.
            $lower_part = preg_replace("/\-|\_|\./u", " ",
            // .. is used to separate link text for links with same url
            $link_text_parts = explode("..", $link_text);
            $num_parts = count($link_text_parts);
            // if a lot of links with same url, guess more important
            $links_scores[$link_url] += 0.5 * $num_parts;
            // Pick as title the text part which, once "-", "_" and "." are
            // turned into spaces, contains the fewest spaces. The first
            // non-empty part seeds $active_count.
            $link_title = "";
            foreach($link_text_parts as $link_text_part) {
                $test_text = preg_replace("/\-|\_|\./u", " ", $link_text_part);
                $test_text ??= "";
                if (empty($link_title) && !empty($link_text_part)) {
                    $link_title = $link_text_part;
                    $active_count = substr_count($test_text, " ");
                if (substr_count($test_text, " ") < $active_count) {
                    $link_title = $link_text_part;
                    $active_count = substr_count($test_text, " ");
            // Normalise the chosen title: separators to spaces, trimmed,
            // title-cased.
            $link_title = preg_replace("/\-|\_|\./u", " ", $link_title);
            $link_title = trim($link_title ?? "");
            $link_title = mb_convert_case($link_title, MB_CASE_TITLE_SIMPLE);
            $out_links[$link_url] = $link_title;
            $lower_text = mb_strtolower($link_title);
            $num_spaces = substr_count($last_part, " ");
            $num_text_spaces = substr_count($lower_text, " ");
            // Adds at most 1/(spaces in last path segment + 1). A title with
            // more than 3 spaces instead subtracts its space count.
            $links_scores[$link_url] += min(1/($num_spaces + 1),
                (($num_text_spaces > 3) ? -$num_text_spaces : 1));
            // bonus when the title equals the separator-normalised segment
            if (trim($lower_text) == trim($lower_part)) {
                $links_scores[$link_url] += 1.0;
            // NOTE(review): inside a double-quoted PHP string "\$" is a
            // literal "$", so the pattern PCRE receives starts with an
            // unescaped end-of-subject anchor rather than a dollar sign.
            // That alternative matches on every input, so $num_symbols would
            // never be 0. Confirm intent; a single-quoted pattern would
            // match a literal "$".
            $num_symbols = preg_match_all("/\$|\=|\&/", $lower_text) ?? 0;
            $links_scores[$link_url] -= $num_symbols;
        // Select links in $links_scores iteration order.
        // NOTE(review): no sort of $links_scores and no increment of $i is
        // visible in this listing, and the two ifs below show no body.
        $i = 0;
        foreach ($links_scores as $link_url => $score) {
            if ($i >= min(C\MAX_TOP_LEVEL_LINKS, count($links_scores))) {
            if ($score < 0) {
            // don't add if title hasn't changed
            if (!in_array($out_links[$link_url], $top_level_links)) {
                $top_level_links[$link_url] = $out_links[$link_url];
        return $top_level_links;
    /**
     * Get any NOINDEX, NOFOLLOW, NOARCHIVE, NONE, info out of any robot
     * meta tags.
     *
     * @param object $dom - a document object to check the meta tags for
     * @return array of robot meta instructions
     */
    public static function getMetaRobots($dom)
    {
        $xpath = new \DOMXPath($dom);
        // we use robot rather than robots just in case people forget the s
        // XPath 1.0 has no case-insensitive compare, so the name attribute
        // is upper-cased with translate() before the substring test.
        // NOTE(review): the last operand of this expression was missing
        // from the listing and has been reconstructed -- confirm against
        // the repository copy.
        $robots_check = "contains(translate(@name,".
            "'abcdefghijklmnopqrstuvwxyz'," .
            "'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'ROBOT')";
        $metas = $xpath->evaluate("/html/head//meta[$robots_check]");
        $found_metas = [];
        foreach ($metas as $meta) {
            $content = $meta->getAttribute('content');
            // content is a comma separated list such as "noindex, nofollow"
            foreach (explode(",", $content) as $robot_meta) {
                $directive = strtoupper(trim($robot_meta));
                // an empty content attribute or a stray comma yields ""
                if ($directive !== "") {
                    $found_metas[] = $directive;
                }
            }
        }
        return $found_metas;
    }
    /**
     * Determines the language of the html document by looking at the root
     * language attribute. If that fails $sample_text is used to try to guess
     * the language
     *
     * @param object $dom  a document object to check the language of
     * @param string $sample_text sample text to try to guess the language
     *     from
     * @param string $url url of web-page as a fallback look at the country
     *     to figure out language
     * @return string language tag for guessed language
     */
    // Tries, in order: a lang="..." inside a processing instruction node,
    // the lang attribute of the <html> element, charset information in
    // <meta> tags, and finally self::calculateLang() on the sample text.
    // The first two sources are only trusted when they say something other
    // than 'en' / 'en-US'.
    // NOTE(review): this listing has no block delimiters and the
    // $charset_checks array literal below is visibly incomplete.
    public static function lang($dom, $sample_text = null, $url = null)
        foreach ($dom->childNodes as $item) {
            if ($item->nodeType == XML_PI_NODE) {
                // two-letter code with optional two-letter region, optionally
                // quoted
                if(preg_match('/lang\s*\=\s*[\'|\"]?([a-zA-Z][a-zA-Z]' .
                    '(\-[a-zA-Z][a-zA-Z])?)[\'|\"]?/', $item->nodeValue,
                    $match)) {
                    if (!empty($match[1])) {
                        $lang = $match[1];
                        if ($lang != 'en' && $lang != 'en-US') {
                            return $lang;
        $htmls = $dom->getElementsByTagName("html");
        // $lang may still hold an 'en'/'en-US' value from the loop above;
        // otherwise make sure it is defined.
        $lang = (empty($lang)) ? null : $lang;
        foreach ($htmls as $html) {
            $lang = $html->getAttribute('lang');
            // accept en_GB style tags by normalising to en-GB
            $lang = str_replace("_", "-", $lang);
            if ($lang != null && $lang != 'en' && $lang != 'en-US') {
                return $lang;
        //baidu doesn't have a lang attribute but does say encoding
        $xpath = new \DOMXPath($dom);
        // NOTE(review): the remainder of this array literal is not shown.
        // From the loop below, each key is an XPath predicate and each
        // value is an index into the "="-separated pieces of the matching
        // meta's content attribute.
        $charset_checks = ["contains(translate(@http-equiv,".
            "'abcdefghijklmnopqrstuvwxyz'," .
            "'abcdefghijklmnopqrstuvwxyz'," .
        foreach ($charset_checks as $charset_check => $index) {
            $metas = $xpath->evaluate("/html/head//meta[$charset_check]");
            $found_metas = [];
            foreach ($metas as $meta) {
                $content = $meta->getAttribute('content');
                $charset_metas = explode("=", $content);
                // index 0: the first piece of content is returned as is
                if ($index == 0) {
                    return $charset_metas[$index];
                // otherwise the indexed piece is treated as a charset name
                // and mapped to a language
                if (isset($charset_metas[$index])) {
                    $charset = strtoupper($charset_metas[$index]);
                    $lang = L\guessLangEncoding($charset);
                    if ($lang != 'en') { //default is en, so keep checking
                        return $lang;
        $lang = self::calculateLang($sample_text, $url);
        return $lang;
    /**
     * Returns title of a webpage based on its document object
     *
     * @param object $dom a document object to extract a title from.
     * @return string  a title of the page
     */
    public static function title($dom)
    {
        $xpath = new \DOMXPath($dom);
        // Ordered from most to least authoritative source of a title.
        $title_parts = ["/html/head/title",
            "/html//title", "/html//h1", "/html//h2",
            "/html//h3", "/html//h4", "/html//h5", "/html//h6"];
        $title = "";
        foreach ($title_parts as $part) {
            $doc_nodes = $xpath->evaluate($part);
            foreach ($doc_nodes as $node) {
                $title = trim($node->nodeValue ?? "");
                // first non-empty candidate wins, leave both loops
                if (!empty($title)) {
                    break 2;
                }
            }
        }
        /* Strings read from the DOM are UTF-8. Truncate by characters so
           that a multi-byte character is never cut in half; a byte-wise
           substr() could leave an invalid sequence at the end.
         */
        return mb_substr($title, 0, self::MAX_TITLE_LEN, "UTF-8");
    }
    /**
     * Returns title of a webpage based on a crude tag scan,
     *     used as a fall back if dom parsing did not work.
     *
     * @param string $page to extract title from
     * @return string  a title of the page
     */
    public static function crudeTitle($page)
    {
        /* The tag names are passed without their closing ">" so that a
           title tag carrying attributes is still found. The extracted text
           therefore begins with whatever followed "<title" in the page, and
           wrapping it back in the tag lets strip_tags() discard that
           remainder.
           NOTE(review): assumes the second element returned by
           getBetweenTags() is the text between the markers -- confirm.
         */
        [, $title] = parent::getBetweenTags($page, 0, "<title", "</title");
        return strip_tags("<title" . $title . "</title>");
    }
    /**
     * Returns summary of body of a web page based on crude regex matching
     *     used as a fall back if dom parsing did not work.
     *
     * @param string $page to extract description from
     * @return string  a description of the page
     */
    public static function crudeDescription($page)
    {
        $page ??= "";
        if (stripos($page, "<body") !== false) {
            [, $body] = parent::getBetweenTags($page, 0, "<body", "</body");
        } else {
            // No body tag: supply the ">" that ends the "<body" prefix
            // added below, so the whole page is treated as the body.
            $body = ">" . $page;
        }
        // A space before every tag keeps the words of adjacent elements
        // from fusing once the tags are removed. preg_replace() returns
        // null on error, so fall back to an empty string.
        $body = preg_replace("/\</", " <", $body ?? "") ?? "";
        $body = strip_tags("<body" . $body . "</body>");
        if ($body == "") {
            return $body;
        }
        $body = preg_replace("/\s+/", " ", $body) ?? "";
        return mb_substr($body, 0, self::$max_description_len);
    }
    /**
     * Extracts a refresh or location url from the meta tags of an html
     * page
     *
     * @param object $dom document object version of web page
     * @param string $url the url where the dom object comes from
     * @return mixed refresh or location url if found, false otherwise
     */
    public static function location($dom, $url)
    {
        $xpath = new \DOMXPath($dom);
        //Look for Refresh or Location
        $metas = $xpath->evaluate("/html//meta");
        foreach ($metas as $meta) {
            $http_equiv = $meta->getAttribute('http-equiv');
            if (stristr($http_equiv, "refresh") === false &&
                stristr($http_equiv, "location") === false) {
                continue;
            }
            /* content looks like "0; url=http://host/path?a=b". Split on
               the first "=" only, so a target with its own query string is
               kept whole rather than cut at its first parameter.
             */
            $urls = explode("=", $meta->getAttribute('content'), 2);
            if (!isset($urls[1])) {
                continue;
            }
            $refresh_url = trim($urls[1]);
            if ($refresh_url === "" ||
                UrlParser::checkRecursiveUrl($refresh_url) ||
                strlen($refresh_url) >= C\MAX_URL_LEN) {
                continue;
            }
            if ($refresh_url != $url) {
                //ignore refresh if points to same place
                return $refresh_url;
            }
        }
        return false;
    }
    /**
     * If a canonical link element
     * (<link rel="canonical" href="..." />)
     * is in $dom, then this function extracts it
     *
     * @param object $dom document object version of web page
     * @param string $url the url where the dom object comes from
     * @return mixed canonical url if found and it differs enough from $url,
     *     false otherwise
     */
    public static function relCanonical($dom, $url)
    {
        if (!empty(self::$page_options_testing)) {
            /* don't check for rel canonical if running a test on Page
               Options
             */
            return false;
        }
        $xpath = new \DOMXPath($dom);
        //Look for a link element whose rel mentions canonical
        $links = $xpath->evaluate("/html/head/link");
        foreach ($links as $link) {
            if (!stristr($link->getAttribute('rel'), "canonical")) {
                continue;
            }
            $canonical_url = trim($link->getAttribute('href'));
            /* levenshtein gives notices on strings longer than 255, so it
               is only evaluated when both strings are within the limits
               below. A $url over the limit is treated as different from
               the canonical url without measuring the distance.
             */
            if (!UrlParser::checkRecursiveUrl($canonical_url) &&
                strlen($canonical_url) < min(252, C\MAX_URL_LEN) &&
                (strlen($url) > min(255, C\MAX_URL_LEN + 3) ||
                levenshtein($canonical_url, $url) > 3)) {
                //ignore canonical if points to (nearly) the same place
                return $canonical_url;
            }
        }
        return false;
    }
    /**
     * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
     * dom object where links have been canonicalized according to
     * the supplied $site information.
     *
     * @param object $dom   a document object with links on it
     * @param string $site  a string containing a url
     * @param string $lang locale for document
     * @return array   links from the $dom object
     */
    // Collects url => link text pairs from three sources in turn: <a>
    // elements in the body, frame/iframe src attributes, and <img>
    // elements that carry an alt attribute. Text for a url seen more than
    // once is joined with " .. ".
    // NOTE(review): this listing has no block delimiters and has lost
    // single-token lines: several calls and concatenations below are
    // visibly unfinished and no increment of $i appears anywhere.
    public static function links($dom, $site, $lang)
        $sites = [];
        $xpath = new \DOMXPath($dom);
        // NOTE(review): $tokenizer is not used in any line shown here.
        $tokenizer = PhraseParser::getTokenizer($lang);
        // a <base href> replaces $site as the base for relative links
        $base_refs = $xpath->evaluate("/html//base");
        if ($base_refs->item(0)) {
            $tmp_site = $base_refs->item(0)->getAttribute('href');
            if (strlen($tmp_site) > 0) {
                $site = UrlParser::canonicalLink($tmp_site, $site);
        $i = 0;
        $hrefs = $xpath->evaluate("/html/body//a");
        foreach ($hrefs as $href) {
            // a negative max_links_to_extract means no limit
            if (self::$max_links_to_extract < 0 ||
                $i < self::$max_links_to_extract) {
                // links marked rel="nofollow" are skipped
                $rel = $href->getAttribute("rel");
                if ($rel == "" || !stristr($rel, "nofollow")) {
                    $url = UrlParser::canonicalLink(
                        $href->getAttribute('href'), $site);
                    $len = strlen($url);
                    // keep only non-recursive urls of plausible length
                    if (!UrlParser::checkRecursiveUrl($url)  &&
                        $len < C\MAX_URL_LEN && $len > 4) {
                        $text = $href->nodeValue;
                        $url_title = $href->getAttribute('title') ?? "";
                        $useful_text = $text;
                        // NOTE(review): subject argument of this
                        // preg_replace is not shown.
                        $useful_text = preg_replace("/\.\.|\s+/u", "",
                        // Anchor text too short to be useful: try to borrow
                        // context from the parent element's text.
                        if (mb_strlen($useful_text) < C\MIN_LINKS_TEXT_CHARS) {
                            $parent_node = $href->parentNode;
                            if (!empty($parent_node->nodeValue)) {
                                $pre_text = $parent_node->nodeValue;
                                if (strlen($pre_text) > C\MAX_LINKS_TEXT_CHARS){
                                    // NOTE(review): the $extract expression
                                    // and the $regex concatenation are both
                                    // cut off in this listing.
                                    $extract =  floor((C\MAX_LINKS_TEXT_CHARS -
                                    $regex = "/\b(\w{3}.{0,$extract})?(?:(?:" .
                                        preg_quote($text, "/") .
                                    preg_match($regex, $pre_text, $match);
                                    if (!empty($match[0])) {
                                        $text = $match[0];
                                        $useful_text = $text;
                            // NOTE(review): the value appended to $text and
                            // the body of the inner if are not shown.
                            if (mb_strlen($useful_text) <
                                C\MIN_LINKS_TEXT_CHARS) {
                                $text .= " " .
                                if (mb_strlen($text) < C\MIN_LINKS_TEXT_CHARS) {
                        // Url already seen: append this text (and title, if
                        // any) and cap at 2 * MAX_LINKS_TEXT_CHARS characters.
                        if (isset($sites[$url])) {
                            $sites[$url] .= " .. ".
                                preg_replace("/\s+/u", " ", strip_tags($text));
                            if (!empty($url_title)) {
                                $sites[$url] .= " .. " . $url_title;
                            $sites[$url] = mb_substr($sites[$url], 0,
                                2* C\MAX_LINKS_TEXT_CHARS);
                        } else {
                            // NOTE(review): subject argument of this
                            // preg_replace is not shown.
                            $sites[$url] = preg_replace("/\s+/u", " ",
                            $sites[$url] = mb_substr($sites[$url], 0,
                                2* C\MAX_LINKS_TEXT_CHARS);
        // Frames and iframes are recorded with the fixed text "HTMLframe".
        // $i is not reset here, so they share the limit with the anchors.
        $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
        foreach ($frames as $frame) {
            if (self::$max_links_to_extract < 0 ||
                $i < self::$max_links_to_extract) {
                $url = UrlParser::canonicalLink(
                    $frame->getAttribute('src'), $site);
                $len = strlen($url);
                if (!UrlParser::checkRecursiveUrl($url)
                    && $len < C\MAX_URL_LEN && $len > 4) {
                    if (isset($sites[$url]) ) {
                        $sites[$url] .= " .. HTMLframe";
                    } else {
                        $sites[$url] = "HTMLframe";
        // Images use their alt text as link text. The counter is reset, so
        // images get their own allowance of max_links_to_extract.
        $imgs = $xpath->evaluate("/html/body//img[@alt]");
        $i = 0;
        foreach ($imgs as $img) {
            if (self::$max_links_to_extract < 0 ||
                $i < self::$max_links_to_extract) {
                $alt = $img->getAttribute('alt');
                // NOTE(review): body of this if is not shown (presumably
                // skips images with empty alt text -- confirm).
                if (strlen($alt) < 1) {
                $url = UrlParser::canonicalLink(
                    $img->getAttribute('src'), $site);
                $len = strlen($url);
                if (!UrlParser::checkRecursiveUrl($url)
                    && $len < C\MAX_URL_LEN && $len > 4) {
                    if (isset($sites[$url])) {
                        $sites[$url] .= " .. " . $alt;
                        $sites[$url] = mb_substr($sites[$url], 0,
                            2 * C\MAX_LINKS_TEXT_CHARS);
                    } else {
                        $sites[$url] = $alt;
                        $sites[$url] = mb_substr($sites[$url], 0,
                            2* C\MAX_LINKS_TEXT_CHARS);
       return $sites;
    /**
     * This returns the text content of a node but with spaces
     * where tags were (unlike just using textContent)
     *
     * @param object $node a DOMNode
     * @return string its text content with spaces
     */
    public static function domNodeToString($node)
    {
        // A DOMDocument has no ownerDocument; serialize it directly.
        if ($node instanceof \DOMDocument) {
            $text = $node->saveHTML();
        } else {
            $text = $node->ownerDocument->saveHTML($node);
        }
        if ($text === false) {
            return "";
        }
        // A space before every tag keeps the words of adjacent elements
        // from fusing once the tags are removed.
        $text = preg_replace('/\</', ' <', $text) ?? $text;
        /* Remove tags while "<" and ">" that belong to the text are still
           escaped as entities, and decode entities only afterwards.
           Decoding first turns text such as "a&lt;b" into "a<b", which
           strip_tags() then removes as if it were markup.
         */
        return html_entity_decode(strip_tags($text));
    }