viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/executables/Fetcher.php | |
src/library/UrlParser.php | |
src/library/processors/HtmlProcessor.php | |
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 49a9847f0..576564086 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2931,8 +2931,7 @@ class Fetcher implements CrawlConstants } } if (empty($site[self::LANG])) { - $lang = L\guessLocaleFromString( - $site[self::DESCRIPTION], C\DEFAULT_LOCALE); + $lang = L\guessLocaleFromString($site[self::DESCRIPTION]); } else { $lang = $site[self::LANG]; } diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index 8a64fde7a..e00dbf893 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -214,6 +214,7 @@ class UrlParser "tr" => 'tr', "tw" => 'zh-CN', "vi" => 'vi-VN', + "vn" => 'vi-VN', "cn" => 'zh-CN', ]; $host = self::getHost($url, false); diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php index 549df7bf3..9239876af 100755 --- a/src/library/processors/HtmlProcessor.php +++ b/src/library/processors/HtmlProcessor.php @@ -109,8 +109,9 @@ class HtmlProcessor extends TextProcessor $page = preg_replace('/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si', ' ', $page); $page = - preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page); - $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', + preg_replace('@<script[^>]*?>[\s\S]*?</script\s*>@si', ' ', + $page); + $dom_page = preg_replace('@<style[^>]*?>[\s\S]*?</style>@si', ' ', $page); $dom = self::dom($dom_page); if ($dom !== false ) { @@ -119,8 +120,8 @@ class HtmlProcessor extends TextProcessor if ($summary[self::TITLE] == "") { $summary[self::TITLE] = self::crudeTitle($dom_page); } - $summary[self::LANG] = self::lang($dom, - strip_tags($page), $url); + $summary[self::LANG] = self::lang($dom, strip_tags($dom_page), + $url); $description_dom = $dom; if (!empty($scraper)) { $scrape_results = ScraperManager::applyScraperRules( @@ -239,48 +240,49 @@ class HtmlProcessor extends TextProcessor '(\-[a-zA-Z][a-zA-Z])?)[\'|\"]?/', $item->nodeValue, $match)) { 
if (!empty($match[1])) { - return $match[1]; + $lang = $match[1]; + if ($lang != 'en' && $lang != 'en-US') { + return $lang; + } } } } } $htmls = $dom->getElementsByTagName("html"); - $lang = null; + $lang = (empty($lang)) ? null : $lang; foreach ($htmls as $html) { $lang = $html->getAttribute('lang'); - if ($lang != null) { + if ($lang != null && $lang != 'en' && $lang != 'en-US') { return $lang; } } - if ($lang == null) { - //baidu doesn't have a lang attribute but does say encoding - $xpath = new \DOMXPath($dom); - $charset_checks = ["contains(translate(@http-equiv,". - "'abcdefghijklmnopqrstuvwxyz'," . - " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-LANGUAGE')" => 0, - "contains(translate(@http-equiv,". - "'abcdefghijklmnopqrstuvwxyz'," . - " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-TYPE')" => 1]; - foreach ($charset_checks as $charset_check => $index) { - $metas = $xpath->evaluate("/html/head//meta[$charset_check]"); - $found_metas = []; - foreach ($metas as $meta) { - $content = $meta->getAttribute('content'); - $charset_metas = explode("=", $content); - if ($index == 0) { - return $charset_metas[$index]; - } - if (isset($charset_metas[$index])) { - $charset = strtoupper($charset_metas[$index]); - $lang = L\guessLangEncoding($charset); - if ($lang != 'en') { //default is en, so keep checking - return $lang; - } + //baidu doesn't have a lang attribute but does say encoding + $xpath = new \DOMXPath($dom); + $charset_checks = ["contains(translate(@http-equiv,". + "'abcdefghijklmnopqrstuvwxyz'," . + " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-LANGUAGE')" => 0, + "contains(translate(@http-equiv,". + "'abcdefghijklmnopqrstuvwxyz'," . 
+ " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-TYPE')" => 1]; + foreach ($charset_checks as $charset_check => $index) { + $metas = $xpath->evaluate("/html/head//meta[$charset_check]"); + $found_metas = []; + foreach ($metas as $meta) { + $content = $meta->getAttribute('content'); + $charset_metas = explode("=", $content); + if ($index == 0) { + return $charset_metas[$index]; + } + if (isset($charset_metas[$index])) { + $charset = strtoupper($charset_metas[$index]); + $lang = L\guessLangEncoding($charset); + if ($lang != 'en') { //default is en, so keep checking + return $lang; } } } - $lang = self::calculateLang($sample_text, $url); } + $lang = self::calculateLang($sample_text, $url); return $lang; } /**