diff --git a/src/library/ComputerVision.php b/src/library/ComputerVision.php new file mode 100644 index 000000000..91802bd8c --- /dev/null +++ b/src/library/ComputerVision.php @@ -0,0 +1,86 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2020 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + * + * END LICENSE + * + * + * @author Chris Pollett chris@pollett.org + * @license https://www.gnu.org/licenses/ GPL3 + * @link https://www.seekquarry.com/ + * @copyright 2009 - 2020 + * @filesource + */ +namespace seekquarry\yioop\library; + +use seekquarry\yioop\configs as C; + +/** + * For crawlHash + */ +require_once __DIR__ . "/Utility.php"; +/** + * To convert to Iso639-2 + */ +require_once __DIR__ . "/LocaleFunctions.php"; +/** + * + */ +class ComputerVision +{ + public static function ocrEnabled() + { + return C\nsdefined("TESSERACT"); + } + /** + * + */ + public static function recognizeText($image_path, + $langs = [C\DEFAULT_LOCALE]) + { + if (!C\nsdefined("TESSERACT")) { + return ""; + } + $temp_dir = C\CRAWL_DIR . "/temp/"; + if (!file_exists($temp_dir)) { + mkdir($temp_dir); + } + if (!file_exists($temp_dir)) { + return ""; + } + $image_file_name = pathinfo($image_path, PATHINFO_BASENAME); + $iso_string = ""; + $add = ""; + foreach ($langs as $lang) { + $iso_lang = localeTagToIso639_2Tag($lang); + $iso_string .= $add . $iso_lang; + $add = "+"; + } + $ocr_file = $temp_dir . $image_file_name . "-out"; + $ocr_exec = C\TESSERACT . " $image_path $ocr_file -l $iso_string"; + exec($ocr_exec); + $ocr_file .= ".txt"; + $ocr_string = file_exists($ocr_file) ? + file_get_contents($ocr_file) : ""; + @unlink($ocr_file); + return trim($ocr_string, " \t\n\r\0\x0B\x0C"); + } +} diff --git a/src/library/ContextWeightedNamedEntityRecognizer.php b/src/library/ContextWeightedNamedEntityRecognizer.php index 0790fdabc..e6b11d88e 100644 --- a/src/library/ContextWeightedNamedEntityRecognizer.php +++ b/src/library/ContextWeightedNamedEntityRecognizer.php @@ -26,7 +26,6 @@ * @copyright 2009 - 2019 * @filesource */ - namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; @@ -39,7 +38,6 @@ use seekquarry\yioop\locale\zh_CN\resources as ZH; * * @author Xianghong Sun */ - class ContextWeightedNamedEntityRecognizer { /** @@ -94,7 +92,7 @@ class ContextWeightedNamedEntityRecognizer $this->lang = $lang; } } - + /** * A function that process the trainning data * @param @mixed $text_files can be a file or an array of file names @@ -104,7 +102,7 @@ class ContextWeightedNamedEntityRecognizer * 我们/o 是/o 受到/o 郑振铎/nr 先生/o 、/o 阿英/nr 先生/o 著作/o 的/o * 启示/o ,/o 从/o 个人/o 条件/o 出发/o ,/o 瞄准/o 现代/o 出版/o 史/o * 研究/o 的/o 空白/o ,/o 重点/o 集/o 藏/o 解放区/o 、/o 国民党/nt 毁/o - * 禁/o 出版物/o 。/o + * 禁/o 出版物/o 。/o * To adapt to other language, some modifications are needed */ public static function processTexts($text_files, $term_tag_splier="/", @@ -123,16 +121,15 @@ class ContextWeightedNamedEntityRecognizer if (!count($word_tag_pairs)) { continue; } - $ret[]=[]; - $ret[count($ret)-1][0]=[]; - $ret[count($ret)-1][1]=[]; + $ret[] = []; + $ret[count($ret)-1][0] = []; + $ret[count($ret)-1][1] = []; foreach ($word_tag_pairs as $word_tag_pair) { $t = explode("/", $word_tag_pair); - //echo $word_tag_pair; - //print_r($t); if (count($t) == 2) { $tag = $tag_process ? $tag_process($t[1]) : $t[1]; - foreach(preg_split('//u', $t[0], null, PREG_SPLIT_NO_EMPTY) as $ch) { + foreach(preg_split('//u', $t[0], null, + PREG_SPLIT_NO_EMPTY) as $ch) { $ret[count($ret)-1][0][] = $term_process ? $term_process($ch) : $ch; $ret[count($ret)-1][1][] = $tag; @@ -145,21 +142,20 @@ class ContextWeightedNamedEntityRecognizer } return $ret; } - + /** * Function to train a data * Notice: This function might run very long time, depending on training set * @param @mixed $text_files are training data * can be a file or an array of file names - * @param @float $learning_rate + * @param @float $learning_rate * @param @int $max_epoch 1200 might be a good one, - the weight will overfit if it's greater than this number + * the weight will overfit if it's greater than this number * @param @function $term_process is a preporcess on term before training * @param @function $tag_process is a preporcess on tag before training */ - public function train($text_files, - $learning_rate=0.1, $max_epoch = 1200, - $term_process = null, $tag_process = null) + public function train($text_files, $learning_rate=0.1, $max_epoch = 1200, + $term_process = null, $tag_process = null) { if (is_string($text_files)) { $text_files = [$text_files]; @@ -192,14 +188,14 @@ class ContextWeightedNamedEntityRecognizer $this->tag_feature[$tags[$i-1]]=[]; } } else { - if (!isset($this->tag_feature[$tags[$i-2]."-".$tags[$i-1]])) { - $this->tag_feature[$tags[$i-2]."-".$tags[$i-1]]=[]; + if (!isset($this->tag_feature[$tags[$i-2] . "-" . + $tags[$i-1]])) { + $this->tag_feature[$tags[$i-2]."-".$tags[$i-1]] = []; } if (!isset($this->tag_feature[$tags[$i-1]])) { $this->tag_feature[$tags[$i-1]]=[]; } } - if (!isset($this->word_feature[$terms[$i]])) { $this->word_feature[$terms[$i]] = []; } @@ -234,7 +230,8 @@ class ContextWeightedNamedEntityRecognizer $cross_entropy_loss = 1; $pre_cross_entropy_loss = 2; for ($epoch = 0; ($epoch < $max_epoch) && - $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++) { + $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; + $epoch++) { $this->min_w=0; $this->max_w=0; $time = time(); @@ -243,10 +240,10 @@ class ContextWeightedNamedEntityRecognizer $pre_cross_entropy_loss = $cross_entropy_loss; $cross_entropy_loss = 0; $cross_entropy_loss_n = 0; - + $dy_db=[]; $dy_db_n=[]; - + $dy_dt=[]; $dy_dt_n=[]; for($i = 0; $i < count($this->tag_set); $i++) { @@ -298,11 +295,11 @@ class ContextWeightedNamedEntityRecognizer $dy_dw[$k[$j]][$j][$tag_index] = 0; $dy_dw_n[$k[$j]][$j][$tag_index] = 0; } - + $dy_dw[$k[$j]][$j][$tag_index] += ($sigmoid - $equality); $dy_dw_n[$k[$j]][$j][$tag_index] += 1; - + } //dy_dt if (!isset($dy_dt[$tf1])) { @@ -365,8 +362,8 @@ class ContextWeightedNamedEntityRecognizer } foreach ($dy_db as $k => $v) { $this->bias[$k]-= - $dy_db[$k] / - $dy_db_n[$k] * + $dy_db[$k] / + $dy_db_n[$k] * $learning_rate; } if ($epoch % 10 == 9 ) { @@ -411,7 +408,7 @@ class ContextWeightedNamedEntityRecognizer for ($j=-2; $j <=2; $j++) { $k=$this->getIndex($i+$j, $terms); if (isset($this->word_feature[$k])) { - $score[$possiable_tag] += + $score[$possiable_tag] += $this->getW($k,$j,$tag_index); } } @@ -452,7 +449,6 @@ class ContextWeightedNamedEntityRecognizer } return $ret; } - /** * A list of private helper functions * Given a setence ($term), find the key at position $index @@ -468,7 +464,7 @@ class ContextWeightedNamedEntityRecognizer } return $k; } - + /** * save the trained weight to disk */ @@ -533,7 +529,8 @@ class ContextWeightedNamedEntityRecognizer */ private function unpack_b() { - return array_merge(unpack("f".strval(count($this->tag_set)),$this->bias)); + return array_merge(unpack("f" . strval(count($this->tag_set)), + $this->bias)); } /** * Pack the tag_feature @@ -547,7 +544,8 @@ class ContextWeightedNamedEntityRecognizer */ private function unpack_t($key) { - return array_merge(unpack("f".strval(count($this->tag_set)),$this->tag_feature[$key])); + return array_merge(unpack("f".strval(count($this->tag_set)), + $this->tag_feature[$key])); } /** * Pack the word_feature diff --git a/src/library/ContextWeightedPosTagger.php b/src/library/ContextWeightedPosTagger.php index 140097b95..d49f252bd 100644 --- a/src/library/ContextWeightedPosTagger.php +++ b/src/library/ContextWeightedPosTagger.php @@ -26,7 +26,6 @@ * @copyright 2009 - 2019 * @filesource */ - namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; @@ -45,8 +44,8 @@ use seekquarry\yioop\locale\zh_CN\resources as ZH; * * @author Xianghong Sun */ - -class ContextWeightedPosTagger { +class ContextWeightedPosTagger +{ /** * Current Language, only tested on Simplified Chinese * Might be extensable for other languages in the furture @@ -72,7 +71,6 @@ class ContextWeightedPosTagger { */ private $min_w; private $max_w; - /** * All Possiable tag set * Generized by training method @@ -147,10 +145,9 @@ class ContextWeightedPosTagger { { $this->$var_name = $value; } - /** * check if the term can be determined by algorithm, - * usually by regualr expression, because there are infinity + * usually by regualr expression, because there are infinity * amount of them. * ex. 13th is an ordinal number, 123 is a cardinal number * then use the determined tag to be the weight key @@ -164,7 +161,7 @@ class ContextWeightedPosTagger { } return $term; } - + /** * A function that process the trainning data * @param @mixed $text_files can be a file or an array of file names @@ -196,11 +193,11 @@ class ContextWeightedPosTagger { $ret[count($ret)-1][1]=[]; foreach ($word_tag_pairs as $word_tag_pair) { $t = explode($term_tag_splier, $word_tag_pair); - + if (count($t) == 2) { $ret[count($ret)-1][0][] = $term_process ? $term_process($t[0]) : $t[0]; - $ret[count($ret)-1][1][] = + $ret[count($ret)-1][1][] = $tag_process ? $tag_process($t[1]) : $t[1]; } } @@ -210,22 +207,20 @@ class ContextWeightedPosTagger { } return $ret; } - /** - * Function to train a data - * Notice: This function might run very long time, depending on training set - * @param @mixed $text_files are training data - * can be a file or an array of file names - * @param @float $learning_rate - * @param @int $max_epoch 1200 might be a good one, - the weight will overfit if it's greater than this number - * @parama @bool $resume if true, read the weight file and continue training - if false, start from beginning - */ - public function train($text_files, $term_tag_splier="_", - $learning_rate=0.1, $max_epoch = 1200, - $term_process = null, $tag_process = null, - $resume=false) + * Function to train a data + * Notice: This function might run very long time, depending on training set + * @param @mixed $text_files are training data + * can be a file or an array of file names + * @param @float $learning_rate + * @param @int $max_epoch 1200 might be a good one, + * the weight will overfit if it's greater than this number + * @param @bool $resume if true, read the weight file and continue training + * if false, start from beginning + */ + public function train($text_files, $term_tag_splier="_", $learning_rate=0.1, + $max_epoch = 1200, $term_process = null, $tag_process = null, + $resume = false) { if (is_string($text_files)) { $text_files = [$text_files]; @@ -286,8 +281,8 @@ class ContextWeightedPosTagger { //train the weight $cross_entropy_loss = 1; $pre_cross_entropy_loss = 2; - for ($epoch = 0; $epoch < $max_epoch && - $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++) { + for ($epoch = 0; $epoch < $max_epoch && $pre_cross_entropy_loss - + $cross_entropy_loss > 0.000001; $epoch++) { $this->min_w=0; $this->max_w=0; $time = time(); @@ -296,7 +291,7 @@ class ContextWeightedPosTagger { $pre_cross_entropy_loss = $cross_entropy_loss; $cross_entropy_loss = 0; $cross_entropy_loss_n = 0; - + $dy_db=[]; $dy_db_n=[]; for($i = 0; $i < count($this->tag_set); $i++) { @@ -333,11 +328,11 @@ class ContextWeightedPosTagger { $dy_dw[$k[$j]][$j][$tag_index] = 0; $dy_dw_n[$k[$j]][$j][$tag_index] = 0; } - + $dy_dw[$k[$j]][$j][$tag_index] += ($sigmoid - $equality); $dy_dw_n[$k[$j]][$j][$tag_index] += 1; - + } //dy_db $dy_db[$tag_index] += ($sigmoid - $equality); @@ -351,7 +346,7 @@ class ContextWeightedPosTagger { } $cross_entropy_loss /= $cross_entropy_loss_n; $duration = time() - $time; - echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}". + echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}" . " Takes {$duration} seconds\n"; foreach ($dy_dw as $i =>$v1) { foreach ($v1 as $j =>$v2) { @@ -371,8 +366,8 @@ class ContextWeightedPosTagger { } foreach ($dy_db as $k =>$v) { $this->b[$k]-= - $dy_db[$k] / - $dy_db_n[$k] * + $dy_db[$k] / + $dy_db_n[$k] * $learning_rate; } if ($epoch % 10 == 9 ) { @@ -386,7 +381,7 @@ class ContextWeightedPosTagger { * The primary function to predit the tag * @param mixed $sentence is an array of segmented words/terms * or a string with words/terms seperated by space - * @return @array of tags + * @return @array of tags */ public function predict($sentence) { @@ -415,15 +410,16 @@ class ContextWeightedPosTagger { for ($j=-2; $j <=2; $j++) { $k=$this->getIndex($i+$j, $terms); if (isset($this->w[$k])) { - $score[$possiable_tag] += + $score[$possiable_tag] += $this->getW($k,$j,$tag_index); - } else if ($j==0&&in_array($possiable_tag,$this->rule_defined_key)) { + } else if ($j==0&&in_array($possiable_tag, + $this->rule_defined_key)) { $score[$possiable_tag] += $this->min_w; } } - + $score[$possiable_tag] += $this->getB($tag_index); - + //$score[$possiable_tag] // += 1 / (1 + exp(-1 * $score[$possiable_tag])); } @@ -435,11 +431,11 @@ class ContextWeightedPosTagger { * Wrap function for predict * @param $texts to be a @string of texts * @param $return_string is a boolean to determing if the user - * want it to out put to stdout or a return value + * want it to out put to stdout or a return value * @return @string if $return_string is true; - @boolean true otherwise + * @boolean true otherwise * e.g. 中国_NR 人民_NN 将_AD 满怀信心_VV - 地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU + * 地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU */ public function tag($texts, $return_string=false) { @@ -531,8 +527,8 @@ class ContextWeightedPosTagger { */ private function load_weight($trainning_load=false) { - $dic_file - = C\LOCALE_DIR . "/{$this->lang}/resources/pos_weight.txt.gz"; + $dic_file = C\LOCALE_DIR . + "/{$this->lang}/resources/pos_weight.txt.gz"; if (!file_exists($dic_file)) { echo "$dic_file does not exist!"; exit(); diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php index 21a1f5df3..b514c7c71 100755 --- a/src/library/LocaleFunctions.php +++ b/src/library/LocaleFunctions.php @@ -49,6 +49,21 @@ function localesWithStopwordsList() 'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th', 'vi-VN', 'zh-CN']; } +/** + * + */ +function localeTagToIso639_2Tag($locale_tag) +{ + $lang_map = ["ar" => "ara", "bn" => "ben", "de" => "deu", + "en" => "eng", "es" => "spa", "fa" => "fas", "fr" => "fra", + "he" => "heb", "hi" => "hin", "id" => "ind", "it" => "ita", + "ja" => "jpn+jpn_vert", "kn" => "kan", "ko" => "kor", "nl" => "nld", + "pl" => "pol", "pt" => "por", "ru" => "rus", "te" => "tel", + "th" => "tha", "tl"=> "tgl", "tr"=> "tur", "vi" => "vie", + "zh" => "chi_sim+chi_tra+chi_sim_vert+chi_tra_vert"]; + $lookup_tag = preg_split("/\-|\_/", $locale_tag)[0]; + return $lang_map[$lookup_tag] ?? C\DEFAULT_LOCALE; +} /** * Attempts to guess the user's locale based on the request, session, * and user-agent data diff --git a/src/library/Utility.php b/src/library/Utility.php index 96dbfc4c6..162b817d7 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -873,6 +873,21 @@ function toHexString($str) } return $out; } +/** + * Converts a string to string where each char has been replaced by a Integer + * equivalent + * + * @param string $str what we want rewritten in hex + * @return string the hexified string + */ +function toIntString($str) +{ + $out = ""; + for ($i = 0; $i < strlen($str); $i++) { + $out .= sprintf("%03u",ord($str[$i]))." "; + } + return $out; +} /** * Converts a string to string where each char has been replaced by its * binary equivalent diff --git a/src/library/processors/BmpProcessor.php b/src/library/processors/BmpProcessor.php index 728a3d81a..0c3370b1b 100644 --- a/src/library/processors/BmpProcessor.php +++ b/src/library/processors/BmpProcessor.php @@ -30,6 +30,8 @@ */ namespace seekquarry\yioop\library\processors; +use seekquarry\yioop\configs as C; +use seekquarry\yioop\library\ComputerVision; use seekquarry\yioop\library\UrlParser; /** @@ -94,7 +96,18 @@ class BmpProcessor extends ImageProcessor $this->addWidthHeightSummary($summary, $page); $summary[self::TITLE] = ""; $summary[self::DESCRIPTION] = - UrlParser::getDocumentFilename($url); + UrlParser::getDocumentFilename($url) . "\n"; + if (ComputerVision::ocrEnabled()) { + set_error_handler(null); + $temp_file = $this->saveTempFile($page, $url, "bmp"); + $lang = UrlParser::getLang($url); + $ocr_data = ComputerVision::recognizeText($temp_file, [$lang]); + if (!empty($ocr_data)) { + $summary[self::DESCRIPTION] .= $ocr_data; + } + @unlink($temp_file); + set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); + } $summary[self::LINKS] = []; $summary[self::PAGE] = "<html><body><div><img src='data:image/bmp;base64," . diff --git a/src/library/processors/GifProcessor.php b/src/library/processors/GifProcessor.php index deca01e07..7fb5f926b 100755 --- a/src/library/processors/GifProcessor.php +++ b/src/library/processors/GifProcessor.php @@ -31,6 +31,7 @@ namespace seekquarry\yioop\library\processors; use seekquarry\yioop\configs as C; +use seekquarry\yioop\library\ComputerVision; use seekquarry\yioop\library\UrlParser; /** @@ -80,15 +81,23 @@ class GifProcessor extends ImageProcessor $summary = []; $this->addWidthHeightSummary($summary, $page); $summary[self::TITLE] = ""; - $summary[self::DESCRIPTION] = + $summary[self::DESCRIPTION] = UrlParser::getDocumentFilename($url) + . "\n"; + if (ComputerVision::ocrEnabled()) { + set_error_handler(null); + $temp_file = $this->saveTempFile($page, $url, "gif"); + $lang = UrlParser::getLang($url); + $ocr_data = ComputerVision::recognizeText($temp_file, [$lang]); + if (!empty($ocr_data)) { + $summary[self::DESCRIPTION] .= $ocr_data; + } + @unlink($temp_file); + set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); + } $xmp_data = $this->getXmpData($page); if ($xmp_data) { - $summary[self::DESCRIPTION] = - UrlParser::getDocumentFilename($url) . "\nXMP Data\n". + $summary[self::DESCRIPTION] .= "\nXMP Data\n". $xmp_data; - } else { - $summary[self::DESCRIPTION] = - UrlParser::getDocumentFilename($url); } $summary[self::LINKS] = []; $summary[self::PAGE] = diff --git a/src/library/processors/ImageProcessor.php b/src/library/processors/ImageProcessor.php index 8bf0ee370..3f54d9b1e 100755 --- a/src/library/processors/ImageProcessor.php +++ b/src/library/processors/ImageProcessor.php @@ -57,6 +57,22 @@ class ImageProcessor extends PageProcessor { return null; } + /** + * + */ + public function saveTempFile($page, $url, $file_extension) + { + $temp_dir = C\CRAWL_DIR . "/temp/"; + if (!file_exists($temp_dir)) { + mkdir($temp_dir); + } + if (!file_exists($temp_dir)) { + return null; + } + $temp_file = $temp_dir . L\crawlHash($url) . ".$file_extension"; + file_put_contents($temp_file, $page); + return $temp_file; + } /** * Given an $image_string determines if possible its width and height * then assigns the values into the CrawlConstants:WIDTH, diff --git a/src/library/processors/JpgProcessor.php b/src/library/processors/JpgProcessor.php index b5fb21a6a..84d753d31 100755 --- a/src/library/processors/JpgProcessor.php +++ b/src/library/processors/JpgProcessor.php @@ -32,6 +32,7 @@ namespace seekquarry\yioop\library\processors; use seekquarry\yioop\configs as C; use seekquarry\yioop\library as L; +use seekquarry\yioop\library\ComputerVision; use seekquarry\yioop\library\UrlParser; ini_set("gd.jpeg_ignore_warning", 1); @@ -85,19 +86,24 @@ class JpgProcessor extends ImageProcessor $this->addWidthHeightSummary($summary, $page); $summary[self::TITLE] = ""; $file_name = UrlParser::getDocumentFilename($url); - if (function_exists("exif_read_data")) { - $temp_dir = C\CRAWL_DIR . "/temp/"; - if (!file_exists($temp_dir)) { - mkdir($temp_dir); - } - if (!file_exists($temp_dir)) { - return null; + $summary[self::DESCRIPTION] = $file_name . "\n"; + if (ComputerVision::ocrEnabled()) { + set_error_handler(null); + $temp_file = $this->saveTempFile($page, $url, "jpg"); + $lang = UrlParser::getLang($url); + $ocr_data = ComputerVision::recognizeText($temp_file, [$lang]); + if (!empty($ocr_data)) { + $summary[self::DESCRIPTION] .= $ocr_data; } - $temp_file = $temp_dir . L\crawlHash($url) . ".jpg"; - file_put_contents($temp_file, $page); + @unlink($temp_file); + set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); + } + if (function_exists("exif_read_data")) { set_error_handler(null); - $summary[self::DESCRIPTION] = "$file_name\nEXIF DATA\n". + $temp_file = $this->saveTempFile($page, $url, "jpg"); + $summary[self::DESCRIPTION] .= "\nEXIF DATA\n". print_r(@exif_read_data($temp_file), true); + @unlink($temp_file); set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); } else { $summary[self::DESCRIPTION] = $file_name; diff --git a/src/library/processors/LocaleFunctions.php b/src/library/processors/LocaleFunctions.php deleted file mode 100644 index 012b50801..000000000 --- a/src/library/processors/LocaleFunctions.php +++ /dev/null @@ -1,541 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009 - 2020 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. - * - * END LICENSE - * - * This file contains global functions connected to localization that - * are used throughout the web site part of Yioop! - * - * @author Chris Pollett chris@pollett.org - * @license https://www.gnu.org/licenses/ GPL3 - * @link https://www.seekquarry.com/ - * @copyright 2009 - 2020 - * @filesource - */ -namespace seekquarry\yioop\library; - -use seekquarry\yioop\configs as C; -use seekquarry\yioop\models\LocaleModel; - -/** For Yioop global defines */ -require_once __DIR__."/../configs/Config.php"; -/** - * Attempts to guess the user's locale based on the request, session, - * and user-agent data - * - * @return string IANA language tag of the guessed locale - */ -function guessLocale() -{ - /* the request variable l and the browser's HTTP_ACCEPT_LANGUAGE - are used to determine the locale */ - if (isset($_SERVER['HTTP_ACCEPT_LANGUAGE'])) { - $l_parts = explode(",", $_SERVER['HTTP_ACCEPT_LANGUAGE']); - if (count($l_parts) > 0) { - $guess_l = $l_parts[0]; - } - $guess_map = [ - "cn" => "zh-CN", - "en" => "en-US", - "en-us" => "en-US", - "en-US" => "en-US", - "fr" => "fr-FR", - "ko" => "ko", - "in" => "in-ID", - "ja" => "ja", - "vi" => "vi-VN", - "vi-vn" => "vi-VN", - "vi-VN" => "vi-VN", - "zh" => "zh-CN", - "zh-CN" => "zh-CN", - "zh-cn" => "zh-CN", - ]; - if (isset($guess_map[$guess_l])) { - $guess_l = $guess_map[$guess_l]; - } - } - if (isset($_SESSION['l']) || isset($_REQUEST['l']) || isset($guess_l)) { - $l = (isset($_REQUEST['l'])) ? $_REQUEST['l'] : - ((isset($_SESSION['l'])) ? $_SESSION['l'] : $guess_l); - if (strlen($l) < 10) { - $l = addslashes($l); - if (is_dir(C\LOCALE_DIR . "/" . str_replace("-", "_", $l))) { - $locale_tag = $l; - } - } - } - if (!isset($locale_tag)) { - $locale_tag = C\DEFAULT_LOCALE; - } - return $locale_tag; -} -/** - * Attempts to guess the user's locale based on a string sample - * - * @param string $phrase_string used to make guess - * @param string $locale_tag language tag to use if can't guess -- if not - * provided uses current locale's value - * @param int threshold number of chars to guess a particular encoding - * @return string IANA language tag of the guessed locale - - */ -function guessLocaleFromString($phrase_string, $locale_tag = null) -{ - $original_phrase_string = mb_substr($phrase_string, 0, - C\AD_HOC_TITLE_LENGTH); - $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag; - $sub = C\PUNCT . "|[0-9]|\s"; - $phrase_string = preg_replace('/' . $sub . '/', "", $phrase_string); - $phrase_string = mb_convert_encoding($phrase_string, "UTF-32", "UTF-8"); - $len = strlen($phrase_string); - $guess = ['ar' => 0, 'he' => 0, 'hi' => 0, 'ko' => 0, 'ja' => 0, 'ru' => 0, - 'th' => 0, 'zh-CN' => 0]; - $guess[$locale_tag] = 1; - for ($i = 0; $i < $len; $i += 4) { - $start = ord($phrase_string[$i+2]); - $next = ord($phrase_string[$i+3]); - if ($start >= 6 && $start <= 7) { - if ($locale_tag == "fa") { - $guess[$locale_tag] +=2; - } else { - $guess['ar'] += 2; - } - } else if ($start == 5 && $next >= 144) { - $guess['he'] += 2; - } else if (($start == 9 && $next < 128) || ($start == 168 && - $next >= 224)) { - $guess['hi'] += 2; - } else if ($start == 17 || $start >= 172 && $start < 215) { - $guess['ko'] += 2; - } else if ($start >= 48 && $start <= 49) { - $guess['ja'] += 3; - } else if ($start == 4 || ($start == 5 && $next < 48)) { - $guess['ru']++; - } else if ($start == 14 && $next < 128) { - $guess['th'] += 2; - } else if ($start >= 78 && $start <= 159) { - $guess['zh-CN'] += 4; - } else if ($start == 0 && $next < 128) { - $guess[$locale_tag]++; // assume ascii is from $locale_tag - } - } - $num_points = ($len / 4) - 1; //there will be a lead and tail space - $max = $guess[$locale_tag]; - if ($num_points >= 0 ) { - foreach ($guess as $tag => $cnt) { - if ($cnt >= $num_points && $cnt > $max) { - $locale_tag = $tag; - $max = $cnt; - break; - } - } - } - if ($locale_tag == 'en-US') { - $locale_tag = checkQuery($original_phrase_string); - } - return $locale_tag; -} -/** - * Tries to find wether query belongs to a programming language - * - * @param string $query query entered by user - * - * @return string $lang programming language for the the query provided - */ -function checkQuery($query) -{ - $programming_language_map = ['java:' => 'java', 'python:' => 'py']; - $control_word = "/^(java:|python:)/"; - $position = preg_match($control_word, trim($query), - $matches, PREG_OFFSET_CAPTURE); - if (isset($matches[0][0])) { - $matched_word = $matches[0][0]; - if (isset($programming_language_map[$matched_word])) { - $lang = $programming_language_map[$matched_word]; - } else { - $lang = 'en-US'; - } - } else { - $lang = 'en-US'; - } - return $lang; -} -/** - * Tries to guess at a language tag based on the name of a character - * encoding - * - * @param string $encoding a character encoding name - * - * @return string guessed language tag - */ -function guessLangEncoding($encoding) -{ - $lang = ["EUC-JP", "Shift_JIS", "JIS", "ISO-2022-JP"]; - if (in_array($encoding, $lang)) { - return "ja"; - } - $lang = ["EUC-CN", "GBK", "GB2312", "EUC-TW", "HZ", "CP936", - "BIG-5", "CP950"]; - if (in_array($encoding, $lang)) { - return "zh-CN"; - } - $lang = ["EUC-KR", "UHC", "CP949", "ISO-2022-KR"]; - if (in_array($encoding, $lang)) { - return "ko"; - } - $lang = ["Windows-1251", "CP1251", "CP866", "IBM866", "KOI8-R"]; - if (in_array($encoding, $lang)) { - return "ru"; - } - return 'en'; -} -/** - * Tries to guess the encoding used for an Html document - * - * @param string $html a character encoding name - * @param string $return_loc_info if meta http-equiv info was used to - * find the encoding, then if $return_loc_info is true, we - * return the location of charset substring. This allows converting to - * UTF-8 later so cached pages will display correctly and - * redirects without char encoding won't be given a different hash. - * - * @return mixed either string or array if string then guessed encoding, - * if array guessed encoding, start_pos of where charset info came from, - * length - */ -function guessEncodingHtmlXml($html, $return_loc_info = false) -{ - // first try for XML encoding info - preg_match("/\<\?xml[^\?]+encoding\=[\'\"](.+)[\'\"][^\?]+\?\>/",$html, - $matches, PREG_OFFSET_CAPTURE); - if (!empty($matches[1][1])) { - $encoding = strtoupper($matches[1][0]); - $start_charset = $matches[1][1]; - $len_c = strlen($encoding); - if ($return_loc_info) { - return [$encoding, $start_charset, $len_c]; - } - return $encoding; - } - /* - If the doc is HTML and it uses a http-equiv to set the encoding - then we override what the server says (if anything). As we - are going to convert to UTF-8 we remove the charset info - from the meta tag so cached pages will display correctly and - redirects without char encoding won't be given a different hash. - */ - $end_head = stripos($html, "</head"); - if ($end_head) { - $reg = "/charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?/iu"; - $is_match = preg_match($reg, $html, $match); - if (!$is_match) { - $reg = "charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?"; - mb_regex_encoding("UTF-8"); - mb_ereg_search_init($html); - mb_ereg_search($reg, "i"); - $match = mb_ereg_search_getregs(); - if (isset($match[0])) { - $is_match = true; - } - } - if ($is_match && isset($match[6])) { - $len_c = strlen($match[0]); - if (($match[6] == "'" || $match[6] == '"') && - $match[3] != $match[6]) { - $len_c--; - } - $start_charset = strpos($html, $match[0]); - if ($start_charset + $len_c < $end_head) { - if (isset($match[4])) { - $encoding = strtoupper($match[4]); - if ($return_loc_info) { - return [$encoding, $start_charset, $len_c]; - } - return $encoding; - } - } - } - } - return mb_detect_encoding($html, 'auto'); -} -/** - * Converts page data in a site associative array to UTF-8 if it is not - * already in UTF-8 - * - * @param array& $site an associative of info about a web site - * @param string $page_field the field in the associative array that - * contains the $site's web page as a string. - * @param string $encoding_field the field in the associative array that - * contains the character encoding the page is currently in - * @param function $log_function a callback function used to write log - * messages with, if desired. - */ -function convertUtf8IfNeeded(&$site, $page_field, $encoding_field, - $log_function = "") -{ - if ($log_function == "") { - $log_function = function($msg) { - }; - } - if (empty($site[$encoding_field])) { - $site[$encoding_field] = guessEncodingHtmlXml($site[$page_field]); - } - if (!empty($site[$encoding_field]) && $site[$encoding_field] != "UTF-8") { - set_error_handler(null); - if (!@mb_check_encoding($site[$page_field], - $site[$encoding_field])) { - $log_function(" MB_CHECK_ENCODING FAILED!!"); - } - $log_function(" Converting from encoding ". - $site[$encoding_field]."..."); - //if HEBREW WINDOWS-1255 use ISO-8859 instead - if (stristr($site[$encoding_field], "1255")) { - $site[$encoding_field]= "ISO-8859-8"; - $log_function(" using encoding " . $site[$encoding_field]."..."); - } - if (stristr($site[$encoding_field], "1256")) { - $site[$page_field] = w1256ToUTF8($site[$page_field]); - $log_function(" using Yioop hack encoding ..."); - } else { - $site[$page_field] = @mb_convert_encoding($site[$page_field], - "UTF-8", $site[$encoding_field]); - } - set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); - } else if (!empty($site[$encoding_field]) && - $site[$encoding_field] == "UTF-8") { - $log_function(" UTF-8 data detected!"); - } -} -/** - * Translate the supplied arguments into the current locale. - * This function takes a variable number of arguments. The first - * being an identifier to translate. Additional arguments - * are used to interpolate values in for %s's in the translation. - * - * @param string string_identifier identifier to be translated - * @param mixed additional_args used for interpolation in translated string - * @return string translated string - */ -function tl() -{ - $locale = LocaleModel::$current_locale; - if (!is_object($locale)) { - return false; - } - $args = func_get_args(); - $translation = $locale->translate($args); - if (!trim($translation)) { - $translation = $args[0]; - } - return $translation; -} -/** - * Sets the language to be used for locale settings - * - * @param string $locale_tag the tag of the language to use to determine - * locale settings - */ -function setLocaleObject($locale_tag) -{ - $locale_model = C\NS_MODELS . "LocaleModel"; - $locale = new $locale_model(); - $locale->initialize($locale_tag); - LocaleModel::$current_locale = $locale; -} -/** - * Gets the language tag (for instance, en_US for American English) of the - * locale that is currently being used. This function has the side - * effect of setting Yioop's current locale. - * - * @return string the tag of the language currently being used for locale - * settings - */ -function getLocaleTag() -{ - $locale = LocaleModel::$current_locale; - if (!$locale) { - $locale_tag = guessLocale(); - setLocaleObject($locale_tag); - return $locale_tag; - } - return $locale->getLocaleTag(); -} -/** - * Returns the current language directions. - * - * @return string ltr or rtl depending on if the language is left-to-right - * or right-to-left - */ -function getLocaleDirection() -{ - $locale = LocaleModel::$current_locale; - return $locale->getLocaleDirection(); -} -/** - * Returns the query statistics info for the current llocalt. - * - * @return array consisting of queries and elapses times for locale computations - */ -function getLocaleQueryStatistics() -{ - $locale = LocaleModel::$current_locale; - $query_info = []; - $query_info['QUERY_LOG'] = $locale->db->query_log; - $query_info['TOTAL_ELAPSED_TIME'] = $locale->db->total_time; - return $query_info; -} -/** - * Returns the current locales method of writing blocks (things like divs or - * paragraphs).A language like English puts blocks one after another from the - * top of the page to the bottom. Other languages like classical Chinese list - * them from right to left. - * - * @return string tb lr rl depending on the current locales block progression - */ -function getBlockProgression() -{ - $locale = LocaleModel::$current_locale; - return $locale->getBlockProgression(); - -} -/** - * Returns the writing mode of the current locale. This is a combination of the - * locale direction and the block progression. For instance, for English the - * writing mode is lr-tb (left-to-right top-to-bottom). - * - * @return string the locales writing mode - */ -function getWritingMode() -{ - $locale = LocaleModel::$current_locale; - return $locale->getWritingMode(); - -} -/** - * Convert the string $str encoded in Windows-1256 into UTF-8 - * - * @param string $str Windows-1256 string to convert - * @return string the UTF-8 equivalent - */ -function w1256ToUTF8($str) -{ - static $conv = [ - 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, - 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001A, - 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, 0x0020, 0x0021, 0x0022, 0x0023, - 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, - 0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, - 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, - 0x003F, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, - 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, - 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, - 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, 0x0060, 0x0061, 0x0062, - 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, - 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, - 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, - 0x007E, 0x007F, 0x20AC, 0x067E, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, - 0x2021, 0x02C6, 0x2030, 0x0679, 0x2039, 0x0152, 0x0686, 0x0698, 0x0688, - 0x06AF, 0x2018, 0x2020, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x06A9, - 0x2122, 0x0691, 0x203A, 0x0153, 0x200C, 0x200D, 0x06BA, 0x00A0, 0x060C, - 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x06BE, - 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, 0x00B0, 0x00B1, 0x00B2, 0x00B3, - 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x061B, 0x00BB, 0x00BC, - 0x00BD, 0x00BE, 0x061F, 0x06C1, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, - 0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, - 0x062F, 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x00D7, - 0x0637, 0x0638, 0x0639, 0x063A, 0x0640, 0x0641, 0x0642, 0x0643, 0x00E0, - 0x0644, 0x00E2, 0x0645, 0x0646, 0x0647, 0x0648, 0x00E7, 0x00E8, 0x00E9, - 0x00EA, 0x00EB, 0x0649, 0x064A, 0x00EE, 0x00EF, 0x064B, 0x064C, 0x064D, - 0x064E, 0x00F4, 0x064F, 0x0650, 0x00F7, 0x0651, 0x00F9, 0x0652, 0x00FB, - 0x00FC, 0x200E, 0x200F, 0x06D2 - ]; - $len = strlen($str); - $out = ""; - for ($i = 0; $i < $len; $i++) { - $out .= utf8chr($conv[ord($str[$i])]); - } - return $out; -} -/** - * Given a unicode codepoint convert it to UTF-8 - * - * @param int $code the codepoint to convert - * @return string the corresponding UTF-8 string - */ -function utf8chr($code) -{ - if ($code <= 0x7F) - return chr($code); - if ($code <= 0x7FF) - return pack("C*", ($code >> 6)+192, ($code & 63) + 128); - if ($code <= 0xFFFF) - return pack("C*", ($code >> 12)+224, (($code>>6) & 63) + 128, - ($code&63)+128); - if ($code <= 0x1FFFFF) - return pack("C*", ($code >> 18) + 240, (($code >> 12) & 63) + 128, - (($code >> 6) & 63) + 128, ($code & 63) + 128); - return ''; -} -/** - * Function for formatting a date string based on the locale. - * @param $timestamp is the crawl time - * @param $locale_tag is the tag for locale - * @return string formatted date string - */ -function formatDateByLocale($timestamp, $locale_tag) -{ - switch ($locale_tag) { - case 'de': - setlocale(LC_ALL,'deu'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'en-US': - setlocale(LC_ALL,'enu'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'es': - setlocale(LC_ALL,'esp'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'fr-FR': - setlocale(LC_ALL,'fra'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'it': - setlocale(LC_ALL,'ita'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'ja': - setlocale(LC_ALL,'jpn'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'ko': - setlocale(LC_ALL,'kor'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'pl': - setlocale(LC_ALL,'plk'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'ru': - setlocale(LC_ALL,'rus'); - return strftime("%B %d %Y %H:%M",$timestamp); - case 'tr': - setlocale(LC_ALL,'trk'); - return strftime("%B %d %Y %H:%M",$timestamp); - default: - return date("F d Y H:i", intval($timestamp)); - } -} diff --git a/src/library/processors/PdfProcessor.php b/src/library/processors/PdfProcessor.php index 9038a117e..551b17513 100755 --- a/src/library/processors/PdfProcessor.php +++ b/src/library/processors/PdfProcessor.php @@ -32,6 +32,9 @@ namespace seekquarry\yioop\library\processors; use seekquarry\yioop\configs as C; use seekquarry\yioop\Library as L; +use seekquarry\yioop\library\ComputerVision; +use seekquarry\yioop\library\UrlParser; + /** * Used to create crawl summary information * for PDF files @@ -77,7 +80,7 @@ class PdfProcessor extends TextProcessor $text = ""; if (is_string($page)) { list($encoding, $title) = self::getEncodingTitle($page); - $text = self::getText($page, $encoding); + $text = self::getText($page, $url, $encoding); } if ($text == "") { $text = $url; @@ -126,12 +129,14 @@ class PdfProcessor extends TextProcessor * Gets the text out of a PDF document * * @param string $pdf_string a string representing the PDF document + * @param $url the url where the page contents came from, + * used to canonicalize relative links * @param string $encoding which of the default (if any) PDF encoding * formats is being used: MacRomanEncoding, WinAnsiEncoding, * PDFDocEncoding, etc. * @return string text extracted from the document */ - public static function getText($pdf_string, $encoding = "") + public static function getText($pdf_string, $url, $encoding = "") { $len = strlen($pdf_string); $cur_pos = 0; @@ -139,11 +144,84 @@ class PdfProcessor extends TextProcessor $i = 0; set_error_handler(null); $state = "text"; + $temp_dir = C\CRAWL_DIR . "/temp/"; + if (!file_exists($temp_dir)) { + mkdir($temp_dir); + } + if (!file_exists($temp_dir)) { + return null; + } + $lang = UrlParser::getLang($url); while($cur_pos < $len) { list($cur_pos, $object_string) = self::getNextObject($pdf_string, $cur_pos); $object_dictionary = self::getObjectDictionary($object_string); - if (self::objectDictionaryHas( + if (ComputerVision::ocrEnabled() && + self::objectDictionaryHas($object_dictionary, ["Image"]) && + self::objectDictionaryHas($object_dictionary, ["XObject"]) && + self::objectDictionaryHas($object_dictionary, ["Width"]) && + self::objectDictionaryHas($object_dictionary, ["Height"]) && + !self::objectDictionaryHas($object_dictionary, ["ImageMask"])) { + $stream_data = ltrim(self::getObjectStream($object_string)); + preg_match("/\/Width\s+(\d+)\b/", $object_dictionary, $matches); + $width = $matches[1] ?? 0; + preg_match("/\/Height\s+(\d+)\b/", $object_dictionary, + $matches); + $height = $matches[1] ?? 0; + preg_match("/\/BitsPerComponent\s+(\d+)\b/", $object_dictionary, + $matches); + $bits_per_component = $matches[1] ?? 8; + preg_match("/\/ColorSpace\s+(Device)?(Gray|RGB|CMYK)\b/", + $object_dictionary, $matches); + $color_space = $matches[2] ?? "RGB"; + $is_jpeg = preg_match("/\/Filter\s+\/DCTDecode\b/", + $object_dictionary); + if (!$width || !$height || $color_space == "CMYK") { + continue; + } + $is_rgb = ($color_space == "RGB"); + if (self::objectDictionaryHas($object_dictionary, + ["FlateDecode"])) { + $stream_data = @gzuncompress($stream_data); + } + if ($is_jpeg) { + $image = imagecreatefromstring($stream_data); + } else { + $image = imagecreatetruecolor($width, $height); + $pix_loc = 0; + for($y = 0; $y < $height; $y++) { + for($x = 0; $x < $width; $x++) { + if ($is_rgb) { + $r = empty($stream_data[$pix_loc]) ? 255 : + ord($stream_data[$pix_loc]); + $g = empty($stream_data[$pix_loc + 1]) ? 255 : + ord($stream_data[$pix_loc + 1]); + $b = empty($stream_data[$pix_loc + 2]) ? 255 : + ord($stream_data[$pix_loc + 2]); + $pix_loc += 3; + } else { + $r = empty($stream_data[$pix_loc]) ? 255 : + ord($stream_data[$pix_loc]); + $g = $r; + $b = $r; + $pix_loc++; + } + $color = imagecolorallocate($image, $r, $g, $b); + imagesetpixel($image, $x, $y, $color); + } + } + } + $temp_file = $temp_dir . L\crawlHash($stream_data) . ".png"; + if ($image) { + imagepng($image, $temp_file); + $ocr_data = ComputerVision::recognizeText($temp_file, + [$lang]); + if (!empty($ocr_data)) { + $out .= $ocr_data; + } + @unlink($temp_file); + } + } else if (self::objectDictionaryHas( $object_dictionary, ["Type", "Font", "FontDescriptor"])) { $state = "font"; continue; diff --git a/src/library/processors/PngProcessor.php b/src/library/processors/PngProcessor.php index a8145ad84..264d04ce7 100755 --- a/src/library/processors/PngProcessor.php +++ b/src/library/processors/PngProcessor.php @@ -31,6 +31,7 @@ namespace seekquarry\yioop\library\processors; use seekquarry\yioop\configs as C; +use seekquarry\yioop\library\ComputerVision; use seekquarry\yioop\library\UrlParser; /** @@ -80,14 +81,23 @@ class PngProcessor extends ImageProcessor $summary = []; $this->addWidthHeightSummary($summary, $page); $summary[self::TITLE] = ""; + $summary[self::DESCRIPTION] = UrlParser::getDocumentFilename($url) + . "\n"; + if (ComputerVision::ocrEnabled()) { + set_error_handler(null); + $temp_file = $this->saveTempFile($page, $url, "png"); + $lang = UrlParser::getLang($url); + $ocr_data = ComputerVision::recognizeText($temp_file, [$lang]); + if (!empty($ocr_data)) { + $summary[self::DESCRIPTION] .= $ocr_data; + } + @unlink($temp_file); + set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); + } $xmp_data = $this->getXmpData($page); if ($xmp_data) { - $summary[self::DESCRIPTION] = - UrlParser::getDocumentFilename($url) . "\nXMP Data\n". + $summary[self::DESCRIPTION] .= "\nXMP Data\n". $xmp_data; - } else { - $summary[self::DESCRIPTION] = - UrlParser::getDocumentFilename($url); } $summary[self::LINKS] = []; $summary[self::PAGE] = diff --git a/src/locale/zh_CN/resources/term_weight.txt.gz b/src/locale/zh_CN/resources/term_weight.txt.gz old mode 100644 new mode 100755 index 12823fa77..5f0c753af Binary files a/src/locale/zh_CN/resources/term_weight.txt.gz and b/src/locale/zh_CN/resources/term_weight.txt.gz differ diff --git a/tests/PdfProcessorTest.php b/tests/PdfProcessorTest.php index 81cedaa4b..fe25bd561 100644 --- a/tests/PdfProcessorTest.php +++ b/tests/PdfProcessorTest.php @@ -32,6 +32,7 @@ namespace seekquarry\yioop\tests; use seekquarry\yioop\configs as C; use seekquarry\yioop\library as L; +use seekquarry\yioop\library\ComputerVision; use seekquarry\yioop\library\CrawlConstants; use seekquarry\yioop\library\processors\PdfProcessor; use seekquarry\yioop\library\UnitTest; @@ -51,12 +52,6 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants */ public function setUp() { - $pdf_object = new PdfProcessor(); - $url = "http://www.yioop.com/test.pdf"; - $filename = C\PARENT_DIR . "/tests/test_files/test.pdf"; - $page = file_get_contents($filename); - $summary = $pdf_object->process($page, $url); - $this->test_objects['summary'] = $summary; } /** * Delete any files associated with our test on PdfProcessor (in this case @@ -71,8 +66,12 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants */ public function wordExtractionTestCase() { - $words = explode(" ", - $this->test_objects['summary'][self::DESCRIPTION]); + $pdf_object = new PdfProcessor(); + $url = "http://www.yioop.com/test.pdf"; + $filename = C\PARENT_DIR . "/tests/test_files/test.pdf"; + $page = file_get_contents($filename); + $summary = $pdf_object->process($page, $url); + $words = explode(" ", $summary[self::DESCRIPTION]); $this->assertTrue(in_array("Documentation", $words), "Word Extraction 1"); $this->assertTrue(in_array("Yioop", $words), @@ -80,4 +79,24 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants $this->assertTrue(in_array("Open", $words), "Word Extraction 3"); } + /** + * + */ + public function textFromImageTestCase() + { + if (ComputerVision::ocrEnabled()) { + $pdf_object = new PdfProcessor(); + $url = "http://www.yioop.com/test2.pdf"; + $filename = C\PARENT_DIR . "/tests/test_files/test2.pdf"; + $page = file_get_contents($filename); + $summary = $pdf_object->process($page, $url); + $words = explode(" ", $summary[self::DESCRIPTION]); + $this->assertTrue(in_array("Maureen", $words), + "Word From Image Extraction 1"); + $this->assertTrue(in_array("Phantom", $words), + "Word From Image Extraction 2"); + $this->assertTrue(in_array("playing", $words), + "Word From Image Extraction 3"); + } + } } diff --git a/tests/test_files/test2.pdf b/tests/test_files/test2.pdf new file mode 100644 index 000000000..006ea6c52 Binary files /dev/null and b/tests/test_files/test2.pdf differ