Adds tesseract support for ocr, a=chris

Chris Pollett [2020-04-07, 20h]

Adds tesseract support for ocr, a=chris

Filename
src/library/ComputerVision.php
src/library/ContextWeightedNamedEntityRecognizer.php
src/library/ContextWeightedPosTagger.php
src/library/LocaleFunctions.php
src/library/Utility.php
src/library/processors/BmpProcessor.php
src/library/processors/GifProcessor.php
src/library/processors/ImageProcessor.php
src/library/processors/JpgProcessor.php
src/library/processors/LocaleFunctions.php
src/library/processors/PdfProcessor.php
src/library/processors/PngProcessor.php
src/locale/zh_CN/resources/term_weight.txt.gz
tests/PdfProcessorTest.php
tests/test_files/test2.pdf

diff --git a/src/library/ComputerVision.php b/src/library/ComputerVision.php
new file mode 100644
index 000000000..91802bd8c
--- /dev/null
+++ b/src/library/ComputerVision.php
@@ -0,0 +1,110 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2020
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * For crawlHash
+ */
+require_once __DIR__ . "/Utility.php";
+/**
+ * To convert to Iso639-2
+ */
+require_once __DIR__ . "/LocaleFunctions.php";
+/**
+ * Collection of static helpers for extracting information from images.
+ * Currently the only capability is optical character recognition (OCR),
+ * done by shelling out to the program named by the TESSERACT constant.
+ */
+class ComputerVision
+{
+    /**
+     * Whether OCR can be attempted, that is, whether the TESSERACT
+     * constant giving the command used to run tesseract has been defined
+     *
+     * @return bool true if the TESSERACT constant is defined
+     */
+    public static function ocrEnabled()
+    {
+        return C\nsdefined("TESSERACT");
+    }
+    /**
+     * Runs tesseract on an image file and returns the text it recognizes
+     *
+     * Tesseract is asked to write its output to a file in the
+     * CRAWL_DIR/temp folder; that file is read, deleted, and its
+     * whitespace-trimmed contents returned.
+     *
+     * @param string $image_path path of the image file to run OCR on
+     * @param array $langs locale tags of the languages the image's text
+     *  might be in; each is converted with localeTagToIso639_2Tag and
+     *  the results joined with + to make tesseract's -l argument
+     * @return string the recognized text; "" if OCR is not enabled, no
+     *  image path was given, the temp folder could not be created, or
+     *  tesseract produced no output file
+     */
+    public static function recognizeText($image_path,
+        $langs = [C\DEFAULT_LOCALE])
+    {
+        if (!self::ocrEnabled() || !is_string($image_path) ||
+            $image_path === "") {
+            return "";
+        }
+        $temp_dir = C\CRAWL_DIR . "/temp/";
+        if (!file_exists($temp_dir)) {
+            mkdir($temp_dir);
+        }
+        if (!file_exists($temp_dir)) {
+            return "";
+        }
+        $iso_langs = [];
+        foreach ($langs as $lang) {
+            $iso_langs[] = localeTagToIso639_2Tag($lang);
+        }
+        $ocr_file = $temp_dir . pathinfo($image_path, PATHINFO_BASENAME) .
+            "-out";
+        /* Each argument is shell-escaped so a path containing spaces or
+           shell metacharacters reaches tesseract as a single argument
+         */
+        $ocr_exec = C\TESSERACT . " " . escapeshellarg($image_path) . " " .
+            escapeshellarg($ocr_file) . " -l " .
+            escapeshellarg(implode("+", $iso_langs));
+        exec($ocr_exec);
+        // tesseract appends .txt to the output base name it was given
+        $ocr_file .= ".txt";
+        $ocr_string = file_exists($ocr_file) ?
+            file_get_contents($ocr_file) : "";
+        @unlink($ocr_file);
+        return trim((string)$ocr_string, " \t\n\r\0\x0B\x0C");
+    }
+}
diff --git a/src/library/ContextWeightedNamedEntityRecognizer.php b/src/library/ContextWeightedNamedEntityRecognizer.php
index 0790fdabc..e6b11d88e 100644
--- a/src/library/ContextWeightedNamedEntityRecognizer.php
+++ b/src/library/ContextWeightedNamedEntityRecognizer.php
@@ -26,7 +26,6 @@
  * @copyright 2009 - 2019
  * @filesource
  */
-
 namespace seekquarry\yioop\library;

 use seekquarry\yioop\configs as C;
@@ -39,7 +38,6 @@ use seekquarry\yioop\locale\zh_CN\resources as ZH;
  *
  * @author Xianghong Sun
  */
-
 class ContextWeightedNamedEntityRecognizer
 {
     /**
@@ -94,7 +92,7 @@ class ContextWeightedNamedEntityRecognizer
                 $this->lang = $lang;
         }
     }
-
+
     /**
      * A function that process the trainning data
      * @param @mixed $text_files can be a file or an array of file names
@@ -104,7 +102,7 @@ class ContextWeightedNamedEntityRecognizer
      * 我们/o 是/o 受到/o 郑振铎/nr 先生/o 、/o 阿英/nr 先生/o 著作/o 的/o
      * 启示/o ，/o 从/o 个人/o 条件/o 出发/o ，/o 瞄准/o 现代/o 出版/o 史/o
      * 研究/o 的/o 空白/o ，/o 重点/o 集/o 藏/o 解放区/o 、/o 国民党/nt 毁/o
-     * 禁/o 出版物/o 。/o
+     * 禁/o 出版物/o 。/o
      * To adapt to other language, some modifications are needed
      */
     public static function processTexts($text_files, $term_tag_splier="/",
@@ -123,16 +121,15 @@ class ContextWeightedNamedEntityRecognizer
                     if (!count($word_tag_pairs)) {
                         continue;
                     }
-                    $ret[]=[];
-                    $ret[count($ret)-1][0]=[];
-                    $ret[count($ret)-1][1]=[];
+                    $ret[] = [];
+                    $ret[count($ret)-1][0] = [];
+                    $ret[count($ret)-1][1] = [];
                     foreach ($word_tag_pairs as $word_tag_pair) {
                         $t = explode("/", $word_tag_pair);
-                        //echo $word_tag_pair;
-                        //print_r($t);
                         if (count($t) == 2) {
                             $tag = $tag_process ? $tag_process($t[1]) : $t[1];
-                            foreach(preg_split('//u', $t[0], null, PREG_SPLIT_NO_EMPTY) as $ch) {
+                            foreach(preg_split('//u', $t[0], null,
+                                PREG_SPLIT_NO_EMPTY) as $ch) {
                                 $ret[count($ret)-1][0][] =
                                     $term_process ? $term_process($ch) : $ch;
                                 $ret[count($ret)-1][1][] = $tag;
@@ -145,21 +142,20 @@ class ContextWeightedNamedEntityRecognizer
         }
         return $ret;
     }
-
+
     /**
     * Function to train a data
     * Notice: This function might run very long time, depending on training set
     * @param @mixed $text_files are training data
     *               can be a file or an array of file names
-    * @param @float $learning_rate
+    * @param @float $learning_rate
     * @param @int  $max_epoch 1200 might be a good one,
-                   the weight will overfit if it's greater than this number
+    *           the weight will overfit if it's greater than this number
     * @param @function $term_process is a preporcess on term before training
     * @param @function $tag_process is a preporcess on tag before training
     */
-    public function train($text_files,
-                          $learning_rate=0.1, $max_epoch = 1200,
-                          $term_process = null, $tag_process = null)
+    public function train($text_files, $learning_rate=0.1, $max_epoch = 1200,
+        $term_process = null, $tag_process = null)
     {
         if (is_string($text_files)) {
             $text_files = [$text_files];
@@ -192,14 +188,14 @@ class ContextWeightedNamedEntityRecognizer
                         $this->tag_feature[$tags[$i-1]]=[];
                     }
                 } else {
-                    if (!isset($this->tag_feature[$tags[$i-2]."-".$tags[$i-1]])) {
-                        $this->tag_feature[$tags[$i-2]."-".$tags[$i-1]]=[];
+                    if (!isset($this->tag_feature[$tags[$i-2] . "-" .
+                        $tags[$i-1]])) {
+                        $this->tag_feature[$tags[$i-2]."-".$tags[$i-1]] = [];
                     }
                     if (!isset($this->tag_feature[$tags[$i-1]])) {
                         $this->tag_feature[$tags[$i-1]]=[];
                     }
                 }
-
                 if (!isset($this->word_feature[$terms[$i]])) {
                     $this->word_feature[$terms[$i]] = [];
                 }
@@ -234,7 +230,8 @@ class ContextWeightedNamedEntityRecognizer
         $cross_entropy_loss = 1;
         $pre_cross_entropy_loss = 2;
         for ($epoch = 0; ($epoch < $max_epoch) &&
-            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++) {
+            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001;
+            $epoch++) {
             $this->min_w=0;
             $this->max_w=0;
             $time = time();
@@ -243,10 +240,10 @@ class ContextWeightedNamedEntityRecognizer
             $pre_cross_entropy_loss = $cross_entropy_loss;
             $cross_entropy_loss = 0;
             $cross_entropy_loss_n = 0;
-
+
             $dy_db=[];
             $dy_db_n=[];
-
+
             $dy_dt=[];
             $dy_dt_n=[];
             for($i = 0; $i < count($this->tag_set); $i++) {
@@ -298,11 +295,11 @@ class ContextWeightedNamedEntityRecognizer
                                 $dy_dw[$k[$j]][$j][$tag_index] = 0;
                                 $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
                             }
-
+
                             $dy_dw[$k[$j]][$j][$tag_index] +=
                                 ($sigmoid - $equality);
                             $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
-
+
                         }
                         //dy_dt
                         if (!isset($dy_dt[$tf1])) {
@@ -365,8 +362,8 @@ class ContextWeightedNamedEntityRecognizer
             }
             foreach ($dy_db as $k => $v) {
                 $this->bias[$k]-=
-                    $dy_db[$k] /
-                    $dy_db_n[$k] *
+                    $dy_db[$k] /
+                    $dy_db_n[$k] *
                     $learning_rate;
             }
             if ($epoch % 10 == 9 ) {
@@ -411,7 +408,7 @@ class ContextWeightedNamedEntityRecognizer
                 for ($j=-2; $j <=2; $j++) {
                     $k=$this->getIndex($i+$j, $terms);
                     if (isset($this->word_feature[$k])) {
-                        $score[$possiable_tag] +=
+                        $score[$possiable_tag] +=
                                 $this->getW($k,$j,$tag_index);
                     }
                 }
@@ -452,7 +449,6 @@ class ContextWeightedNamedEntityRecognizer
         }
         return $ret;
     }
-
     /**
      * A list of private helper functions
      * Given a setence ($term), find the key at position $index
@@ -468,7 +464,7 @@ class ContextWeightedNamedEntityRecognizer
         }
         return $k;
     }
-
+
     /**
      * save the trained weight to disk
      */
@@ -533,7 +529,8 @@ class ContextWeightedNamedEntityRecognizer
      */
     private function unpack_b()
     {
-        return array_merge(unpack("f".strval(count($this->tag_set)),$this->bias));
+        return array_merge(unpack("f" . strval(count($this->tag_set)),
+            $this->bias));
     }
     /**
      * Pack the tag_feature
@@ -547,7 +544,8 @@ class ContextWeightedNamedEntityRecognizer
      */
     private function unpack_t($key)
     {
-        return array_merge(unpack("f".strval(count($this->tag_set)),$this->tag_feature[$key]));
+        return array_merge(unpack("f".strval(count($this->tag_set)),
+            $this->tag_feature[$key]));
     }
     /**
      * Pack the word_feature
diff --git a/src/library/ContextWeightedPosTagger.php b/src/library/ContextWeightedPosTagger.php
index 140097b95..d49f252bd 100644
--- a/src/library/ContextWeightedPosTagger.php
+++ b/src/library/ContextWeightedPosTagger.php
@@ -26,7 +26,6 @@
  * @copyright 2009 - 2019
  * @filesource
  */
-
 namespace seekquarry\yioop\library;

 use seekquarry\yioop\configs as C;
@@ -45,8 +44,8 @@ use seekquarry\yioop\locale\zh_CN\resources as ZH;
  *
  * @author Xianghong Sun
  */
-
-class ContextWeightedPosTagger {
+class ContextWeightedPosTagger
+{
     /**
      * Current Language, only tested on Simplified Chinese
      * Might be extensable for other languages in the furture
@@ -72,7 +71,6 @@ class ContextWeightedPosTagger {
      */
     private $min_w;
     private $max_w;
-
     /**
      * All Possiable tag set
      * Generized by training method
@@ -147,10 +145,9 @@ class ContextWeightedPosTagger {
     {
         $this->$var_name = $value;
     }
-
     /**
      * check if the term can be determined by algorithm,
-     * usually by regualr expression, because there are infinity
+     * usually by regular expression, because there are infinity
      * amount of them.
      * ex. 13th is an ordinal number, 123 is a cardinal number
      * then use the determined tag to be the weight key
@@ -164,7 +161,7 @@ class ContextWeightedPosTagger {
         }
         return $term;
     }
-
+
     /**
      * A function that process the trainning data
      * @param @mixed $text_files can be a file or an array of file names
@@ -196,11 +193,11 @@ class ContextWeightedPosTagger {
                     $ret[count($ret)-1][1]=[];
                     foreach ($word_tag_pairs as $word_tag_pair) {
                         $t = explode($term_tag_splier, $word_tag_pair);
-
+
                         if (count($t) == 2) {
                             $ret[count($ret)-1][0][] =
                                 $term_process ? $term_process($t[0]) : $t[0];
-                            $ret[count($ret)-1][1][] =
+                            $ret[count($ret)-1][1][] =
                                 $tag_process ? $tag_process($t[1]) : $t[1];
                         }
                     }
@@ -210,22 +207,20 @@ class ContextWeightedPosTagger {
         }
         return $ret;
     }
-
     /**
-    * Function to train a data
-    * Notice: This function might run very long time, depending on training set
-    * @param @mixed $text_files are training data
-    *               can be a file or an array of file names
-    * @param @float $learning_rate
-    * @param @int  $max_epoch 1200 might be a good one,
-                   the weight will overfit if it's greater than this number
-    * @parama @bool $resume if true, read the weight file and continue training
-                            if false, start from beginning
-    */
-    public function train($text_files, $term_tag_splier="_",
-                          $learning_rate=0.1, $max_epoch = 1200,
-                          $term_process = null, $tag_process = null,
-                          $resume=false)
+     * Function to train a data
+     * Notice: This function might run very long time, depending on training set
+     * @param @mixed $text_files are training data
+     *  can be a file or an array of file names
+     * @param @float $learning_rate
+     * @param @int  $max_epoch 1200 might be a good one,
+     *  the weight will overfit if it's greater than this number
+     * @param @bool $resume if true, read the weight file and continue training
+     *   if false, start from beginning
+     */
+    public function train($text_files, $term_tag_splier="_", $learning_rate=0.1,
+        $max_epoch = 1200, $term_process = null, $tag_process = null,
+        $resume = false)
     {
         if (is_string($text_files)) {
             $text_files = [$text_files];
@@ -286,8 +281,8 @@ class ContextWeightedPosTagger {
         //train the weight
         $cross_entropy_loss = 1;
         $pre_cross_entropy_loss = 2;
-        for ($epoch = 0; $epoch < $max_epoch &&
-            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++) {
+        for ($epoch = 0; $epoch < $max_epoch && $pre_cross_entropy_loss -
+            $cross_entropy_loss > 0.000001; $epoch++) {
             $this->min_w=0;
             $this->max_w=0;
             $time = time();
@@ -296,7 +291,7 @@ class ContextWeightedPosTagger {
             $pre_cross_entropy_loss = $cross_entropy_loss;
             $cross_entropy_loss = 0;
             $cross_entropy_loss_n = 0;
-
+
             $dy_db=[];
             $dy_db_n=[];
             for($i = 0; $i < count($this->tag_set); $i++) {
@@ -333,11 +328,11 @@ class ContextWeightedPosTagger {
                                 $dy_dw[$k[$j]][$j][$tag_index] = 0;
                                 $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
                             }
-
+
                             $dy_dw[$k[$j]][$j][$tag_index] +=
                                 ($sigmoid - $equality);
                             $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
-
+
                         }
                         //dy_db
                         $dy_db[$tag_index] += ($sigmoid - $equality);
@@ -351,7 +346,7 @@ class ContextWeightedPosTagger {
             }
             $cross_entropy_loss /= $cross_entropy_loss_n;
             $duration = time() - $time;
-            echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}".
+            echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}" .
                 " Takes {$duration} seconds\n";
             foreach ($dy_dw as $i =>$v1) {
                 foreach ($v1 as $j =>$v2) {
@@ -371,8 +366,8 @@ class ContextWeightedPosTagger {
             }
             foreach ($dy_db as $k =>$v) {
                 $this->b[$k]-=
-                    $dy_db[$k] /
-                    $dy_db_n[$k] *
+                    $dy_db[$k] /
+                    $dy_db_n[$k] *
                     $learning_rate;
             }
             if ($epoch % 10 == 9 ) {
@@ -386,7 +381,7 @@ class ContextWeightedPosTagger {
      * The primary function to predit the tag
      * @param mixed $sentence is an array of segmented words/terms
      *     or a string with words/terms seperated by space
-     * @return @array of tags
+     * @return @array of tags
      */
     public function predict($sentence)
     {
@@ -415,15 +410,16 @@ class ContextWeightedPosTagger {
                 for ($j=-2; $j <=2; $j++) {
                     $k=$this->getIndex($i+$j, $terms);
                     if (isset($this->w[$k])) {
-                        $score[$possiable_tag] +=
+                        $score[$possiable_tag] +=
                                 $this->getW($k,$j,$tag_index);
-                    } else if ($j==0&&in_array($possiable_tag,$this->rule_defined_key)) {
+                    } else if ($j==0&&in_array($possiable_tag,
+                        $this->rule_defined_key)) {
                         $score[$possiable_tag] += $this->min_w;
                     }
                 }
-
+
                 $score[$possiable_tag] += $this->getB($tag_index);
-
+
                 //$score[$possiable_tag]
                 //    += 1 / (1 + exp(-1 * $score[$possiable_tag]));
             }
@@ -435,11 +431,11 @@ class ContextWeightedPosTagger {
      * Wrap function for predict
      * @param $texts to be a @string of texts
      * @param $return_string is a boolean to determing if the user
-     *        want it to out put to stdout or a return value
+     *   want it to out put to stdout or a return value
      * @return @string if $return_string is true;
-               @boolean true otherwise
+     *   @boolean true otherwise
      * e.g. 中国_NR 人民_NN 将_AD 满怀信心_VV
-            地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU
+     *   地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU
      */
     public function tag($texts, $return_string=false)
     {
@@ -531,8 +527,8 @@ class ContextWeightedPosTagger {
      */
     private function load_weight($trainning_load=false)
     {
-        $dic_file
-            = C\LOCALE_DIR . "/{$this->lang}/resources/pos_weight.txt.gz";
+        $dic_file = C\LOCALE_DIR .
+            "/{$this->lang}/resources/pos_weight.txt.gz";
         if (!file_exists($dic_file)) {
             echo "$dic_file does not exist!";
             exit();
diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php
index 21a1f5df3..b514c7c71 100755
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@@ -49,6 +49,40 @@ function localesWithStopwordsList()
         'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th',
         'vi-VN', 'zh-CN'];
 }
+/**
+ * Converts a locale tag (for example, en-US or zh_CN) into the ISO 639-2
+ * style language name used to select OCR language data. A language that
+ * needs several data sets (ja, zh) maps to their names joined by +.
+ *
+ * Only the part of the tag before the first - or _ is used for the lookup
+ * (both id and in are accepted for Indonesian, as this file's stopword
+ * list names that locale in-ID). If the language is not in the table, the
+ * language of DEFAULT_LOCALE is looked up instead, and if that is also
+ * missing eng is returned, so the result is never a raw locale tag.
+ *
+ * @param string $locale_tag locale tag to convert
+ * @return string ISO 639-2 style tag, or several such tags joined by +
+ */
+function localeTagToIso639_2Tag($locale_tag)
+{
+    $lang_map = ["ar" => "ara", "bn" => "ben", "de" => "deu",
+        "en" => "eng", "es" => "spa", "fa" => "fas", "fr" => "fra",
+        "he" => "heb", "hi" => "hin", "id" => "ind", "in" => "ind",
+        "it" => "ita", "ja" => "jpn+jpn_vert", "kn" => "kan",
+        "ko" => "kor", "nl" => "nld", "pl" => "pol", "pt" => "por",
+        "ru" => "rus", "te" => "tel", "th" => "tha", "tl" => "tgl",
+        "tr" => "tur", "vi" => "vie",
+        "zh" => "chi_sim+chi_tra+chi_sim_vert+chi_tra_vert"];
+    $lookup_tag = preg_split("/\-|\_/", (string)$locale_tag)[0];
+    if (isset($lang_map[$lookup_tag])) {
+        return $lang_map[$lookup_tag];
+    }
+    /* Unknown language: fall back to the default locale's language,
+       mapped through the same table, rather than to a raw locale tag
+     */
+    $default_tag = preg_split("/\-|\_/", C\DEFAULT_LOCALE)[0];
+    return $lang_map[$default_tag] ?? "eng";
+}
 /**
  * Attempts to guess the user's locale based on the request, session,
  * and user-agent data
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 96dbfc4c6..162b817d7 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -873,6 +873,21 @@ function toHexString($str)
     }
     return $out;
 }
+/**
+ * Converts a string to a string where each byte has been replaced by its
+ * zero-padded, three-digit decimal value followed by a space
+ *
+ * @param string $str what we want rewritten as decimal byte values
+ * @return string the decimal codes, for example "AB" gives "065 066 "
+ */
+function toIntString($str)
+{
+    $out = "";
+    for ($i = 0; $i < strlen($str); $i++) {
+        $out .= sprintf("%03u",ord($str[$i]))." ";
+    }
+    return $out;
+}
 /**
  * Converts a string to string where each char has been replaced by its
  * binary equivalent
diff --git a/src/library/processors/BmpProcessor.php b/src/library/processors/BmpProcessor.php
index 728a3d81a..0c3370b1b 100644
--- a/src/library/processors/BmpProcessor.php
+++ b/src/library/processors/BmpProcessor.php
@@ -30,6 +30,8 @@
  */
 namespace seekquarry\yioop\library\processors;

+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\ComputerVision;
 use seekquarry\yioop\library\UrlParser;

 /**
@@ -94,7 +96,18 @@ class BmpProcessor extends ImageProcessor
             $this->addWidthHeightSummary($summary, $page);
             $summary[self::TITLE] = "";
             $summary[self::DESCRIPTION] =
-                UrlParser::getDocumentFilename($url);
+                UrlParser::getDocumentFilename($url) . "\n";
+            if (ComputerVision::ocrEnabled()) {
+                set_error_handler(null);
+                $temp_file = $this->saveTempFile($page, $url, "bmp");
+                $lang = UrlParser::getLang($url);
+                $ocr_data = ComputerVision::recognizeText($temp_file, [$lang]);
+                if (!empty($ocr_data)) {
+                    $summary[self::DESCRIPTION] .= $ocr_data;
+                }
+                @unlink($temp_file);
+                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
+            }
             $summary[self::LINKS] = [];
             $summary[self::PAGE] =
                 "<html><body><div><img src='data:image/bmp;base64," .
diff --git a/src/library/processors/GifProcessor.php b/src/library/processors/GifProcessor.php
index deca01e07..7fb5f926b 100755
--- a/src/library/processors/GifProcessor.php
+++ b/src/library/processors/GifProcessor.php
@@ -31,6 +31,7 @@
 namespace seekquarry\yioop\library\processors;

 use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\ComputerVision;
 use seekquarry\yioop\library\UrlParser;

 /**
@@ -80,15 +81,23 @@ class GifProcessor extends ImageProcessor
             $summary = [];
             $this->addWidthHeightSummary($summary, $page);
             $summary[self::TITLE] = "";
-            $summary[self::DESCRIPTION] =
+            $summary[self::DESCRIPTION] = UrlParser::getDocumentFilename($url)
+                . "\n";
+            if (ComputerVision::ocrEnabled()) {
+                set_error_handler(null);
+                $temp_file = $this->saveTempFile($page, $url, "gif");
+                $lang = UrlParser::getLang($url);
+                $ocr_data = ComputerVision::recognizeText($temp_file, [$lang]);
+                if (!empty($ocr_data)) {
+                    $summary[self::DESCRIPTION] .= $ocr_data;
+                }
+                @unlink($temp_file);
+                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
+            }
             $xmp_data = $this->getXmpData($page);
             if ($xmp_data) {
-                $summary[self::DESCRIPTION] =
-                    UrlParser::getDocumentFilename($url) . "\nXMP Data\n".
+                $summary[self::DESCRIPTION] .= "\nXMP Data\n".
                     $xmp_data;
-            } else {
-                $summary[self::DESCRIPTION] =
-                    UrlParser::getDocumentFilename($url);
             }
             $summary[self::LINKS] = [];
             $summary[self::PAGE] =
diff --git a/src/library/processors/ImageProcessor.php b/src/library/processors/ImageProcessor.php
index 8bf0ee370..3f54d9b1e 100755
--- a/src/library/processors/ImageProcessor.php
+++ b/src/library/processors/ImageProcessor.php
@@ -57,6 +57,22 @@ class ImageProcessor extends PageProcessor
     {
         return null;
     }
+    /**
+     * Saves $page under CRAWL_DIR/temp as crawlHash($url).$file_extension
+     */
+    public function saveTempFile($page, $url, $file_extension)
+    {
+        $temp_dir = C\CRAWL_DIR . "/temp/";
+        if (!file_exists($temp_dir)) {
+             mkdir($temp_dir);
+        }
+        if (!file_exists($temp_dir)) {
+            return null; // temp folder is missing and could not be created
+        }
+        $temp_file = $temp_dir .  L\crawlHash($url) . ".$file_extension";
+        file_put_contents($temp_file, $page);
+        return $temp_file; // returned even if the write failed
+    }
     /**
      * Given an $image_string determines if possible its width and height
      * then assigns the values into the CrawlConstants:WIDTH,
diff --git a/src/library/processors/JpgProcessor.php b/src/library/processors/JpgProcessor.php
index b5fb21a6a..84d753d31 100755
--- a/src/library/processors/JpgProcessor.php
+++ b/src/library/processors/JpgProcessor.php
@@ -32,6 +32,7 @@ namespace seekquarry\yioop\library\processors;

 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\ComputerVision;
 use seekquarry\yioop\library\UrlParser;

 ini_set("gd.jpeg_ignore_warning", 1);
@@ -85,19 +86,24 @@ class JpgProcessor extends ImageProcessor
             $this->addWidthHeightSummary($summary, $page);
             $summary[self::TITLE] = "";
             $file_name = UrlParser::getDocumentFilename($url);
-            if (function_exists("exif_read_data")) {
-                $temp_dir = C\CRAWL_DIR . "/temp/";
-                if (!file_exists($temp_dir)) {
-                     mkdir($temp_dir);
-                }
-                if (!file_exists($temp_dir)) {
-                    return null;
+            $summary[self::DESCRIPTION] = $file_name . "\n";
+            if (ComputerVision::ocrEnabled()) {
+                set_error_handler(null);
+                $temp_file = $this->saveTempFile($page, $url, "jpg");
+                $lang = UrlParser::getLang($url);
+                $ocr_data = ComputerVision::recognizeText($temp_file, [$lang]);
+                if (!empty($ocr_data)) {
+                    $summary[self::DESCRIPTION] .= $ocr_data;
                 }
-                $temp_file = $temp_dir .  L\crawlHash($url) . ".jpg";
-                file_put_contents($temp_file, $page);
+                @unlink($temp_file);
+                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
+            }
+            if (function_exists("exif_read_data")) {
                 set_error_handler(null);
-                $summary[self::DESCRIPTION] = "$file_name\nEXIF DATA\n".
+                $temp_file = $this->saveTempFile($page, $url, "jpg");
+                $summary[self::DESCRIPTION] .= "\nEXIF DATA\n".
                     print_r(@exif_read_data($temp_file), true);
+                @unlink($temp_file);
                 set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
             } else {
                 $summary[self::DESCRIPTION] = $file_name;
diff --git a/src/library/processors/LocaleFunctions.php b/src/library/processors/LocaleFunctions.php
deleted file mode 100644
index 012b50801..000000000
--- a/src/library/processors/LocaleFunctions.php
+++ /dev/null
@@ -1,541 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- * END LICENSE
- *
- * This file contains global functions connected to localization that
- * are used throughout the web site part of Yioop!
- *
- * @author Chris Pollett chris@pollett.org
- * @license https://www.gnu.org/licenses/ GPL3
- * @link https://www.seekquarry.com/
- * @copyright 2009 - 2020
- * @filesource
- */
-namespace seekquarry\yioop\library;
-
-use seekquarry\yioop\configs as C;
-use seekquarry\yioop\models\LocaleModel;
-
-/** For Yioop global defines */
-require_once __DIR__."/../configs/Config.php";
-/**
- * Attempts to guess the user's locale based on the request, session,
- * and user-agent data
- *
- * @return string IANA language tag of the guessed locale
- */
-function guessLocale()
-{
-    /* the request variable l and the browser's HTTP_ACCEPT_LANGUAGE
-       are used to determine the locale */
-    if (isset($_SERVER['HTTP_ACCEPT_LANGUAGE'])) {
-        $l_parts = explode(",", $_SERVER['HTTP_ACCEPT_LANGUAGE']);
-        if (count($l_parts) > 0) {
-            $guess_l = $l_parts[0];
-        }
-        $guess_map = [
-            "cn" => "zh-CN",
-            "en" => "en-US",
-            "en-us" => "en-US",
-            "en-US" => "en-US",
-            "fr" => "fr-FR",
-            "ko" => "ko",
-            "in" => "in-ID",
-            "ja" => "ja",
-            "vi" => "vi-VN",
-            "vi-vn" => "vi-VN",
-            "vi-VN" => "vi-VN",
-            "zh" => "zh-CN",
-            "zh-CN" => "zh-CN",
-            "zh-cn" => "zh-CN",
-        ];
-        if (isset($guess_map[$guess_l])) {
-            $guess_l = $guess_map[$guess_l];
-        }
-    }
-    if (isset($_SESSION['l']) || isset($_REQUEST['l']) || isset($guess_l)) {
-        $l = (isset($_REQUEST['l'])) ? $_REQUEST['l'] :
-            ((isset($_SESSION['l'])) ? $_SESSION['l'] : $guess_l);
-        if (strlen($l) < 10) {
-            $l = addslashes($l);
-            if (is_dir(C\LOCALE_DIR . "/" . str_replace("-", "_", $l))) {
-                $locale_tag = $l;
-            }
-        }
-    }
-    if (!isset($locale_tag)) {
-        $locale_tag = C\DEFAULT_LOCALE;
-    }
-    return $locale_tag;
-}
-/**
- * Attempts to guess the user's locale based on a string sample
- *
- * @param string $phrase_string used to make guess
- * @param string $locale_tag language tag to use if can't guess -- if not
- *     provided uses current locale's value
- * @param int threshold number of chars to guess a particular encoding
- * @return string IANA language tag of the guessed locale
-
- */
-function guessLocaleFromString($phrase_string, $locale_tag = null)
-{
-    $original_phrase_string = mb_substr($phrase_string, 0,
-        C\AD_HOC_TITLE_LENGTH);
-    $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag;
-    $sub = C\PUNCT . "|[0-9]|\s";
-    $phrase_string = preg_replace('/' . $sub . '/', "", $phrase_string);
-    $phrase_string = mb_convert_encoding($phrase_string, "UTF-32", "UTF-8");
-    $len = strlen($phrase_string);
-    $guess = ['ar' => 0, 'he' => 0, 'hi' => 0, 'ko' => 0, 'ja' => 0, 'ru' => 0,
-        'th' => 0, 'zh-CN' => 0];
-    $guess[$locale_tag] = 1;
-    for ($i = 0; $i < $len; $i += 4) {
-        $start = ord($phrase_string[$i+2]);
-        $next = ord($phrase_string[$i+3]);
-        if ($start >= 6 && $start <= 7) {
-            if ($locale_tag == "fa") {
-                $guess[$locale_tag] +=2;
-            } else {
-                $guess['ar'] += 2;
-            }
-        } else if ($start == 5 && $next >= 144) {
-            $guess['he'] += 2;
-        } else if (($start == 9 && $next < 128) || ($start == 168 &&
-            $next >= 224)) {
-            $guess['hi'] += 2;
-        } else if ($start == 17 || $start >= 172 && $start < 215) {
-            $guess['ko'] += 2;
-        } else if ($start >= 48 && $start <= 49) {
-            $guess['ja'] += 3;
-        } else if ($start == 4 || ($start == 5 && $next < 48)) {
-            $guess['ru']++;
-        } else if ($start == 14 && $next < 128) {
-            $guess['th'] += 2;
-        } else if ($start >= 78 && $start <= 159) {
-            $guess['zh-CN'] += 4;
-        } else if ($start == 0 && $next < 128) {
-            $guess[$locale_tag]++; // assume ascii is from $locale_tag
-        }
-    }
-    $num_points = ($len / 4) - 1; //there will be a lead and tail space
-    $max = $guess[$locale_tag];
-    if ($num_points >= 0 ) {
-        foreach ($guess as $tag => $cnt) {
-            if ($cnt >= $num_points && $cnt > $max) {
-                $locale_tag = $tag;
-                $max = $cnt;
-                break;
-            }
-        }
-    }
-    if ($locale_tag == 'en-US') {
-        $locale_tag = checkQuery($original_phrase_string);
-    }
-    return $locale_tag;
-}
-/**
- * Tries to find wether query belongs to a programming language
- *
- * @param string $query query entered by user
- *
- * @return string $lang programming language for the the query provided
- */
-function checkQuery($query)
-{
-    $programming_language_map = ['java:' => 'java', 'python:' => 'py'];
-    $control_word = "/^(java:|python:)/";
-    $position = preg_match($control_word, trim($query),
-        $matches, PREG_OFFSET_CAPTURE);
-    if (isset($matches[0][0])) {
-        $matched_word = $matches[0][0];
-        if (isset($programming_language_map[$matched_word])) {
-            $lang = $programming_language_map[$matched_word];
-        } else {
-            $lang = 'en-US';
-        }
-    } else {
-        $lang = 'en-US';
-    }
-    return $lang;
-}
-/**
- * Tries to guess at a language tag based on the name of a character
- * encoding
- *
- * @param string $encoding a character encoding name
- *
- * @return string guessed language tag
- */
-function guessLangEncoding($encoding)
-{
-    $lang = ["EUC-JP", "Shift_JIS", "JIS", "ISO-2022-JP"];
-    if (in_array($encoding, $lang)) {
-        return "ja";
-    }
-    $lang = ["EUC-CN", "GBK", "GB2312", "EUC-TW", "HZ", "CP936",
-        "BIG-5", "CP950"];
-    if (in_array($encoding, $lang)) {
-        return "zh-CN";
-    }
-    $lang = ["EUC-KR", "UHC", "CP949", "ISO-2022-KR"];
-    if (in_array($encoding, $lang)) {
-        return "ko";
-    }
-    $lang = ["Windows-1251", "CP1251", "CP866", "IBM866", "KOI8-R"];
-    if (in_array($encoding, $lang)) {
-        return "ru";
-    }
-    return 'en';
-}
-/**
- * Tries to guess the encoding used for an Html document
- *
- * @param string $html a character encoding name
- * @param string $return_loc_info if meta http-equiv info was used to
- *     find the encoding, then if $return_loc_info is true, we
- *     return the location of charset substring. This allows converting to
- *     UTF-8 later so cached pages will display correctly and
- *     redirects without char encoding won't be given a different hash.
- *
- * @return mixed either string or array if string then guessed encoding,
- *     if array guessed encoding, start_pos of where charset info came from,
- *     length
- */
-function guessEncodingHtmlXml($html, $return_loc_info = false)
-{
-    // first try for XML encoding info
-    preg_match("/\<\?xml[^\?]+encoding\=[\'\"](.+)[\'\"][^\?]+\?\>/",$html,
-        $matches, PREG_OFFSET_CAPTURE);
-    if (!empty($matches[1][1])) {
-        $encoding = strtoupper($matches[1][0]);
-        $start_charset = $matches[1][1];
-        $len_c = strlen($encoding);
-        if ($return_loc_info) {
-            return [$encoding, $start_charset, $len_c];
-        }
-        return $encoding;
-    }
-     /*
-       If the doc is HTML and it uses a http-equiv to set the encoding
-       then we override what the server says (if anything). As we
-       are going to convert to UTF-8 we remove the charset info
-       from the meta tag so cached pages will display correctly and
-       redirects without char encoding won't be given a different hash.
-     */
-    $end_head = stripos($html, "</head");
-    if ($end_head) {
-        $reg = "/charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?/iu";
-        $is_match = preg_match($reg, $html, $match);
-        if (!$is_match) {
-            $reg = "charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?";
-            mb_regex_encoding("UTF-8");
-            mb_ereg_search_init($html);
-            mb_ereg_search($reg, "i");
-            $match = mb_ereg_search_getregs();
-            if (isset($match[0])) {
-                $is_match = true;
-            }
-        }
-        if ($is_match && isset($match[6])) {
-            $len_c = strlen($match[0]);
-            if (($match[6] == "'" || $match[6] == '"') &&
-               $match[3] != $match[6]) {
-                $len_c--;
-            }
-            $start_charset = strpos($html, $match[0]);
-            if ($start_charset + $len_c < $end_head) {
-                if (isset($match[4])) {
-                    $encoding = strtoupper($match[4]);
-                    if ($return_loc_info) {
-                        return [$encoding, $start_charset, $len_c];
-                    }
-                    return $encoding;
-                }
-            }
-        }
-    }
-    return mb_detect_encoding($html, 'auto');
-}
-/**
- * Converts page data in a site associative array to UTF-8 if it is not
- * already in UTF-8
- *
- * @param array& $site an associative of info about a web site
- * @param string $page_field the field in the associative array that
- *  contains the $site's web page as a string.
- * @param string $encoding_field the  field in the associative array that
- *  contains the character encoding the page is currently in
- * @param function $log_function a callback function used to write log
- *  messages with, if desired.
- */
-function convertUtf8IfNeeded(&$site, $page_field, $encoding_field,
-    $log_function = "")
-{
-    if ($log_function == "") {
-        $log_function = function($msg) {
-        };
-    }
-    if (empty($site[$encoding_field])) {
-        $site[$encoding_field] = guessEncodingHtmlXml($site[$page_field]);
-    }
-    if (!empty($site[$encoding_field]) && $site[$encoding_field] != "UTF-8") {
-        set_error_handler(null);
-        if (!@mb_check_encoding($site[$page_field],
-            $site[$encoding_field])) {
-            $log_function("  MB_CHECK_ENCODING FAILED!!");
-        }
-        $log_function("  Converting from encoding ".
-            $site[$encoding_field]."...");
-        //if HEBREW WINDOWS-1255 use ISO-8859 instead
-        if (stristr($site[$encoding_field], "1255")) {
-            $site[$encoding_field]= "ISO-8859-8";
-            $log_function("  using encoding " . $site[$encoding_field]."...");
-        }
-        if (stristr($site[$encoding_field], "1256")) {
-            $site[$page_field] = w1256ToUTF8($site[$page_field]);
-            $log_function("  using Yioop hack encoding ...");
-        } else {
-            $site[$page_field] = @mb_convert_encoding($site[$page_field],
-                "UTF-8", $site[$encoding_field]);
-        }
-        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
-    } else if (!empty($site[$encoding_field]) &&
-        $site[$encoding_field] == "UTF-8") {
-        $log_function("   UTF-8 data detected!");
-    }
-}
-/**
- * Translate the supplied arguments into the current locale.
- * This function takes a variable number of arguments. The first
- * being an identifier to translate. Additional arguments
- * are used to interpolate values in for %s's in the translation.
- *
- * @param string string_identifier  identifier to be translated
- * @param mixed additional_args  used for interpolation in translated string
- * @return string  translated string
- */
-function tl()
-{
-    $locale = LocaleModel::$current_locale;
-    if (!is_object($locale)) {
-        return false;
-    }
-    $args = func_get_args();
-    $translation = $locale->translate($args);
-    if (!trim($translation)) {
-        $translation = $args[0];
-    }
-    return $translation;
-}
-/**
- * Sets the language to be used for locale settings
- *
- * @param string $locale_tag the tag of the language to use to determine
- *     locale settings
- */
-function setLocaleObject($locale_tag)
-{
-    $locale_model = C\NS_MODELS . "LocaleModel";
-    $locale = new $locale_model();
-    $locale->initialize($locale_tag);
-    LocaleModel::$current_locale = $locale;
-}
-/**
- * Gets the language tag (for instance, en_US for American English) of the
- * locale that is currently being used. This function has the side
- * effect of setting Yioop's current locale.
- *
- * @return string  the tag of the language currently being used for locale
- *     settings
- */
-function getLocaleTag()
-{
-    $locale = LocaleModel::$current_locale;
-    if (!$locale) {
-        $locale_tag = guessLocale();
-        setLocaleObject($locale_tag);
-        return $locale_tag;
-    }
-    return $locale->getLocaleTag();
-}
-/**
- * Returns the current language directions.
- *
- * @return string ltr or rtl depending on if the language is left-to-right
- * or right-to-left
- */
-function getLocaleDirection()
-{
-    $locale = LocaleModel::$current_locale;
-    return $locale->getLocaleDirection();
-}
-/**
- * Returns the query statistics info for the current llocalt.
- *
- * @return array consisting of queries and elapses times for locale computations
- */
-function getLocaleQueryStatistics()
-{
-    $locale = LocaleModel::$current_locale;
-    $query_info = [];
-    $query_info['QUERY_LOG'] = $locale->db->query_log;
-    $query_info['TOTAL_ELAPSED_TIME'] = $locale->db->total_time;
-    return $query_info;
-}
-/**
- * Returns the current locales method of writing blocks (things like divs or
- * paragraphs).A language like English puts blocks one after another from the
- * top of the page to the bottom. Other languages like classical Chinese list
- * them from right to left.
- *
- * @return string  tb lr rl depending on the current locales block progression
- */
-function getBlockProgression()
-{
-    $locale = LocaleModel::$current_locale;
-    return $locale->getBlockProgression();
-
-}
-/**
- * Returns the writing mode of the current locale. This is a combination of the
- * locale direction and the block progression. For instance, for English the
- * writing mode is lr-tb (left-to-right top-to-bottom).
- *
- * @return string   the locales writing mode
- */
-function getWritingMode()
-{
-    $locale = LocaleModel::$current_locale;
-    return $locale->getWritingMode();
-
-}
-/**
- * Convert the string $str encoded in Windows-1256 into UTF-8
- *
- * @param string $str Windows-1256 string to convert
- * @return string the UTF-8 equivalent
- */
-function w1256ToUTF8($str)
-{
-    static $conv = [
-        0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
-        0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, 0x0010, 0x0011,
-        0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001A,
-        0x001B, 0x001C, 0x001D, 0x001E, 0x001F, 0x0020, 0x0021, 0x0022, 0x0023,
-        0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C,
-        0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035,
-        0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E,
-        0x003F, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
-        0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050,
-        0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059,
-        0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, 0x0060, 0x0061, 0x0062,
-        0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B,
-        0x006C, 0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074,
-        0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D,
-        0x007E, 0x007F, 0x20AC, 0x067E, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020,
-        0x2021, 0x02C6, 0x2030, 0x0679, 0x2039, 0x0152, 0x0686, 0x0698, 0x0688,
-        0x06AF, 0x2018, 0x2020, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x06A9,
-        0x2122, 0x0691, 0x203A, 0x0153, 0x200C, 0x200D, 0x06BA, 0x00A0, 0x060C,
-        0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x06BE,
-        0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, 0x00B0, 0x00B1, 0x00B2, 0x00B3,
-        0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x061B, 0x00BB, 0x00BC,
-        0x00BD, 0x00BE, 0x061F, 0x06C1, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625,
-        0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E,
-        0x062F, 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x00D7,
-        0x0637, 0x0638, 0x0639, 0x063A, 0x0640, 0x0641, 0x0642, 0x0643, 0x00E0,
-        0x0644, 0x00E2, 0x0645, 0x0646, 0x0647, 0x0648, 0x00E7, 0x00E8, 0x00E9,
-        0x00EA, 0x00EB, 0x0649, 0x064A, 0x00EE, 0x00EF, 0x064B, 0x064C, 0x064D,
-        0x064E, 0x00F4, 0x064F, 0x0650, 0x00F7, 0x0651, 0x00F9, 0x0652, 0x00FB,
-        0x00FC, 0x200E, 0x200F, 0x06D2
-    ];
-    $len = strlen($str);
-    $out = "";
-    for ($i = 0; $i < $len; $i++) {
-        $out .= utf8chr($conv[ord($str[$i])]);
-    }
-    return $out;
-}
-/**
- * Given a unicode codepoint convert it to UTF-8
- *
- * @param int $code  the codepoint to convert
- * @return string the corresponding UTF-8 string
- */
-function utf8chr($code)
-{
-    if ($code <= 0x7F)
-        return chr($code);
-    if ($code <= 0x7FF)
-        return pack("C*", ($code >> 6)+192, ($code & 63) + 128);
-    if ($code <= 0xFFFF)
-            return pack("C*", ($code >> 12)+224, (($code>>6) & 63) + 128,
-                ($code&63)+128);
-    if ($code <= 0x1FFFFF)
-        return pack("C*", ($code >> 18) + 240, (($code >> 12) & 63) + 128,
-            (($code >> 6) & 63) + 128, ($code & 63) + 128);
-    return '';
-}
-/**
- * Function for formatting a date string based on the locale.
- * @param $timestamp is the crawl time
- * @param $locale_tag is the tag for locale
- * @return string formatted date string
- */
-function formatDateByLocale($timestamp, $locale_tag)
-{
-    switch ($locale_tag) {
-        case 'de':
-            setlocale(LC_ALL,'deu');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'en-US':
-            setlocale(LC_ALL,'enu');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'es':
-            setlocale(LC_ALL,'esp');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'fr-FR':
-            setlocale(LC_ALL,'fra');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'it':
-            setlocale(LC_ALL,'ita');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'ja':
-            setlocale(LC_ALL,'jpn');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'ko':
-            setlocale(LC_ALL,'kor');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'pl':
-            setlocale(LC_ALL,'plk');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'ru':
-            setlocale(LC_ALL,'rus');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        case 'tr':
-            setlocale(LC_ALL,'trk');
-            return strftime("%B %d %Y %H:%M",$timestamp);
-        default:
-            return date("F d Y H:i", intval($timestamp));
-    }
-}
diff --git a/src/library/processors/PdfProcessor.php b/src/library/processors/PdfProcessor.php
index 9038a117e..551b17513 100755
--- a/src/library/processors/PdfProcessor.php
+++ b/src/library/processors/PdfProcessor.php
@@ -32,6 +32,9 @@ namespace seekquarry\yioop\library\processors;

 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\Library as L;
+use seekquarry\yioop\library\ComputerVision;
+use seekquarry\yioop\library\UrlParser;
+
 /**
  * Used to create crawl summary information
  * for PDF files
@@ -77,7 +80,7 @@ class PdfProcessor extends TextProcessor
         $text = "";
         if (is_string($page)) {
             list($encoding, $title) = self::getEncodingTitle($page);
-            $text =  self::getText($page, $encoding);
+            $text =  self::getText($page, $url, $encoding);
         }
         if ($text == "") {
             $text = $url;
@@ -126,12 +129,14 @@ class PdfProcessor extends TextProcessor
      * Gets the text out of a PDF document
      *
      * @param string $pdf_string a string representing the PDF document
+     * @param string $url the url where the page contents came from,
+     *    used to guess the language passed to OCR for embedded images
      * @param string $encoding which of the default (if any) PDF encoding
      *    formats is being used: MacRomanEncoding, WinAnsiEncoding,
      *    PDFDocEncoding, etc.
      * @return string text extracted from the document
      */
-    public static function getText($pdf_string, $encoding = "")
+    public static function getText($pdf_string, $url, $encoding = "")
     {
         $len = strlen($pdf_string);
         $cur_pos = 0;
@@ -139,11 +144,84 @@ class PdfProcessor extends TextProcessor
         $i = 0;
         set_error_handler(null);
         $state = "text";
+        $temp_dir = C\CRAWL_DIR . "/temp/";
+        if (!file_exists($temp_dir)) {
+             mkdir($temp_dir);
+        }
+        if (!file_exists($temp_dir)) {
+            return null;
+        }
+        $lang = UrlParser::getLang($url);
         while($cur_pos < $len) {
             list($cur_pos, $object_string) =
                 self::getNextObject($pdf_string, $cur_pos);
             $object_dictionary = self::getObjectDictionary($object_string);
-            if (self::objectDictionaryHas(
+            if (ComputerVision::ocrEnabled() &&
+                self::objectDictionaryHas($object_dictionary, ["Image"]) &&
+                self::objectDictionaryHas($object_dictionary, ["XObject"]) &&
+                self::objectDictionaryHas($object_dictionary, ["Width"]) &&
+                self::objectDictionaryHas($object_dictionary, ["Height"]) &&
+                !self::objectDictionaryHas($object_dictionary, ["ImageMask"])) {
+                $stream_data = ltrim(self::getObjectStream($object_string));
+                preg_match("/\/Width\s+(\d+)\b/", $object_dictionary, $matches);
+                $width = $matches[1] ?? 0;
+                preg_match("/\/Height\s+(\d+)\b/", $object_dictionary,
+                    $matches);
+                $height = $matches[1] ?? 0;
+                preg_match("/\/BitsPerComponent\s+(\d+)\b/", $object_dictionary,
+                    $matches);
+                $bits_per_component = $matches[1] ?? 8;
+                preg_match("/\/ColorSpace\s+(Device)?(Gray|RGB|CMYK)\b/",
+                    $object_dictionary, $matches);
+                $color_space = $matches[2] ?? "RGB";
+                $is_jpeg = preg_match("/\/Filter\s+\/DCTDecode\b/",
+                    $object_dictionary);
+                if (!$width || !$height || $color_space == "CMYK") {
+                    continue;
+                }
+                $is_rgb = ($color_space == "RGB");
+                if (self::objectDictionaryHas($object_dictionary,
+                    ["FlateDecode"])) {
+                    $stream_data = @gzuncompress($stream_data);
+                }
+                if ($is_jpeg) {
+                    $image  = imagecreatefromstring($stream_data);
+                } else {
+                    $image  = imagecreatetruecolor($width, $height);
+                    $pix_loc = 0;
+                    for($y = 0; $y < $height; $y++) {
+                        for($x = 0; $x < $width; $x++) {
+                            if ($is_rgb) {
+                                $r = empty($stream_data[$pix_loc]) ? 255 :
+                                    ord($stream_data[$pix_loc]);
+                                $g = empty($stream_data[$pix_loc + 1]) ? 255 :
+                                    ord($stream_data[$pix_loc + 1]);
+                                $b = empty($stream_data[$pix_loc + 2]) ? 255 :
+                                    ord($stream_data[$pix_loc + 2]);
+                                $pix_loc += 3;
+                            } else {
+                                $r = empty($stream_data[$pix_loc]) ? 255 :
+                                    ord($stream_data[$pix_loc]);
+                                $g = $r;
+                                $b = $r;
+                                $pix_loc++;
+                            }
+                            $color = imagecolorallocate($image, $r, $g, $b);
+                            imagesetpixel($image, $x, $y, $color);
+                        }
+                    }
+                }
+                $temp_file = $temp_dir . L\crawlHash($stream_data) . ".png";
+                if ($image) {
+                    imagepng($image, $temp_file);
+                    $ocr_data = ComputerVision::recognizeText($temp_file,
+                        [$lang]);
+                    if (!empty($ocr_data)) {
+                        $out  .= $ocr_data;
+                    }
+                    @unlink($temp_file);
+                }
+            } else if (self::objectDictionaryHas(
                 $object_dictionary, ["Type", "Font", "FontDescriptor"])) {
                 $state = "font";
                 continue;
diff --git a/src/library/processors/PngProcessor.php b/src/library/processors/PngProcessor.php
index a8145ad84..264d04ce7 100755
--- a/src/library/processors/PngProcessor.php
+++ b/src/library/processors/PngProcessor.php
@@ -31,6 +31,7 @@
 namespace seekquarry\yioop\library\processors;

 use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library\ComputerVision;
 use seekquarry\yioop\library\UrlParser;

 /**
@@ -80,14 +81,23 @@ class PngProcessor extends ImageProcessor
             $summary = [];
             $this->addWidthHeightSummary($summary, $page);
             $summary[self::TITLE] = "";
+            $summary[self::DESCRIPTION] = UrlParser::getDocumentFilename($url)
+                . "\n";
+            if (ComputerVision::ocrEnabled()) {
+                set_error_handler(null);
+                $temp_file = $this->saveTempFile($page, $url, "png");
+                $lang = UrlParser::getLang($url);
+                $ocr_data = ComputerVision::recognizeText($temp_file, [$lang]);
+                if (!empty($ocr_data)) {
+                    $summary[self::DESCRIPTION] .= $ocr_data;
+                }
+                @unlink($temp_file);
+                set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
+            }
             $xmp_data = $this->getXmpData($page);
             if ($xmp_data) {
-                $summary[self::DESCRIPTION] =
-                    UrlParser::getDocumentFilename($url) . "\nXMP Data\n".
+                $summary[self::DESCRIPTION] .= "\nXMP Data\n".
                     $xmp_data;
-            } else {
-                $summary[self::DESCRIPTION] =
-                    UrlParser::getDocumentFilename($url);
             }
             $summary[self::LINKS] = [];
             $summary[self::PAGE] =
diff --git a/src/locale/zh_CN/resources/term_weight.txt.gz b/src/locale/zh_CN/resources/term_weight.txt.gz
old mode 100644
new mode 100755
index 12823fa77..5f0c753af
Binary files a/src/locale/zh_CN/resources/term_weight.txt.gz and b/src/locale/zh_CN/resources/term_weight.txt.gz differ
diff --git a/tests/PdfProcessorTest.php b/tests/PdfProcessorTest.php
index 81cedaa4b..fe25bd561 100644
--- a/tests/PdfProcessorTest.php
+++ b/tests/PdfProcessorTest.php
@@ -32,6 +32,7 @@ namespace seekquarry\yioop\tests;

 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\library as L;
+use seekquarry\yioop\library\ComputerVision;
 use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\processors\PdfProcessor;
 use seekquarry\yioop\library\UnitTest;
@@ -51,12 +52,6 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants
      */
     public function setUp()
     {
-        $pdf_object = new PdfProcessor();
-        $url = "http://www.yioop.com/test.pdf";
-        $filename = C\PARENT_DIR . "/tests/test_files/test.pdf";
-        $page = file_get_contents($filename);
-        $summary = $pdf_object->process($page, $url);
-        $this->test_objects['summary'] = $summary;
     }
     /**
      * Delete any files associated with our test on PdfProcessor (in this case
@@ -71,8 +66,12 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants
      */
     public function wordExtractionTestCase()
     {
-        $words = explode(" ",
-            $this->test_objects['summary'][self::DESCRIPTION]);
+        $pdf_object = new PdfProcessor();
+        $url = "http://www.yioop.com/test.pdf";
+        $filename = C\PARENT_DIR . "/tests/test_files/test.pdf";
+        $page = file_get_contents($filename);
+        $summary = $pdf_object->process($page, $url);
+        $words = explode(" ", $summary[self::DESCRIPTION]);
         $this->assertTrue(in_array("Documentation", $words),
             "Word Extraction 1");
         $this->assertTrue(in_array("Yioop", $words),
@@ -80,4 +79,24 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants
         $this->assertTrue(in_array("Open", $words),
             "Word Extraction 3");
     }
+    /**
+     * Checks OCR extracts words from images in a PDF; no-op if OCR disabled
+     */
+    public function textFromImageTestCase()
+    {
+        if (ComputerVision::ocrEnabled()) {
+            $pdf_object = new PdfProcessor();
+            $url = "http://www.yioop.com/test2.pdf";
+            $filename = C\PARENT_DIR . "/tests/test_files/test2.pdf";
+            $page = file_get_contents($filename);
+            $summary = $pdf_object->process($page, $url);
+            $words = explode(" ", $summary[self::DESCRIPTION]);
+            $this->assertTrue(in_array("Maureen", $words),
+                "Word From Image Extraction 1");
+            $this->assertTrue(in_array("Phantom", $words),
+                "Word From Image Extraction 2");
+            $this->assertTrue(in_array("playing", $words),
+                "Word From Image Extraction 3");
+        }
+    }
 }
diff --git a/tests/test_files/test2.pdf b/tests/test_files/test2.pdf
new file mode 100644
index 000000000..006ea6c52
Binary files /dev/null and b/tests/test_files/test2.pdf differ

ViewGit