Trying to apply Forrest's correction to entity tagging, a=chris

Chris Pollett [2020-07-12 18:Jul:th]
Trying to apply Forrest's correction to entity tagging, a=chris
Filename
src/library/ContextTagger.php
src/library/NamedEntityContextTagger.php
src/locale/ar/configure.ini
src/locale/bn/configure.ini
src/locale/de/configure.ini
src/locale/en_US/configure.ini
src/locale/es/configure.ini
src/locale/fa/configure.ini
src/locale/fr_FR/configure.ini
src/locale/he/configure.ini
src/locale/hi/configure.ini
src/locale/id/configure.ini
src/locale/it/configure.ini
src/locale/ja/configure.ini
src/locale/kn/configure.ini
src/locale/ko/configure.ini
src/locale/nl/configure.ini
src/locale/pl/configure.ini
src/locale/pt/configure.ini
src/locale/ru/configure.ini
src/locale/te/configure.ini
src/locale/th/configure.ini
src/locale/tl/configure.ini
src/locale/tr/configure.ini
src/locale/vi_VN/configure.ini
src/locale/zh_CN/configure.ini
diff --git a/src/library/ContextTagger.php b/src/library/ContextTagger.php
index 549760cae..5a10abc4d 100644
--- a/src/library/ContextTagger.php
+++ b/src/library/ContextTagger.php
@@ -129,6 +129,11 @@ abstract class ContextTagger
      *  before adding term to sentence term array
      * @param function $tag_callback callback function applied to a part of
      *  speech tag  before adding tag to sentence tag array
+     * @param bool $tag_on_array_chars for some kinds of text processing
+     *  it is better to assume the tags are applied to each char within a term
+     *  rather than at the term level. For example, we might want to tag each
+     *  char within a term for named entity tagging. If this flag is true,
+     *  each char of a term gets the term's tag; otherwise the term is tagged
      * @return array of separated sentences, each sentence having the format of
      *  [[terms...], [tags...]]
      *  Currently, the training data needs to fit Chinese Treebank format:
@@ -137,9 +142,10 @@ abstract class ContextTagger
      *  To adapt to other language, some modifications are needed
      */
     public static function processTexts($text_files, $term_tag_separator = "_",
-        $term_callback = null, $tag_callback = null)
+        $term_callback = null, $tag_callback = null,
+        $tag_on_array_chars = false)
     {
-        $ret = [];
+        $out = [];
         foreach($text_files as $text_file) {
             if (file_exists($text_file)) {
                 $fh = fopen($text_file, "r");
@@ -148,27 +154,36 @@ abstract class ContextTagger
                     if(strpos($line, '<') !== false) {
                         continue;
                     }
-                    $word_tag_pairs = preg_split("/[\s ]+/u", $line);
-                    if (!count($word_tag_pairs)) {
+                    $term_tag_pairs = preg_split("/[\s ]+/u", $line);
+                    if (!count($term_tag_pairs)) {
                         continue;
                     }
-                    $ret[] = [];
-                    $ret[count($ret) - 1][0] = [];
-                    $ret[count($ret) - 1][1] = [];
-                    foreach ($word_tag_pairs as $word_tag_pair) {
-                        $t = explode($term_tag_separator, $word_tag_pair);
+                    $out[] = [];
+                    $last_out = count($out) - 1;
+                    $out[$last_out][0] = [];
+                    $out[$last_out][1] = [];
+                    foreach ($term_tag_pairs as $term_tag_pair) {
+                        $t = explode($term_tag_separator, $term_tag_pair);
                         if (count($t) == 2) {
-                            $ret[count($ret) - 1][0][] =
-                                $term_callback ? $term_callback($t[0]) : $t[0];
-                            $ret[count($ret) - 1][1][] =
-                                $tag_callback ? $tag_callback($t[1]) : $t[1];
+                            $tag = $tag_callback ? $tag_callback($t[1]) : $t[1];
+                            if ($tag_on_array_chars) {
+                                $to_tags = preg_split('//u', $t[0], null,
+                                    PREG_SPLIT_NO_EMPTY);
+                            } else {
+                                $to_tags = [$t[0]];
+                            }
+                            foreach($to_tags as $to_tag) {
+                                $out[$last_out][0][] = $term_callback ?
+                                    $term_callback($to_tag) : $to_tag;
+                                $out[$last_out][1][] = $tag;
+                            }
                         }
                     }
                 }
                 fclose($fh);
             }
         }
-        return $ret;
+        return $out;
     }
     /**
      * Maps a term to a corresponding key if the term matches some simple
diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php
index 53de40890..c402393fd 100644
--- a/src/library/NamedEntityContextTagger.php
+++ b/src/library/NamedEntityContextTagger.php
@@ -98,7 +98,7 @@ class NamedEntityContextTagger extends ContextTagger
         echo "Reading files... \n";
         // term_tag_sentences[sentence#] = [[words...], [tags...]]
         $term_tag_sentences = self::processTexts($text_files,
-            $term_tag_separator, $term_callback, $tag_callback);
+            $term_tag_separator, $term_callback, $tag_callback, true);
         $this->word_feature = [];
         $this->tag_set = [];
         $tag_index = 0;
diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini
index 827fa57ba..d6adedb92 100755
--- a/src/locale/ar/configure.ini
+++ b/src/locale/ar/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "بي إتش بي محرك البحث-يوب!:  %s"
 rss_layout_description = "نتائج البحث ل:  %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini
index 48f7cbeba..d4f4254c8 100755
--- a/src/locale/bn/configure.ini
+++ b/src/locale/bn/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "পিএইচপি সার্চ ইঞ্জিন - Y
 rss_layout_description = "জন্য অনুসন্ধান ফলাফল: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini
index 23cdf4c0b..1647e8b6e 100755
--- a/src/locale/de/configure.ini
+++ b/src/locale/de/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Suchmaschine - Yioop! : %s"
 rss_layout_description = "Suchergebnisse f&uuml;r: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini
index 55a51ceb1..2f168c048 100644
--- a/src/locale/en_US/configure.ini
+++ b/src/locale/en_US/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Search Engine - Yioop! : %s"
 rss_layout_description = "Search results for: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini
index 49a19c091..7994547e5 100755
--- a/src/locale/es/configure.ini
+++ b/src/locale/es/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Motor de B&uacute;squeda - Yioop! : %s"
 rss_layout_description = "Resultados de la b&uacute;squeda por: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini
index 61a82b9cf..021cf2edf 100755
--- a/src/locale/fa/configure.ini
+++ b/src/locale/fa/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "موتور جستجوی PHP - Yioop! : %s"
 rss_layout_description = "نتایج جستجو برای: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini
index 0dbcb7038..76c54bab2 100755
--- a/src/locale/fr_FR/configure.ini
+++ b/src/locale/fr_FR/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "Moteur de recherche PHP -Yioop! %s"
 rss_layout_description = "%s r&eacute;sultats"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini
index 41036cc28..4bf1c0807 100755
--- a/src/locale/he/configure.ini
+++ b/src/locale/he/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP מנוע חיפוש - Yioop! : %s"
 rss_layout_description = "תוצאות חיפוש עבור: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini
index 54c8c646e..3db494619 100755
--- a/src/locale/hi/configure.ini
+++ b/src/locale/hi/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP खोज इंजन - Yioop! : %s"
 rss_layout_description = "खोज परिणाम के लिए: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/id/configure.ini b/src/locale/id/configure.ini
index ab109d8f3..73a883527 100755
--- a/src/locale/id/configure.ini
+++ b/src/locale/id/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "Mesin Pencari PHP - Yioop! : %s"
 rss_layout_description = "Hasil pencarian untuk: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini
index 76dd0fdb0..714af639d 100755
--- a/src/locale/it/configure.ini
+++ b/src/locale/it/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "Yioop! Motore di Ricerca in PHP: %s"
 rss_layout_description = "Risultati di ricerca per: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini
index 4b492fe9b..fba1b3c64 100755
--- a/src/locale/ja/configure.ini
+++ b/src/locale/ja/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHPの検索エンジン-Yioop! :%s"
 rss_layout_description = "検索結果:%s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini
index 8f6d6d309..686855317 100755
--- a/src/locale/kn/configure.ini
+++ b/src/locale/kn/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "ಪಿಹೆಚಪಿ ಶೋಧನಾ ಯಂತ್ರ - ಯ
 rss_layout_description = "ಈ ಶೋಧನಾ ಫಲಿತಾಂಶಗಳು ನಿಮ್ಮ ಪ್ರಶ್ನೆ: %s  ಯ ಉತ್ತರ"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini
index a0cc690b3..73681afaa 100755
--- a/src/locale/ko/configure.ini
+++ b/src/locale/ko/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP 검색 엔진 - Yioop! : %s"
 rss_layout_description = "%s 에 대한 서치 결과:"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini
index 7e45b9a10..d5d49eedc 100644
--- a/src/locale/nl/configure.ini
+++ b/src/locale/nl/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Search Engine - Yioop! :%S"
 rss_layout_description = "Zoek resultaten voor: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini
index af53b6509..3c045a9fd 100755
--- a/src/locale/pl/configure.ini
+++ b/src/locale/pl/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "Wyszukaj silniku PHP - Yioop! : %s"
 rss_layout_description = "Wyniki wyszukiwania dla: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini
index f7853ae13..3689a8d27 100755
--- a/src/locale/pt/configure.ini
+++ b/src/locale/pt/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Motor de Busca - Yioop! : %s"
 rss_layout_description = "Resultados da pesquisa para: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini
index 41d532707..8ea318114 100755
--- a/src/locale/ru/configure.ini
+++ b/src/locale/ru/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "Поиск движке PHP - Yioop! : %s"
 rss_layout_description = "Результаты поиска для: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini
index 6ea29dd33..a40acf366 100644
--- a/src/locale/te/configure.ini
+++ b/src/locale/te/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP శోధన ఇంజిన్ - Yioop! : %s"
 rss_layout_description = "కోసం శోధన ఫలితాలు: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini
index c172be812..7e46d6352 100755
--- a/src/locale/th/configure.ini
+++ b/src/locale/th/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "ค้นหาเกี่ยวกับ phpquery เค
 rss_layout_description = "ผลการค้นหาสำหรับ:ทั้งหมด %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/tl/configure.ini b/src/locale/tl/configure.ini
index a172c32d9..e3080e41e 100644
--- a/src/locale/tl/configure.ini
+++ b/src/locale/tl/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Search Engine - Yioop! : %s"
 rss_layout_description = "Ang mga resulta ng paghahanap para sa: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini
index b063f16bc..c720d16e1 100755
--- a/src/locale/tr/configure.ini
+++ b/src/locale/tr/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Arama Motoru - Yioop! : %s"
 rss_layout_description = "%s i&ccedil;in arama sonu&ccedil;ları: "
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini
index e4dd22063..4b7fe49c3 100755
--- a/src/locale/vi_VN/configure.ini
+++ b/src/locale/vi_VN/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "C&ocirc;ng Cụ T&igrave;m kiếm PHP - Yioop! : %s"
 rss_layout_description = "Kết quả tìm kiếm: %s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini
index 57845ebdb..5dd4122be 100755
--- a/src/locale/zh_CN/configure.ini
+++ b/src/locale/zh_CN/configure.ini
@@ -1778,7 +1778,7 @@ rss_layout_title = "PHP搜索引擎-Yioop! :%s"
 rss_layout_description = "搜索结果为:%s"
 ;
 ; View.php
-view_locale_version = "7"
+view_locale_version = "8"
 view_logo_alt_text = "Yioop"
 ;
 ; /src/views/helpers
ViewGit