diff --git a/src/library/ContextTagger.php b/src/library/ContextTagger.php index 549760cae..5a10abc4d 100644 --- a/src/library/ContextTagger.php +++ b/src/library/ContextTagger.php @@ -129,6 +129,11 @@ abstract class ContextTagger * before adding term to sentence term array * @param function $tag_callback callback function applied to a part of * speech tag before adding tag to sentence tag array + * @param bool $tag_on_array_chars for some kinds of text processing + * it is better to assume the tags are applied to each char within a term + * rather than at the term level. For example, we might want to use + * char within a term for named entity tagging. This flag, if true, + * says to do this; otherwise don't * @return array of separated sentences, each sentence having the format of * [[terms...], [tags...]] * Currently, the training data needs to fit Chinese Treebank format: @@ -137,9 +142,10 @@ abstract class ContextTagger * To adapt to other language, some modifications are needed */ public static function processTexts($text_files, $term_tag_separator = "_", - $term_callback = null, $tag_callback = null) + $term_callback = null, $tag_callback = null, + $tag_on_array_chars = false) { - $ret = []; + $out = []; foreach($text_files as $text_file) { if (file_exists($text_file)) { $fh = fopen($text_file, "r"); @@ -148,27 +154,36 @@ abstract class ContextTagger if(strpos($line, '<') !== false) { continue; } - $word_tag_pairs = preg_split("/[\s ]+/u", $line); - if (!count($word_tag_pairs)) { + $term_tag_pairs = preg_split("/[\s ]+/u", $line); + if (!count($term_tag_pairs)) { continue; } - $ret[] = []; - $ret[count($ret) - 1][0] = []; - $ret[count($ret) - 1][1] = []; - foreach ($word_tag_pairs as $word_tag_pair) { - $t = explode($term_tag_separator, $word_tag_pair); + $out[] = []; + $last_out = count($out) - 1; + $out[$last_out][0] = []; + $out[$last_out][1] = []; + foreach ($term_tag_pairs as $term_tag_pair) { + $t = explode($term_tag_separator, $term_tag_pair); if (count($t) 
== 2) { - $ret[count($ret) - 1][0][] = - $term_callback ? $term_callback($t[0]) : $t[0]; - $ret[count($ret) - 1][1][] = - $tag_callback ? $tag_callback($t[1]) : $t[1]; + $tag = $tag_callback ? $tag_callback($t[1]) : $t[1]; + if ($tag_on_array_chars) { + $to_tags = preg_split('//u', $t[0], null, + PREG_SPLIT_NO_EMPTY); + } else { + $to_tags = [$t[0]]; + } + foreach($to_tags as $to_tag) { + $out[$last_out][0][] = $term_callback ? + $term_callback($to_tag) : $to_tag; + $out[$last_out][1][] = $tag; + } } } } fclose($fh); } } - return $ret; + return $out; } /** * Maps a term to a corresponding key if the term matches some simple diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php index 53de40890..c402393fd 100644 --- a/src/library/NamedEntityContextTagger.php +++ b/src/library/NamedEntityContextTagger.php @@ -98,7 +98,7 @@ class NamedEntityContextTagger extends ContextTagger echo "Reading files... \n"; // term_tag_sentences[sentence#] = [[words...], [tags...]] $term_tag_sentences = self::processTexts($text_files, - $term_tag_separator, $term_callback, $tag_callback); + $term_tag_separator, $term_callback, $tag_callback, true); $this->word_feature = []; $this->tag_set = []; $tag_index = 0; diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini index 827fa57ba..d6adedb92 100755 --- a/src/locale/ar/configure.ini +++ b/src/locale/ar/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "بي إتش بي محرك البحث-يوب!: %s" rss_layout_description = "نتائج البحث ل: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini index 48f7cbeba..d4f4254c8 100755 --- a/src/locale/bn/configure.ini +++ b/src/locale/bn/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "পিএইচপি সার্চ ইঞ্জিন - Y rss_layout_description = "জন্য অনুসন্ধান ফলাফল: %s" ; ; View.php -view_locale_version = "7" 
+view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini index 23cdf4c0b..1647e8b6e 100755 --- a/src/locale/de/configure.ini +++ b/src/locale/de/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Suchmaschine - Yioop! : %s" rss_layout_description = "Suchergebnisse für: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini index 55a51ceb1..2f168c048 100644 --- a/src/locale/en_US/configure.ini +++ b/src/locale/en_US/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Search Engine - Yioop! : %s" rss_layout_description = "Search results for: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini index 49a19c091..7994547e5 100755 --- a/src/locale/es/configure.ini +++ b/src/locale/es/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Motor de Búsqueda - Yioop! : %s" rss_layout_description = "Resultados de la búsqueda por: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini index 61a82b9cf..021cf2edf 100755 --- a/src/locale/fa/configure.ini +++ b/src/locale/fa/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "موتور جستجوی PHP - Yioop! 
: %s" rss_layout_description = "نتایج جستجو برای: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini index 0dbcb7038..76c54bab2 100755 --- a/src/locale/fr_FR/configure.ini +++ b/src/locale/fr_FR/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "Moteur de recherche PHP -Yioop! %s" rss_layout_description = "%s résultats" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini index 41036cc28..4bf1c0807 100755 --- a/src/locale/he/configure.ini +++ b/src/locale/he/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP מנוע חיפוש - Yioop! : %s" rss_layout_description = "תוצאות חיפוש עבור: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini index 54c8c646e..3db494619 100755 --- a/src/locale/hi/configure.ini +++ b/src/locale/hi/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP खोज इंजन - Yioop! : %s" rss_layout_description = "खोज परिणाम के लिए: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/id/configure.ini b/src/locale/id/configure.ini index ab109d8f3..73a883527 100755 --- a/src/locale/id/configure.ini +++ b/src/locale/id/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "Mesin Pencari PHP - Yioop! 
: %s" rss_layout_description = "Hasil pencarian untuk: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini index 76dd0fdb0..714af639d 100755 --- a/src/locale/it/configure.ini +++ b/src/locale/it/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "Yioop! Motore di Ricerca in PHP: %s" rss_layout_description = "Risultati di ricerca per: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini index 4b492fe9b..fba1b3c64 100755 --- a/src/locale/ja/configure.ini +++ b/src/locale/ja/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHPの検索エンジン-Yioop! :%s" rss_layout_description = "検索結果:%s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini index 8f6d6d309..686855317 100755 --- a/src/locale/kn/configure.ini +++ b/src/locale/kn/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "ಪಿಹೆಚಪಿ ಶೋಧನಾ ಯಂತ್ರ - ಯ rss_layout_description = "ಈ ಶೋಧನಾ ಫಲಿತಾಂಶಗಳು ನಿಮ್ಮ ಪ್ರಶ್ನೆ: %s ಯ ಉತ್ತರ" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini index a0cc690b3..73681afaa 100755 --- a/src/locale/ko/configure.ini +++ b/src/locale/ko/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP 검색 엔진 - Yioop! 
: %s" rss_layout_description = "%s 에 대한 서치 결과:" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini index 7e45b9a10..d5d49eedc 100644 --- a/src/locale/nl/configure.ini +++ b/src/locale/nl/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Search Engine - Yioop! :%S" rss_layout_description = "Zoek resultaten voor: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini index af53b6509..3c045a9fd 100755 --- a/src/locale/pl/configure.ini +++ b/src/locale/pl/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "Wyszukaj silniku PHP - Yioop! : %s" rss_layout_description = "Wyniki wyszukiwania dla: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini index f7853ae13..3689a8d27 100755 --- a/src/locale/pt/configure.ini +++ b/src/locale/pt/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Motor de Busca - Yioop! : %s" rss_layout_description = "Resultados da pesquisa para: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini index 41d532707..8ea318114 100755 --- a/src/locale/ru/configure.ini +++ b/src/locale/ru/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "Поиск движке PHP - Yioop! 
: %s" rss_layout_description = "Результаты поиска для: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini index 6ea29dd33..a40acf366 100644 --- a/src/locale/te/configure.ini +++ b/src/locale/te/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP శోధన ఇంజిన్ - Yioop! : %s" rss_layout_description = "కోసం శోధన ఫలితాలు: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini index c172be812..7e46d6352 100755 --- a/src/locale/th/configure.ini +++ b/src/locale/th/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "ค้นหาเกี่ยวกับ phpquery เค rss_layout_description = "ผลการค้นหาสำหรับ:ทั้งหมด %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/tl/configure.ini b/src/locale/tl/configure.ini index a172c32d9..e3080e41e 100644 --- a/src/locale/tl/configure.ini +++ b/src/locale/tl/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Search Engine - Yioop! : %s" rss_layout_description = "Ang mga resulta ng paghahanap para sa: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini index b063f16bc..c720d16e1 100755 --- a/src/locale/tr/configure.ini +++ b/src/locale/tr/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP Arama Motoru - Yioop! 
: %s" rss_layout_description = "%s için arama sonuçları: " ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini index e4dd22063..4b7fe49c3 100755 --- a/src/locale/vi_VN/configure.ini +++ b/src/locale/vi_VN/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "Công Cụ Tìm kiếm PHP - Yioop! : %s" rss_layout_description = "Kết quả tìm kiếm: %s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini index 57845ebdb..5dd4122be 100755 --- a/src/locale/zh_CN/configure.ini +++ b/src/locale/zh_CN/configure.ini @@ -1778,7 +1778,7 @@ rss_layout_title = "PHP搜索引擎-Yioop! :%s" rss_layout_description = "搜索结果为:%s" ; ; View.php -view_locale_version = "7" +view_locale_version = "8" view_logo_alt_text = "Yioop" ; ; /src/views/helpers