diff --git a/controllers/components/crawl_component.php b/controllers/components/crawl_component.php
index 07654d02b..7539a0cb2 100644
--- a/controllers/components/crawl_component.php
+++ b/controllers/components/crawl_component.php
@@ -1183,7 +1183,7 @@ class CrawlComponent extends Component implements CrawlConstants
$site[self::SERVER] = "unknown";
$site[self::SERVER_VERSION] = "unknown";
$site[self::OPERATING_SYSTEM] = "unknown";
- $site[self::LANG] = 'en';
+ $site[self::LANG] = 'en-US';
$site[self::JUST_METAS] = false;
if (isset($_REQUEST['page_type']) &&
in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
diff --git a/lib/locale_functions.php b/lib/locale_functions.php
index 570bc855e..8709bf9e2 100755
--- a/lib/locale_functions.php
+++ b/lib/locale_functions.php
@@ -97,7 +97,7 @@ function guessLocale()
*/
function guessLocaleFromString($phrase_string, $locale_tag = null)
{
- $query_string = $phrase_string;
+ $original_phrase_string = $phrase_string;
$locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag;
$sub = PUNCT."|[0-9]|\s";
$phrase_string = preg_replace('/'.$sub.'/u', "", $phrase_string);
@@ -148,7 +148,7 @@ function guessLocaleFromString($phrase_string, $locale_tag = null)
}
}
if ($locale_tag == 'en-US') {
- $locale_tag = checkQuery($query_string);
+ $locale_tag = checkQuery($original_phrase_string);
}
return $locale_tag;
}
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 73c634bfd..050caa321 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -59,6 +59,10 @@ require_once BASE_DIR."/lib/summarizers/graph_based_summarizer.php";
* So can extract parts of the URL if need to guess lang
*/
require_once BASE_DIR."/lib/url_parser.php";
+/**
+ * Used to try to guess locales from string samples
+ */
+require_once BASE_DIR."/lib/locale_functions.php";
/**
* Parent class common to all processors used to create crawl summary
* information that involves basically text data
@@ -96,8 +100,7 @@ class TextProcessor extends PageProcessor
$summary[self::DESCRIPTION] = mb_substr($page, 0,
self::$max_description_len);
}
- $summary[self::LANG] = self::calculateLang(
- $summary[self::DESCRIPTION]);
+ $summary[self::LANG] = $lang;
$summary[self::LINKS] = self::extractHttpHttpsUrls($page);
$summary[self::PAGE] = "<html><body><div><pre>".
strip_tags($page)."</pre></div></body></html>";
@@ -118,23 +121,10 @@ class TextProcessor extends PageProcessor
{
if ($url != null) {
$lang = UrlParser::getLang($url);
- if ($lang != null) return $lang;
+ if ($lang != null) { return $lang; }
}
if ($sample_text != null){
- $words = mb_split("[[:space:]]|".PUNCT, $sample_text);
- $num_words = count($words);
- $ascii_count = 0;
- foreach ($words as $word) {
- if (strlen($word) == mb_strlen($word)) {
- $ascii_count++;
- }
- }
- // crude, but let's guess ASCII == english
- if ($ascii_count/$num_words > EN_RATIO) {
- $lang = 'en';
- } else {
- $lang = null;
- }
+ $lang = guessLocaleFromString($sample_text);
} else {
$lang = null;
}
diff --git a/locale/fa/resources/tokenizer.php b/locale/fa/resources/tokenizer.php
index a7e0a1533..fdccec0cb 100755
--- a/locale/fa/resources/tokenizer.php
+++ b/locale/fa/resources/tokenizer.php
@@ -28,11 +28,204 @@
*/
if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
- * Persian specific tokenization code. Typically, tokenizer.php
- * either contains a stemmer for the language in question or
- * it specifies how many characters in a char gram
+ * Persian specific tokenization code. In particular, it has a stemmer.
+ * The stemmer is my stab at porting Nick Patch's Perl port,
+ * https://metacpan.org/pod/Lingua::Stem::UniNE::FA, of the
+ * stemming algorithm by Ljiljana Dolamic and Jacques
+ * Savoy of the University of Neuchâtel
+ * http://members.unine.ch/jacques.savoy/clef/persianStemmerUnicode.txt
+ * Here given a word, its stem is that part of the word that
+ * is common to all its inflected variants. For example,
+ * tall is common to tall, taller, tallest. A stemmer takes
+ * a word and tries to produce its stem.
*
* @author Chris Pollett
* @package seek_quarry\locale\fa
*/
-$GLOBALS['CHARGRAMS']['fa'] = 5;
+class FaTokenizer
+{
+ /**
+ * Words we don't want to be stemmed
+ * @var array
+ */
+ static $no_stem_list = array();
+ /**
+ * Stub function which could be used for a word segmenter.
+ * Such a segmenter on input thisisabunchofwords would output
+ * this is a bunch of words
+ *
+ * @param string $pre_segment before segmentation
+ * @return string should return string with words separated by space
+ * in this case does nothing
+ */
+ static function segment($pre_segment)
+ {
+ return $pre_segment; // stub: the input is handed back unchanged
+ }
+ /**
+ * Removes the stop words from the page (used for Word Cloud generation)
+ *
+ * @param string $page the page to remove stop words from.
+ * @return string $page with no stop words
+ */
+ static function stopwordsRemover($page)
+ {
+ $stop_words = array( // literal stop words, joined into one pattern below
+ "در", "به", "از", "كه", "مي", "اين", "است", "را", "با", "هاي",
+ "براي", "آن", "يك", "شود", "شده","خود", "ها", "كرد", "شد", "اي",
+ "تا", "كند", "بر", "بود", "گفت", "نيز", "وي", "هم", "كنند",
+ "دارد", "ما", "كرده", "يا", "اما", "بايد", "دو", "اند", "هر",
+ "خواهد", "او", "مورد", "آنها", "باشد", "ديگر", "مردم", "نمي",
+ "بين", "پيش", "پس", "اگر", "همه", "صورت", "يكي", "هستند",
+ "بي", "من", "دهد", "هزار", "نيست", "استفاده", "داد", "داشته",
+ "راه", "داشت", "چه", "همچنين", "كردند", "داده", "بوده",
+ "دارند", "همين", "ميليون", "سوي", "شوند", "بيشتر", "بسيار",
+ "روي", "گرفته", "هايي", "تواند", "اول", "نام", "هيچ", "چند",
+ "جديد", "بيش", "شدن", "كردن", "كنيم", "نشان", "حتي", "اينكه",
+ "ولی", "توسط", "چنين", "برخي", "نه", "ديروز", "دوم",
+ "درباره", "بعد", "مختلف", "گيرد", "شما", "گفته", "آنان",
+ "بار", "طور", "گرفت", "دهند", "گذاري", "بسياري", "طي",
+ "بودند", "ميليارد", "بدون", "تمام", "كل", "تر",
+ "براساس", "شدند", "ترين", "امروز", "باشند", "ندارد",
+ "چون", "قابل", "گويد", "ديگري", "همان", "خواهند",
+ "قبل", "آمده", "اكنون", "تحت", "طريق", "گيري", "جاي",
+ "هنوز", "چرا", "البته", "كنيد", "سازي", "سوم", "كنم",
+ "بلكه", "زير", "توانند", "ضمن", "فقط", "بودن", "حق",
+ "آيد", "وقتي", "اش", "يابد", "نخستين", "مقابل", "خدمات",
+ "امسال", "تاكنون", "مانند", "تازه", "آورد", "فكر",
+ "آنچه", "نخست", "نشده", "شايد", "چهار", "جريان",
+ "پنج", "ساخته", "زيرا", "نزديك", "برداري", "كسي",
+ "ريزي", "رفت", "گردد", "مثل", "آمد", "ام", "بهترين",
+ "دانست", "كمتر", "دادن", "تمامي", "جلوگيري",
+ "بيشتري", "ايم", "ناشي", "چيزي", "آنكه", "بالا",
+ "بنابراين", "ايشان", "بعضي", "دادند", "داشتند",
+ "برخوردار", "نخواهد", "هنگام", "نبايد", "غير", "نبود",
+ "ديده", "وگو", "داريم", "چگونه", "بندي", "خواست", "فوق", "ده",
+ "نوعي", "هستيم", "ديگران", "همچنان", "سراسر", "ندارند",
+ "گروهي", "سعي", "روزهاي", "آنجا", "يكديگر", "كردم",
+ "بيست", "بروز", "سپس", "رفته", "آورده", "نمايد",
+ "باشيم", "گويند", "زياد", "خويش", "همواره", "گذاشته",
+ "شش", "نداشته", "شناسي", "خواهيم", "آباد", "داشتن",
+ "نظير", "همچون", "باره", "نكرده", "شان", "سابق",
+ "هفت", "دانند", "جايي", "بی", "جز", "زیرِ", "رویِ",
+ "سریِ", "تویِ", "جلویِ", "پیشِ", "عقبِ", "بالایِ",
+ "خارجِ", "وسطِ", "بیرونِ", "سویِ", "کنارِ", "پاعینِ",
+ "نزدِ", "نزدیکِ","دنبالِ", "حدودِ", "برابرِ", "طبقِ",
+ "مانندِ", "ضدِّ", "هنگامِ", "برایِ", "مثلِ", "بارة",
+ "اثرِ", "تولِ", "علّتِ", "سمتِ", "عنوانِ", "قصدِ",
+ "روب", "جدا", "کی", "که", "چیست", "هست", "کجا", "کجاست",
+ "کَی", "چطور", "کدام", "آیا", "مگر", "چندین",
+ "یک", "چیزی", "دیگر", "کسی", "بعری", "هیچ", "چیز",
+ "جا", "کس", "هرگز", "یا", "تنها", "بلکه", "خیاه",
+ "بله", "بلی", "آره", "آری", "مرسی", "البتّه",
+ "لطفاً", "ّه", "انکه",
+ "وقتیکه", "همین", "پیش", "مدّتی", "هنگامی", "مان", "تان"
+ );
+ $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '', // one \b-anchored alternation
+ mb_strtolower($page)); // subject is lower-cased first, so the result is lower-case too
+ return $page;
+ }
+ /**
+ * Computes the stem of a Persian word
+ *
+ * @param string $word the string to stem
+ * @return string the stem of $word
+ */
+ static function stem($word)
+ {
+     // exact (strict) match only: listed words are returned as given
+     if (in_array($word, self::$no_stem_list, true)) {
+         return $word;
+     }
+     $word = mb_strtolower($word);
+     // kasra is stripped on both sides of the suffix removal step
+     $word = self::removeKasra($word);
+     $word = self::removeSuffix($word);
+     return self::removeKasra($word);
+ }
+ /**
+ * Removes a Kasra diacritic mark if it appears
+ * at the end of a word.
+ * @param string $word word to remove mark from
+ * @return string result of removal
+ */
+ static function removeKasra($word)
+ {
+     if (mb_strlen($word) >= 5) { // shorter words are left untouched
+         // U+0650 decoded from a JSON escape, anchored at the word's end
+         $trailing_kasra = '/'.json_decode('"\u0650"').'$/u';
+         $word = preg_replace($trailing_kasra, "", $word);
+     }
+     return $word;
+ }
+ /**
+ * Removes common Persian suffixes
+ *
+ * @param string $word to remove suffixes from
+ * @return string result of suffix removal
+ */
+ static function removeSuffix($word)
+ {
+     $length = mb_strlen($word); // thresholds below are in characters
+     if ($length > 7) {
+         $modified_word = preg_replace("/(?:
+ آباد | باره | بندی | بندي | ترین | ترين | ریزی |
+ ريزي | سازی | سازي | گیری | گيري | هایی | هايي
+ ) $/xu", "", $word);
+         if ($modified_word !== $word) {
+             return $modified_word;
+         }
+     }
+     if ($length > 6) {
+         $modified_word = preg_replace("/(?:
+ اند | ایم | ايم | شان | های | هاي
+ ) $/xu", "", $word);
+         if ($modified_word !== $word) {
+             return $modified_word;
+         }
+     }
+     if ($length > 5) {
+         $modified_word = preg_replace("/ ان $/xu", "", $word);
+         if ($modified_word !== $word) { // hand on the stripped form, not $word
+             return self::normalize($modified_word);
+         }
+         $modified_word = preg_replace("/(?:
+ ات | اش | ام | تر | را | ون | ها | هء | ین | ين
+ ) $/xu", "", $word);
+         if ($modified_word !== $word) {
+             return $modified_word;
+         }
+     }
+     if ($length > 3) {
+         $modified_word = preg_replace("/(?: ت | ش | م | ه | ی | ي ) $/xu",
+             "", $word);
+         if ($modified_word !== $word) {
+             return $modified_word;
+         }
+     }
+     return $word; // no suffix matched at any length tier
+ }
+ /**
+ * Performs additional end word stripping
+ *
+ * @param string $word to remove suffixes from
+ * @return string result of suffix removal
+ */
+ static function normalize($word)
+ {
+     if (mb_strlen($word) < 4) {
+         return $word;
+     }
+     $modified_word = preg_replace("/(?: ت | ر | ش | گ | م | ى ) $/xu", "",
+         $word);
+     if ($modified_word === $word) {
+         return $word; // no trailing letter removed, nothing more to do
+     }
+     if (mb_strlen($modified_word) < 4) {
+         return $modified_word;
+     }
+     // plain assignment: the ported Perl "=~" parsed in PHP as "= ~", which
+     // bitwise-negated the result instead of storing it
+     return preg_replace("/(?: ی | ي ) $/xu", "", $modified_word);
+ }
+}