Initial pass at adding a Persian Stemmer, a=chris

Chris Pollett [2015-06-06 23:Jun:th]

Initial pass at adding a Persian Stemmer, a=chris

Filename
controllers/components/crawl_component.php
lib/locale_functions.php
lib/processors/text_processor.php
locale/fa/resources/tokenizer.php

diff --git a/controllers/components/crawl_component.php b/controllers/components/crawl_component.php
index 07654d02b..7539a0cb2 100644
--- a/controllers/components/crawl_component.php
+++ b/controllers/components/crawl_component.php
@@ -1183,7 +1183,7 @@ class CrawlComponent extends Component implements CrawlConstants
             $site[self::SERVER] = "unknown";
             $site[self::SERVER_VERSION] = "unknown";
             $site[self::OPERATING_SYSTEM] = "unknown";
-            $site[self::LANG] = 'en';
+            $site[self::LANG] = 'en-US';
             $site[self::JUST_METAS] = false;
             if (isset($_REQUEST['page_type']) &&
                 in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
diff --git a/lib/locale_functions.php b/lib/locale_functions.php
index 570bc855e..8709bf9e2 100755
--- a/lib/locale_functions.php
+++ b/lib/locale_functions.php
@@ -97,7 +97,7 @@ function guessLocale()
  */
 function guessLocaleFromString($phrase_string, $locale_tag = null)
 {
-    $query_string = $phrase_string;
+    $original_phrase_string = $phrase_string;
     $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag;
     $sub = PUNCT."|[0-9]|\s";
     $phrase_string = preg_replace('/'.$sub.'/u', "", $phrase_string);
@@ -148,7 +148,7 @@ function guessLocaleFromString($phrase_string, $locale_tag = null)
         }
     }
     if ($locale_tag == 'en-US') {
-        $locale_tag = checkQuery($query_string);
+        $locale_tag = checkQuery($original_phrase_string);
     }
     return $locale_tag;
 }
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 73c634bfd..050caa321 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -59,6 +59,10 @@ require_once BASE_DIR."/lib/summarizers/graph_based_summarizer.php";
  * So can extract parts of the URL if need to guess lang
  */
 require_once BASE_DIR."/lib/url_parser.php";
+/**
+ * To try to guess locales from string samples
+ */
+require_once BASE_DIR."/lib/locale_functions.php";
 /**
  * Parent class common to all processors used to create crawl summary
  * information  that involves basically text data
@@ -96,8 +100,7 @@ class TextProcessor extends PageProcessor
                 $summary[self::DESCRIPTION] = mb_substr($page, 0,
                     self::$max_description_len);
             }
-            $summary[self::LANG] = self::calculateLang(
-                $summary[self::DESCRIPTION]);
+            $summary[self::LANG] = $lang;
             $summary[self::LINKS] = self::extractHttpHttpsUrls($page);
             $summary[self::PAGE] = "<html><body><div><pre>".
                 strip_tags($page)."</pre></div></body></html>";
@@ -118,23 +121,10 @@ class TextProcessor extends PageProcessor
     {
         if ($url != null) {
             $lang = UrlParser::getLang($url);
-            if ($lang != null) return $lang;
+            if ($lang != null) { return $lang; }
         }
         if ($sample_text != null){
-            $words = mb_split("[[:space:]]|".PUNCT, $sample_text);
-            $num_words = count($words);
-            $ascii_count = 0;
-            foreach ($words as $word) {
-                if (strlen($word) == mb_strlen($word)) {
-                    $ascii_count++;
-                }
-            }
-            // crude, but let's guess ASCII == english
-            if ($ascii_count/$num_words > EN_RATIO) {
-                $lang = 'en';
-            } else {
-                $lang = null;
-            }
+            $lang = guessLocaleFromString($sample_text);
         } else {
             $lang = null;
         }
diff --git a/locale/fa/resources/tokenizer.php b/locale/fa/resources/tokenizer.php
index a7e0a1533..fdccec0cb 100755
--- a/locale/fa/resources/tokenizer.php
+++ b/locale/fa/resources/tokenizer.php
@@ -28,11 +28,204 @@
  */
 if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 /**
- * Persian specific tokenization code. Typically, tokenizer.php
- * either contains a stemmer for the language in question or
- * it specifies how many characters in a char gram
+ * Persian specific tokenization code. In particular, it has a stemmer.
+ * The stemmer is my stab at porting Nick Patch's Perl port,
+ * https://metacpan.org/pod/Lingua::Stem::UniNE::FA, of the
+ * stemming algorithm by Ljiljana Dolamic and Jacques
+ * Savoy of the University of Neuchâtel
+ * http://members.unine.ch/jacques.savoy/clef/persianStemmerUnicode.txt
+ * Here given a word, its stem is that part of the word that
+ * is common to all its inflected variants. For example,
+ * tall is common to tall, taller, tallest. A stemmer takes
+ * a word and tries to produce its stem.
  *
  * @author Chris Pollett
  * @package seek_quarry\locale\fa
  */
-$GLOBALS['CHARGRAMS']['fa'] = 5;
+class FaTokenizer
+{
+    /**
+     * Words we don't want to be stemmed
+     * @var array
+     */
+    static $no_stem_list = array();
+    /**
+     * Stub for a word segmenter. A real segmenter given the input
+     * thisisabunchofwords would output: this is a bunch of words
+     * This implementation performs no segmentation at all.
+     *
+     * @param string $pre_segment string before segmentation
+     * @return string a segmenter would return the words separated by
+     *     spaces; this stub returns $pre_segment unchanged
+     */
+    static function segment($pre_segment)
+    {
+        return $pre_segment;
+    }
+    /**
+     * Removes Persian stop words from a page (used for Word Cloud
+     * generation); the page is lowercased before the words are matched.
+     * @param string $page the page to remove stop words from.
+     * @return string lowercased $page with the stop words deleted
+     */
+    static function stopwordsRemover($page)
+    {
+        $stop_words = array(
+            "در", "به", "از", "كه", "مي", "اين", "است", "را", "با", "هاي",
+            "براي", "آن", "يك", "شود", "شده","خود", "ها", "كرد", "شد", "اي",
+            "تا", "كند", "بر", "بود", "گفت", "نيز", "وي", "هم", "كنند",
+            "دارد", "ما", "كرده", "يا", "اما", "بايد", "دو", "اند", "هر",
+            "خواهد", "او", "مورد", "آنها", "باشد", "ديگر", "مردم", "نمي",
+            "بين", "پيش", "پس", "اگر", "همه", "صورت", "يكي", "هستند",
+            "بي", "من", "دهد", "هزار", "نيست", "استفاده", "داد", "داشته",
+            "راه", "داشت", "چه", "همچنين", "كردند", "داده", "بوده",
+            "دارند", "همين", "ميليون", "سوي", "شوند", "بيشتر", "بسيار",
+            "روي", "گرفته", "هايي", "تواند", "اول", "نام", "هيچ", "چند",
+            "جديد", "بيش", "شدن", "كردن", "كنيم", "نشان", "حتي", "اينكه",
+            "ولی", "توسط", "چنين", "برخي", "نه", "ديروز", "دوم",
+            "درباره", "بعد", "مختلف", "گيرد", "شما", "گفته", "آنان",
+            "بار", "طور", "گرفت", "دهند", "گذاري", "بسياري", "طي",
+            "بودند", "ميليارد", "بدون", "تمام", "كل", "تر",
+            "براساس", "شدند", "ترين", "امروز", "باشند", "ندارد",
+            "چون", "قابل", "گويد", "ديگري", "همان", "خواهند",
+            "قبل", "آمده", "اكنون", "تحت", "طريق", "گيري", "جاي",
+            "هنوز", "چرا", "البته", "كنيد", "سازي", "سوم", "كنم",
+            "بلكه", "زير", "توانند", "ضمن", "فقط", "بودن", "حق",
+            "آيد", "وقتي", "اش", "يابد", "نخستين", "مقابل", "خدمات",
+            "امسال", "تاكنون", "مانند", "تازه", "آورد", "فكر",
+            "آنچه", "نخست", "نشده", "شايد", "چهار", "جريان",
+            "پنج", "ساخته", "زيرا", "نزديك", "برداري", "كسي",
+            "ريزي", "رفت", "گردد", "مثل", "آمد", "ام", "بهترين",
+            "دانست", "كمتر", "دادن", "تمامي", "جلوگيري",
+            "بيشتري", "ايم", "ناشي", "چيزي", "آنكه", "بالا",
+            "بنابراين", "ايشان", "بعضي", "دادند", "داشتند",
+            "برخوردار", "نخواهد", "هنگام", "نبايد", "غير", "نبود",
+            "ديده", "وگو", "داريم", "چگونه", "بندي", "خواست", "فوق", "ده",
+            "نوعي", "هستيم", "ديگران", "همچنان", "سراسر", "ندارند",
+            "گروهي", "سعي", "روزهاي", "آنجا", "يكديگر", "كردم",
+            "بيست", "بروز", "سپس", "رفته", "آورده", "نمايد",
+            "باشيم", "گويند", "زياد", "خويش", "همواره", "گذاشته",
+            "شش", "نداشته", "شناسي", "خواهيم", "آباد", "داشتن",
+            "نظير", "همچون", "باره", "نكرده", "شان", "سابق",
+            "هفت", "دانند", "جايي", "بی", "جز", "زیرِ", "رویِ",
+            "سریِ", "تویِ", "جلویِ", "پیشِ", "عقبِ", "بالایِ",
+            "خارجِ", "وسطِ", "بیرونِ", "سویِ", "کنارِ", "پاعینِ",
+            "نزدِ", "نزدیکِ","دنبالِ", "حدودِ", "برابرِ", "طبقِ",
+            "مانندِ", "ضدِّ", "هنگامِ", "برایِ", "مثلِ", "بارة",
+            "اثرِ", "تولِ", "علّتِ", "سمتِ", "عنوانِ", "قصدِ",
+            "روب", "جدا", "کی", "که", "چیست", "هست", "کجا", "کجاست",
+            "کَی", "چطور", "کدام", "آیا", "مگر", "چندین",
+            "یک", "چیزی", "دیگر", "کسی", "بعری", "هیچ", "چیز",
+            "جا", "کس", "هرگز", "یا", "تنها", "بلکه", "خیاه",
+            "بله", "بلی", "آره", "آری", "مرسی", "البتّه",
+            "لطفاً", "ّه", "انکه",
+            "وقتیکه", "همین", "پیش", "مدّتی", "هنگامی", "مان", "تان"
+            );
+        $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '',
+            mb_strtolower($page));
+        return $page;
+    }
+    /**
+     * Computes the stem of a Persian word. Words in self::$no_stem_list
+     * are returned as is; other words are lowercased before stemming.
+     * @param string $word the string to stem
+     * @return string the stem of $word
+     */
+    static function stem($word)
+    {
+        if (in_array($word, self::$no_stem_list, true)) {
+            return $word;
+        }
+        // strip a trailing kasra both before and after suffix removal
+        $word = mb_strtolower($word);
+        $word = self::removeKasra($word);
+        $word = self::removeSuffix($word);
+        $word = self::removeKasra($word);
+        return $word;
+    }
+    /**
+     * Strips a single trailing kasra (U+0650) diacritic from a word.
+     * Words shorter than five characters are returned untouched.
+     * @param string $word word to remove mark from
+     * @return string $word without its final kasra, if it had one
+     */
+    static function removeKasra($word)
+    {
+        if (mb_strlen($word) >= 5) {
+            // json_decode turns the \u escape into the UTF-8 kasra
+            $trailing_kasra = '/'.json_decode('"\u0650"').'$/u';
+            return preg_replace($trailing_kasra, "", $word);
+        }
+        return $word;
+    }
+    /**
+     * Removes at most one common Persian suffix. Longer suffixes are
+     * tried first, each group only on words above a length threshold.
+     * @param string $word to remove suffixes from
+     * @return string result of suffix removal
+     */
+    static function removeSuffix($word)
+    {
+        $length = mb_strlen($word);
+        if ($length > 7) {
+            $modified_word = preg_replace("/(?:
+                آباد | باره | بندی | بندي | ترین | ترين | ریزی |
+                ريزي | سازی | سازي | گیری | گيري | هایی | هايي
+                ) $/xu", "", $word);
+            if ($modified_word !== $word) {
+                return $modified_word;
+            }
+        }
+        if ($length > 6) {
+            $modified_word = preg_replace("/(?:
+                    اند | ایم | ايم | شان | های | هاي
+                ) $/xu", "", $word);
+            if ($modified_word !== $word) {
+                return $modified_word;
+            }
+        }
+        if ($length > 5) {
+            $modified_word = preg_replace("/ ان $/xu", "", $word);
+            if ($modified_word !== $word) {
+                return self::normalize($modified_word); // not $word
+            }
+            $modified_word = preg_replace("/(?:
+                    ات | اش | ام | تر | را | ون | ها | هء | ین | ين
+                ) $/xu", "", $word);
+            if ($modified_word !== $word) {
+                return $modified_word;
+            }
+        }
+        if ($length > 3) {
+            $modified_word = preg_replace("/(?: ت | ش | م | ه | ی | ي ) $/xu",
+                "", $word);
+            if ($modified_word !== $word) {
+                return $modified_word;
+            }
+        }
+        return $word;
+    }
+    /**
+     * Performs additional end word stripping on words of at least four
+     * characters: one final letter and then, if still long, a final yeh.
+     * @param string $word to remove suffixes from
+     * @return string result of suffix removal
+     */
+    static function normalize($word)
+    {
+        if (mb_strlen($word) < 4) {
+            return $word;
+        }
+        $modified_word = preg_replace("/(?: ت | ر | ش | گ | م | ى ) $/xu", "",
+            $word);
+        if ($modified_word !== $word) {
+            $word = $modified_word;
+            if (mb_strlen($word) < 4) {
+                return $word;
+            }
+            // plain assignment: `=~` parses as `= ~`, a bitwise NOT
+            $word = preg_replace("/(?: ی | ي ) $/xu", "", $word);
+        }
+        return $word;
+    }
+}

ViewGit