Add unit tests for persian stemmer, a=chris

Chris Pollett [2015-06-07 22:Jun:th]
Add unit tests for persian stemmer, a=chris
Filename
locale/fa/resources/tokenizer.php
tests/fa_tokenizer_test.php
tests/fr_tokenizer_test.php
tests/test_files/persian_stemmer/input_vocabulary.txt
tests/test_files/persian_stemmer/stemmed_result.txt
diff --git a/locale/fa/resources/tokenizer.php b/locale/fa/resources/tokenizer.php
index fdccec0cb..6edca2cf3 100755
--- a/locale/fa/resources/tokenizer.php
+++ b/locale/fa/resources/tokenizer.php
@@ -32,8 +32,9 @@ if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  * The stemmer is my stab at porting Nick Patch's Perl port,
  * https://metacpan.org/pod/Lingua::Stem::UniNE::FA, of the
  * stemming algorithm by Ljiljana Dolamic and Jacques
- * Savoy of the University of Neuchâtel
+ * Savoy of the University of Neuchâtel. The Java version of this is at
  * http://members.unine.ch/jacques.savoy/clef/persianStemmerUnicode.txt
+ * (beware of Java's handling of Unicode).
  * Here given a word, its stem is that part of the word that
  * is common to all its inflected variants. For example,
  * tall is common to tall, taller, tallest. A stemmer takes
diff --git a/tests/fa_tokenizer_test.php b/tests/fa_tokenizer_test.php
new file mode 100644
index 000000000..53c8e2a7d
--- /dev/null
+++ b/tests/fa_tokenizer_test.php
@@ -0,0 +1,98 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2015
+ * @filesource
+ */
+if (!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+/**
+ * Load the Persian Tokenizer via phrase_parser (5.4 hack)
+ */
+require_once BASE_DIR."/lib/phrase_parser.php";
+/**
+ * Load the run function
+ */
+require_once BASE_DIR.'lib/unit_test.php';
+/**
+ * Code used to test the Persian stemming algorithm. The inputs for the
+ * algorithm came from the sample text file for the Hamshahri Collection
+ * found at http://ece.ut.ac.ir/DBRG/Hamshahri/download.html
+ * The stemmed results come from the Java program that the PHP stemmer is
+ * based off of at
+ * http://members.unine.ch/jacques.savoy/clef/persianStemmerArabic.txt
+ *
+ * @author Chris Pollett
+ * @package seek_quarry\test
+ */
+class FaTokenizerTest extends UnitTest
+{
+    /**
+     * Before each test, set up a new Persian Tokenizer object
+     */
+    function setUp()
+    {
+        $this->test_objects['FILE1'] = PhraseParser::getTokenizer("fa");
+    }
+    /**
+     * Nothing is done for unit test tear down
+     */
+    function tearDown()
+    {
+    }
+    /**
+     * Tests whether the stem function for the Persian stemming algorithm
+     * stems words according to the rules of stemming. The function tests stem
+     * by calling stem with the words in $test_words and compares the results
+     * with the stem words in $stem_words
+     *
+     * $test_words is an array of Persian words, one per line, read from
+     * input_vocabulary.txt in the persian_stemmer test_files folder
+     * $stem_words is an array containing the stems for words in $test_words
+     */
+    function stemmerTestCase()
+    {
+        $stem_dir = BASE_DIR.'/tests/test_files/persian_stemmer';
+        //Test word set, one word per line of the input vocabulary file
+        $test_words = file("$stem_dir/input_vocabulary.txt");
+        //Expected stems, line-aligned with the test words, for comparing results
+        $stem_words = file("$stem_dir/stemmed_result.txt");
+        /**
+         * check that stem maps each word in $test_words to the same-index
+         * entry of $stem_words; no-stem-list and < 3 byte words are skipped
+         */
+        for ($i = 0; $i < count($test_words); $i++) {
+            $word = trim($test_words[$i]);
+            if (in_array($word, FaTokenizer::$no_stem_list) ||
+                strlen($word) < 3) { continue; }
+            $stem = trim($stem_words[$i]);
+            $word_stem = $this->test_objects['FILE1']->stem($word);
+            $this->assertEqual($word_stem,
+                    $stem,"function stem correctly stems
+                    $word to $stem");
+        }
+    }
+}
diff --git a/tests/fr_tokenizer_test.php b/tests/fr_tokenizer_test.php
index c1d9b3daf..b10de919d 100644
--- a/tests/fr_tokenizer_test.php
+++ b/tests/fr_tokenizer_test.php
@@ -63,7 +63,7 @@ class FrTokenizerTest extends UnitTest
     {
     }
     /**
-     * Tests whether the stem funtion for the French stemming algorithm
+     * Tests whether the stem function for the French stemming algorithm
      * stems words according to the rules of stemming. The function tests stem
      * by calling stem with the words in $test_words and compares the results
      * with the stem words in $stem_words
diff --git a/tests/test_files/persian_stemmer/input_vocabulary.txt b/tests/test_files/persian_stemmer/input_vocabulary.txt
new file mode 100644
index 000000000..33b647b58
--- /dev/null
+++ b/tests/test_files/persian_stemmer/input_vocabulary.txt
@@ -0,0 +1,176 @@
+زندگي
+مورچه
+ديده
+خسته
+نتيجه
+حادثه
+اتفاقي
+دستش
+براي
+ساعتي
+نقطه
+برمي
+دوباره
+كشيده
+مورچه
+غريزه
+عملي
+كرده
+خودش
+آينده
+مزاياي
+استفاده
+درازي
+پاياني
+ميليونها
+گذشته
+ميلياردها
+گذشت
+برنامه
+ميليونها
+حيات
+پاياني
+بخواهيم
+برسيم
+انتهايي
+نيست
+جايي
+رسيم
+نوشته
+تابلوهاي
+نقاشي
+طباطبايي
+جايگاه
+ويژه
+نمايشگاهي
+نگارخانه
+نمايش
+عمومي
+طباطبايي
+سالي
+نقاشي
+همواره
+كرده
+درباره
+سوژه
+روزهاي
+زندگي
+بودم
+زماني
+رفتم
+مورچه
+ديدم
+تصميم
+گرفتم
+بكشم
+نظرم
+طراحي
+ادامه
+دادم
+نقاشي
+تغييراتي
+اينكه
+حشره
+زندگي
+منظمي
+برايم
+نقاشي
+طباطبايي
+عليرغم
+موضوعات
+ساده
+تركيببندي
+بعدي
+آميزي
+استحكام
+نقوش
+پرده
+نظمي
+دروني
+آناتومي
+اندام
+ديگري
+مشخصه
+اصلي
+تقسيم
+بندي
+گانه
+آنكه
+نقاشيهاي
+طباطبايي
+سرهايشان
+بزرگترين
+اندام
+آميزي
+گرفته
+نقاش
+تغييرات
+اندام
+چيست
+باآنكه
+نقاش
+گفته
+نقاشيها
+طباطبايي
+ظاهري
+انسانها
+هيبتي
+حشره
+گونه
+تابلوهاي
+نقاش
+گوشه
+هايي
+زندگي
+اندام
+سياه
+زمينه
+خاكستري
+بادكنكهاي
+رنگي
+عروسي
+اندام
+چهره
+حالات
+صورت
+اندام
+كننده
+احساسات
+تابلوي
+نقاشي
+بلكه
+تمام
+مورچه
+انسانهاحسي
+سايه
+احساساتي
+عنكبوت
+مادري
+تفاهم
+دوستي
+نكته
+نقاشي
+صورت
+اسليمي
+ايراني
+حاشيه
+تابلوها
+رنگهايي
+زنده
+بصري
+مورچه
+انسانهاست
+هماهنگي
+دروني
+مفهوم
+مورچه
+انسانها
+كشيده
+رنگهاي
+ميوه
+انساني
+افزايش
+گروهي
+جاودانگي
+زندگي
+
diff --git a/tests/test_files/persian_stemmer/stemmed_result.txt b/tests/test_files/persian_stemmer/stemmed_result.txt
new file mode 100644
index 000000000..cf9702d5e
--- /dev/null
+++ b/tests/test_files/persian_stemmer/stemmed_result.txt
@@ -0,0 +1,175 @@
+زندگ
+مورچ
+ديد
+خست
+نتيج
+حادث
+اتفاق
+دست
+برا
+ساعت
+نقط
+برم
+دوبار
+كشيد
+مورچ
+غريز
+عمل
+كرد
+خود
+آيند
+مزايا
+استفاد
+دراز
+پايان
+ميليون
+گذشت
+ميليارد
+گذش
+برنام
+ميليون
+حيا
+پايان
+بخواهي
+برسي
+انتهاي
+نيس
+جاي
+رسي
+نوشت
+تابلو
+نقاش
+طباطباي
+جايگا
+ويژ
+نمايشگاه
+نگارخان
+نماي
+عموم
+طباطباي
+سال
+نقاش
+هموار
+كرد
+دربار
+سوژ
+روزها
+زندگ
+بود
+زمان
+رفت
+مورچ
+ديد
+تصمي
+گرفت
+بكش
+نظر
+طراح
+ادام
+داد
+نقاش
+تغييرات
+اينك
+حشر
+زندگ
+منظم
+براي
+نقاش
+طباطباي
+عليرغ
+موضوع
+ساد
+تركيب
+بعد
+آميز
+استحك
+نقو
+پرد
+نظم
+درون
+آناتوم
+اندا
+ديگر
+مشخص
+اصل
+تقسي
+بند
+گان
+آنك
+نقاشي
+طباطباي
+سرهاي
+بزرگ
+اندا
+آميز
+گرفت
+نقا
+تغيير
+اندا
+چيس
+باآنك
+نقا
+گفت
+نقاشي
+طباطباي
+ظاهر
+انسان
+هيبت
+حشر
+گون
+تابلو
+نقا
+گوش
+هاي
+زندگ
+اندا
+سيا
+زمين
+خاكستر
+بادكنك
+رنگ
+عروس
+اندا
+چهر
+حالا
+صور
+اندا
+كنند
+احساس
+تابلو
+نقاش
+بلك
+تما
+مورچ
+انسانهاحس
+ساي
+احساسات
+عنكبو
+مادر
+تفاه
+دوست
+نكت
+نقاش
+صور
+اسليم
+ايران
+حاشي
+تابلو
+رنگهاي
+زند
+بصر
+مورچ
+انسانهاس
+هماهنگ
+درون
+مفهو
+مورچ
+انسان
+كشيد
+رنگها
+ميو
+انسان
+افزاي
+گروه
+جاودانگ
+زندگ
ViewGit