Add all missing comments/documentation for Yioop 7.0, Refactor StochasticSegmentor, POS, and Named Entity code, Fix bug in Email Message urls, Fix paging links in ManageUsers, a=chris

Chris Pollett [2020-07-10 23:Jul:th]
Add all missing comments/documentation for Yioop 7.0, Refactor StochasticSegmentor, POS, and Named Entity code, Fix bug in Email Message urls, Fix paging links in ManageUsers, a=chris
Filename
src/configs/CreditConfig.php
src/configs/TokenTool.php
src/controllers/AdminController.php
src/controllers/Controller.php
src/controllers/CrawlController.php
src/controllers/FetchController.php
src/controllers/GroupController.php
src/controllers/RegisterController.php
src/controllers/ResourceController.php
src/controllers/SearchController.php
src/controllers/components/AccountaccessComponent.php
src/controllers/components/Component.php
src/controllers/components/CrawlComponent.php
src/controllers/components/SocialComponent.php
src/controllers/components/StoreComponent.php
src/examples/StockBot.php
src/examples/WeatherBot.php
src/executables/Fetcher.php
src/executables/MediaUpdater.php
src/executables/QueueServer.php
src/index.php
src/library/BTree.php
src/library/BloomFilterBundle.php
src/library/Bzip2BlockIterator.php
src/library/ComputerVision.php
src/library/ContextTagger.php
src/library/ContextWeightedNamedEntityRecognizer.php
src/library/ContextWeightedPosTagger.php
src/library/CrawlDaemon.php
src/library/DoubleIndexBundle.php
src/library/FeedArchiveBundle.php
src/library/FetchUrl.php
src/library/IndexArchiveBundle.php
src/library/IndexDictionary.php
src/library/IndexManager.php
src/library/IndexShard.php
src/library/LocaleFunctions.php
src/library/MailServer.php
src/library/NamedEntityContextTagger.php
src/library/PageRuleParser.php
src/library/PartOfSpeechContextTagger.php
src/library/PhraseParser.php
src/library/StochasticTermSegmenter.php
src/library/SuffixTree.php
src/library/UpgradeFunctions.php
src/library/Utility.php
src/library/WebArchive.php
src/library/WebArchiveBundle.php
src/library/WebQueueBundle.php
src/library/WebSite.php
src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php
src/library/archive_bundle_iterators/TextArchiveBundleIterator.php
src/library/classifiers/Classifier.php
src/library/index_bundle_iterators/DocIterator.php
src/library/index_bundle_iterators/GroupIterator.php
src/library/index_bundle_iterators/IndexBundleIterator.php
src/library/index_bundle_iterators/IntersectIterator.php
src/library/index_bundle_iterators/NetworkIterator.php
src/library/indexing_plugins/IndexingPlugin.php
src/library/indexing_plugins/WordfilterPlugin.php
src/library/media_jobs/AnalyticsJob.php
src/library/media_jobs/FeedsUpdateJob.php
src/library/processors/ImageProcessor.php
src/library/processors/PageProcessor.php
src/library/processors/TextProcessor.php
src/library/summarizers/Summarizer.php
src/locale/en_US/resources/Tokenizer.php
src/locale/hi/resources/Tokenizer.php
src/locale/it/resources/Tokenizer.php
src/locale/zh_CN/resources/Tokenizer.php
src/locale/zh_CN/resources/ner_weight.txt.gz
src/locale/zh_CN/resources/pos_weight.txt.gz
src/locale/zh_CN/resources/term_weight.txt.gz
src/models/GroupModel.php
src/models/LocaleModel.php
src/models/Model.php
src/models/PhraseModel.php
src/models/SearchverticalsModel.php
src/models/SourceModel.php
src/models/TrendingModel.php
src/views/AdminView.php
src/views/ComponentView.php
src/views/CrawlstatusView.php
src/views/elements/AdminElement.php
src/views/elements/AdminbarElement.php
src/views/elements/BotstoryElement.php
src/views/elements/GroupfeedElement.php
src/views/elements/ManageclassifiersElement.php
src/views/elements/ManagecreditsElement.php
src/views/elements/ManagerolesElement.php
src/views/elements/ManageusersElement.php
src/views/elements/MediajobsElement.php
src/views/elements/MixcrawlsElement.php
src/views/elements/PaginationElement.php
src/views/elements/SearchElement.php
src/views/elements/SearchsourcesElement.php
src/views/elements/SideadvertisementElement.php
src/views/elements/TopadvertisementElement.php
src/views/helpers/FeedsHelper.php
tests/PdfProcessorTest.php
diff --git a/src/configs/CreditConfig.php b/src/configs/CreditConfig.php
index ab7a2f623..d5ac15ec1 100644
--- a/src/configs/CreditConfig.php
+++ b/src/configs/CreditConfig.php
@@ -59,11 +59,11 @@ class CreditConfig
      * @param float $amount dollar amount to charge the card
      * @param string $token token issued for transaction from the card
      *      processing agency
-     * @param string& $message message to use as for reason for charge
+     * @param string &$message message to use as for reason for charge
      * @return bool whether or not the charge was successful
      */
     public static function charge($amount, $token, &$message)
     {
         return true;
     }
-}
\ No newline at end of file
+}
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index a9266b22b..2bf749e7d 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -27,7 +27,6 @@
  *
  * A description of its usage is given in the $usage global variable
  *
- *
  * @author Ravi Dhillon  ravi.dhillon@yahoo.com, Chris Pollett (modified for n
  *     ngrams)
  * @license https://www.gnu.org/licenses/ GPL3
@@ -148,8 +147,8 @@ where from Wikipedia source to extract:
 A knowledge wiki entry is a search wiki page which is displayed on a given
 query usually in a callout box. TokenTool.php can be used to create such
 entries based on the first paragraph of a Wikipedia page which matches the
-query. At the same time TokenTool.php is doing htis it can also use
-thhe infoboxes on wiki pages to generate a initial list of potential seed
+query. At the same time TokenTool.php is doing this it can also use
+the infoboxes on wiki pages to generate a initial list of potential seed
 sites for a web crawl. The syntax to create knowledge wiki seed sites is:

 php TokenTool.php kwiki-seeds locale page_count_file wiki_locale_dump \
@@ -270,6 +269,27 @@ switch ($argv[1]) {
         makeKwikiEntriesGetSeedSites($argv[2], $argv[3], $argv[4],
             $argv[5], $argv[6]);
         break;
+    case "pos-tagger":
+        $file_path = PREP_DIR . "/";
+        if (!isset($argv[3])) {
+           echo $usage;
+        }
+        $texts = []; // training files matched under PREP_DIR
+        for($i = 4; $i < count($argv); $i++) { // patterns from $argv[4] on
+            $files = glob($file_path . $argv[$i]);
+            if (count($files) == 0) {
+                echo "error: {$file_path}{$argv[$i]}: File not found\n";
+                exit();
+            }
+            $texts = array_merge($texts, $files);
+        }
+        $pos_tagger = new ContextWeightedPosTagger($argv[2]);
+        if ($pos_tagger->train($texts, "/")) {
+            echo "Success\n";
+        } else {
+            echo "Failed\n";
+        }
+        break;
     case "segment-filter":
         $file_path = PREP_DIR . "/";
         if (!file_exists($file_path . $argv[3])) {
@@ -283,14 +303,14 @@ switch ($argv[1]) {
         if (!isset($argv[3])) {
            echo $usage;
         }
-        $texts=[];
+        $texts = [];
         for($i = 4; $i < count($argv); $i++) {
             $files = glob($file_path . $argv[$i]);
             if (count($files) == 0) {
-                echo "error: {$file_path}{$argv[i]}: File not found\n";
+                echo "error: {$file_path}{$argv[$i]}: File not found\n";
                 exit();
             }
-            $texts = array_merge ($texts, $files);
+            $texts = array_merge($texts, $files);
         }
         $segmenter = new StochasticTermSegmenter($argv[2]);
         if ($segmenter->train($texts, $argv[3])) {
@@ -315,7 +335,22 @@ if (!PROFILE) {
     exit();
 }
 /**
+ * Generates knowledge wiki callouts for search results pages based
+ * on the first paragraph of a Wikipedia Page that matches a given query.
+ * Also generates an initial list of potential seed sites for a crawl
+ * based off urls scraped from the wiki pages.
  *
+ * @param string $locale_tag the IANA language tag of the locale to
+ *   create knowledge wiki entries and seed sites for
+ * @param string $page_count_file the file name of a wiki page count dump
+ *  file (or folder of such files). Such a file contains the names of wiki
+ *  pages and how many times they were accessed
+ * @param string $wiki_dump_file  a dump of wikipedia pages and meta pages
+ * @param int $max_entries maximum number of kwiki entries to create.
+ *  Will pick the one with the highest counts in $page_count_file
+ * @param int $max_seed_sites maximum number of seed sites to add
+ *  to Yioop's set of seed sites. Again chooses those with highest
+ *  page count score
  */
 function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file,
     $wiki_dump_file, $max_entries, $max_seed_sites)
@@ -423,10 +458,10 @@ function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file,
                 continue;
             }
             $text = str_replace($infobox_offset[0], "\n", $text);
-            $text = removeTags($text, "{", "}", "");
-            $text = removeTags($text, "&lt;!--", "--&gt;", "");
+            $text = removeTags($text, "{", "}");
+            $text = removeTags($text, "&lt;!--", "--&gt;");
             $text = preg_replace('/\&lt\;ref[^\>]+\/\&gt;/u', " ", $text);
-            $text = removeTags($text, "&lt;ref", "/ref&gt;", "");
+            $text = removeTags($text, "&lt;ref", "/ref&gt;");
             $text = preg_replace('/\[\[[^\[\]]+\|([^\[\|]+)\]\]/u', "$1",
                 $text);
             $text = preg_replace('/\[\[(File|Image)\:(.+)\]\]/u', "", $text);
@@ -512,7 +547,12 @@ function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file,
     $crawl_model->setSeedInfo($seed_info);
 }
 /**
- *
+ * Gets the next wiki page from a file handle pointing to the wiki dump file
+ * @param resource $fr file handle (might be a compressed file handle,
+ *  for example, corresponding to gzopen or bzopen)
+ * @param callable $read a function for reading from the given file handle
+ * @param int $block_size size of blocks to use when reading
+ * @param string &$input_buffer used to buffer data from the wiki dump file
  */
 function getNextPage($fr, $read, $block_size, &$input_buffer)
 {
@@ -532,9 +572,13 @@ function getNextPage($fr, $read, $block_size, &$input_buffer)
     return $page;
 }
 /**
- *
+ * Removes all occurrences of open/close tag pairs from $text
+ * @param string $text to remove tag pair from
+ * @param string $open string pattern for open tag
+ * @param string $close string pattern for close tag
+ * @return string text after tag removed
  */
-function removeTags($text, $open, $close, $tag)
+function removeTags($text, $open, $close)
 {
     $old_text = "";
     while ($text != $old_text) {
@@ -550,7 +594,20 @@ function removeTags($text, $open, $close, $tag)
     return $text;
 }
 /**
+ * Get a substring offset pair matching the input open close brace tag pattern
  *
+ * @param string $page source text to search for the tag in
+ *   For example, lala {{infobox {{blah yoyoy}} }} dada.
+ * @param string $brace_open character sequence starting the tag region. For
+ *  example {{
+ * @param string $brace_close character sequence ending the tag region. For
+ *  example }}
+ * @param string $tag tag that might be associated with the opening of the
+ *  sequence. For example infobox.
+ * @param int $offset offset to start searching from
+ * @return array ordered pair [substring containing the brace tag, offset after
+ *  the tag]. If had  "lala {{infobox {{blah yoyoy}} }} dada" as input and
+ *  searched on {{, }}, infobox, 0 would get ["{{infobox {{blah yoyoy}}", 31]
  */
 function getBraceTag($page, $brace_open, $brace_close, $tag, $offset = 0)
 {
@@ -586,7 +643,12 @@ function getBraceTag($page, $brace_open, $brace_close, $tag, $offset = 0)
     return [$outer_contents, $end_pos];
 }
 /**
- *
+ * Get the outer contents of an xml open/close tag pair from
+ * a text source together with a new offset location after
+ * @param string $page text source to search the tag pair in
+ * @param string $tag the xml tag to look for
+ * @param int $offset offset to start searching after for the open/close pair
+ * @return array ordered pair [outer contents, new offset]
  */
 function getTagOffsetPage($page, $tag, $offset = 0)
 {
@@ -604,7 +666,16 @@ function getTagOffsetPage($page, $tag, $offset = 0)
     return [$outer_contents, $end_pos];
 }
 /**
+ * Returns title and page counts of the top $max_pages many entries
+ * in a $page_count_file for a locale $locale_tag
  *
+ * @param string $page_count_file page count file to use to search for title
+ *  counts with respect to a locale
+ * @param string $locale_tag locale to get top pages for
+ * @param int $max_pages number of pages
+ * @param array $title_counts title counts that might have come from analyzing
+ *  a previous file. These will be in the output and contribute to $max_pages
+ * @return array $title_counts wiki page titles => num_views associative array
  */
 function getTopPages($page_count_file, $locale_tag, $max_pages,
     $title_counts = [])
diff --git a/src/controllers/AdminController.php b/src/controllers/AdminController.php
index 45c348f8c..003dcfb64 100755
--- a/src/controllers/AdminController.php
+++ b/src/controllers/AdminController.php
@@ -351,8 +351,14 @@ class AdminController extends Controller implements CrawlConstants
         return $data;
     }
     /**
-     * @param array $user_activities
-     * @return array
+     * Given the list of activities a user may access, returns an array
+     * mapping the translated name of each component to the list of
+     * user-accessible activities belonging to that component
+     *
+     * @param array $user_activities a list of activities that a
+     *  user is allowed to access
+     * @return array of translated name of component => [list of user accessible
+     *  activities]
      */
     public static function computeComponentActivities($user_activities)
     {
@@ -488,8 +494,8 @@ class AdminController extends Controller implements CrawlConstants
     /**
      * Used to update the yioop installation profile based on $_REQUEST data
      *
-     * @param array& $data field data to be sent to the view
-     * @param array& $profile used to contain the current and updated profile
+     * @param array &$data field data to be sent to the view
+     * @param array &$profile used to contain the current and updated profile
      *     field values
      * @param array $check_box_fields fields whose data comes from a html
      *     checkbox
@@ -551,7 +557,7 @@ class AdminController extends Controller implements CrawlConstants
      * are used by manageUsers, manageRoles, manageGroups, to do advanced
      * search of the entity they are responsible for.
      *
-     * @param array& $data modified to contain the field data needed for
+     * @param array &$data modified to contain the field data needed for
      *     the view to draw the search form
      * @param string activity in which this search is being conducted
      * @param array $comparison_fields those fields of the entity
diff --git a/src/controllers/Controller.php b/src/controllers/Controller.php
index 57a0bde0b..826d0b7bb 100755
--- a/src/controllers/Controller.php
+++ b/src/controllers/Controller.php
@@ -84,11 +84,12 @@ abstract class Controller
      */
     public $web_site;
     /**
-     *
+     * Array of instances of components used by this controller
+     * @var array
      */
     public $component_instances;
     /**
-     * Array of instances of views  used by this controller
+     * Array of instances of views used by this controller
      * @var array
      */
     public $view_instances = [];
@@ -399,7 +400,7 @@ abstract class Controller
      * data sources, rather than directly make a call to the model to get the
      * data it might be passed directly to this method.
      *
-     * @param array& $data used to send data to the view will be updated by
+     * @param array &$data used to send data to the view will be updated by
      *     this method with row and paging data
      * @param mixed $field_or_model if an object, this is assumed to be a model
      *     and so the getRows method of this model is called to get row data,
@@ -905,7 +906,7 @@ abstract class Controller
      * @param string $line_type does additional cleaning depending on the type
      *     of the lines. For instance, if is "url" then a line not beginning
      *     with a url scheme will have http:// prepended.
-     * @return $lines an array of clean lines
+     * @return array $lines an array of clean lines
      */
     public function convertStringCleanArray($str, $line_type="url")
     {
@@ -1018,7 +1019,7 @@ abstract class Controller
      * controller this function can be used to initialize the field variables
      * used to write the appropriate Javascripts
      *
-     * @param array& $data data to be used in drawing the view
+     * @param array &$data data to be used in drawing the view
      * @param bool $ads_off whether or not ads are turned off so that this
      *      method should do nothing
      */
diff --git a/src/controllers/CrawlController.php b/src/controllers/CrawlController.php
index 83bd41888..006e655e2 100644
--- a/src/controllers/CrawlController.php
+++ b/src/controllers/CrawlController.php
@@ -37,8 +37,8 @@ use seekquarry\yioop\library\MediaConstants;
 use seekquarry\yioop\library\UrlParser;

 /**
- * Controller used to manage networked installations of Yioop where
- * there might be mutliple queue_servers and a name_server. Command
+ * Controller used to manage networked installations of Yioop
+ * where there might be multiple QueueServers and a NameServer. Command
  * sent to the nameserver web page are mapped out to queue_servers
  * using this controller. Each method of the controller essentially
  * mimics one method of CrawlModel, PhraseModel, or in general anything
@@ -325,7 +325,9 @@ class CrawlController extends Controller implements CrawlConstants
             null, $num_fetchers);
     }
     /**
-     *
+     * Wrapper call to the source model method that deletes the news feed
+     * and trending data stored in this Yioop instance
+     * @see SourceModel::clearFeedData
      */
     public function clearFeedData()
     {
diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php
index 4dc928310..ea9cc910f 100755
--- a/src/controllers/FetchController.php
+++ b/src/controllers/FetchController.php
@@ -547,7 +547,7 @@ class FetchController extends Controller implements CrawlConstants
      * $time to a subfolder $day of a folder $dir
      *
      * @param string $schedule_name the name of the kind of schedule being saved
-     * @param string& $data_string encoded, compressed, serialized data the
+     * @param string &$data_string encoded, compressed, serialized data the
      *     schedule is to contain
      */
     public function addScheduleToScheduleDirectory($schedule_name,
diff --git a/src/controllers/GroupController.php b/src/controllers/GroupController.php
index 2a1036e55..fdd5fe288 100644
--- a/src/controllers/GroupController.php
+++ b/src/controllers/GroupController.php
@@ -159,9 +159,9 @@ class GroupController extends Controller implements CrawlConstants
      *
      * @param string $format can be one of rss, json, or serialize,
      *      if different, default HTML GroupView used.
-     * @param string& $view variable used to set the view in calling
+     * @param string &$view variable used to set the view in calling
      *     method
-     * @param array& $data used to send data to the view for drawing
+     * @param array &$data used to send data to the view for drawing
      */
     public function setupViewFormatOutput($format, &$view, &$data)
     {
diff --git a/src/controllers/RegisterController.php b/src/controllers/RegisterController.php
index 94ba65783..c1e5abab5 100755
--- a/src/controllers/RegisterController.php
+++ b/src/controllers/RegisterController.php
@@ -458,9 +458,9 @@ class RegisterController extends Controller implements CrawlConstants
             $user['FIRST_NAME'], $user['LAST_NAME'])."\n";
         $message .= tl('register_controller_recover_body')."\n";
         $time = time();
-        $message .= C\BASE_URL.
+        $message .= C\BASE_URL .
             "?c=register&a=recoverComplete&user=" .
-            $user['USER_NAME'].
+            $user['USER_NAME'] .
             "&hash=".urlencode(L\crawlCrypt(
                 $user['HASH'] . $time . $user['USER_NAME'].C\AUTH_KEY)) .
             "&time=" . $time ;
@@ -826,7 +826,7 @@ class RegisterController extends Controller implements CrawlConstants
      * Sets up the graphical captcha view
      * Draws the string for graphical captcha
      *
-     * @param array& $data used by view to draw any dynamic content
+     * @param array &$data used by view to draw any dynamic content
      *     in this case we append a field "CAPTCHA_IMAGE" with a data
      *     url of the captcha to draw.
      */
@@ -837,8 +837,7 @@ class RegisterController extends Controller implements CrawlConstants
         }
         unset($_SESSION["captcha_text"]);
         // defines captcha text
-        $characters_for_captcha = '123456789'.
-            'abcdefghijklmnpqrstuvwxyz'.
+        $characters_for_captcha = '123456789abcdefghijklmnpqrstuvwxyz'.
             'ABCDEFGHIJKLMNPQRSTUVWXYZ';
         $len = strlen($characters_for_captcha);
         // selecting letters for captcha
@@ -966,12 +965,12 @@ class RegisterController extends Controller implements CrawlConstants
      * $activity_success. If $activity was not initially equal to
      * $activity_success then this method does nothing.
      *
-     * @param string& $activity current tentative activity
+     * @param string &$activity current tentative activity
      * @param string $activity_success activity to test for and to test prereqs
      *     for.
      * @param string $activity_fail if prereqs not met which acitivity to switch
      *     to
-     * @param array& $data data to help render the view this controller draws
+     * @param array &$data data to help render the view this controller draws
      */
     public function preactivityPrerequisiteCheck(&$activity,
         $activity_success, $activity_fail, &$data)
@@ -1018,7 +1017,7 @@ class RegisterController extends Controller implements CrawlConstants
      * missing fields on a create account or recover account form.
      * also adds error info if try to create an existing using.
      *
-     * @param array& $data contains info for the view on which the above
+     * @param array &$data contains info for the view on which the above
      *     forms are to be drawn.
      */
     public function dataIntegrityCheck(&$data)
@@ -1075,7 +1074,7 @@ class RegisterController extends Controller implements CrawlConstants
      * in blank values for missing fields into a "MISSING"
      * array
      *
-     * @param array& $data an array of data to be sent to the view
+     * @param array &$data an array of data to be sent to the view
      *     After this method is done it will have cleaned versions
      *     of the $_REQUEST variables from create or recover account
      *     forms as well as a "MISSING" field which is an array of
diff --git a/src/controllers/ResourceController.php b/src/controllers/ResourceController.php
index b79966e77..604d4319e 100644
--- a/src/controllers/ResourceController.php
+++ b/src/controllers/ResourceController.php
@@ -180,9 +180,13 @@ class ResourceController extends Controller implements CrawlConstants
         }
     }
     /**
-     *
-     * @param bool $is_src_folder
-     * @param array
+     * Returns the file system folder where resources are stored
+     * making use of the n field for the name of the resource, its type,
+     * the sf field describing the desired subfolder
+     * and whether this is a request for a thumbnail or an object
+     * @param bool $is_src_folder should we look in the base directory
+     *  (src folder) or work_directory to try to find the resource
+     * @return array ordered pair [path beneath base folder to file, basefolder]
      */
     public function getNameAndBaseFolder($is_src_folder = false)
     {
diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 364ade925..97b3f23cb 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -382,7 +382,7 @@ class SearchController extends Controller implements CrawlConstants
      * against to prevent CSRF attacks, just after someone logged out, or
      * a bot session (googlebot, etc) so remove the query request
      *
-     * @param array& $data that will eventually be sent to the view. We might
+     * @param array &$data that will eventually be sent to the view. We might
      *     update with error messages
      * @return array consisting of (query based on user info, whether
      *     if a cache request highlighting should be userd, what activity
@@ -732,7 +732,7 @@ class SearchController extends Controller implements CrawlConstants
      * @param int $raw should validate against list of known crawls or an
      *     internal (say network) query that doesn't require validation
      *     (faster without).
-     * @param array& $data that will eventually be sent to the view. We set
+     * @param array &$data that will eventually be sent to the view. We set
      *     the 'its' (index_time_stamp) field here
      * @return array consisting of index timestamp of crawl or mix in use,
      *     $index_info an array of info about that index, and $save_timestamp
@@ -861,7 +861,7 @@ class SearchController extends Controller implements CrawlConstants
      * @param string $view name of view class search results are for
      * @param array $subsearches an array of data about each subsearch to draw
      *     to the view
-     * @param array& $data that will eventually be sent to the view for
+     * @param array &$data that will eventually be sent to the view for
      *     rendering. This method adds fields to the array
      */
     public function addSearchViewData($index_info, $no_query, $raw, $view,
@@ -1084,7 +1084,7 @@ EOD;
      * Searches the database for the most relevant pages for the supplied search
      * terms. Renders the results to the HTML page.
      *
-     * @param array& $data an array of view data that will be updated to include
+     * @param array &$data an array of view data that will be updated to include
      *     at most results_per_page many search results
      * @param string $query a string containing the words to search on
      * @param string $activity besides a straight search for words query,
@@ -2287,7 +2287,7 @@ EOD;
      *     indexes
      * @param array $queue_servers is an array containing URLs for queue
      *     servers
-     * @return [$all_crawl_times, $all_crawl_items] is an array containing
+     * @return array [$all_crawl_times, $all_crawl_items] is an array containing
      *     an array of crawl times and an array of their respective crawl items
      */
     public function getCrawlItems($url, $crawl_times, $queue_servers)
@@ -2577,7 +2577,7 @@ EOD;
      * libraries used to display cache pages
      *
      * @param DOMDocument $dom used to create new nodes
-     * @param DomElement& $node what to add script node to
+     * @param DomElement &$node what to add script node to
      */
     public function addCacheJavascriptTags($dom, &$node)
     {
diff --git a/src/controllers/components/AccountaccessComponent.php b/src/controllers/components/AccountaccessComponent.php
index 46f4e3a08..082c5c755 100644
--- a/src/controllers/components/AccountaccessComponent.php
+++ b/src/controllers/components/AccountaccessComponent.php
@@ -775,7 +775,7 @@ class AccountaccessComponent extends Component
      * and $_REQUEST['role_sorts']. Information about these roles is added as
      * fields to $data[NUM_USER_ROLES'] and $data['USER_ROLES']
      *
-     * @param array& $data data for the manageUsers view.
+     * @param array &$data data for the manageUsers view.
      * @param int $user_id user to look up roles for
      */
     public function getUserRolesData(&$data, $user_id)
@@ -830,7 +830,7 @@ class AccountaccessComponent extends Component
      * about these roles is added as
      * fields to $data[NUM_USER_GROUPS'] and $data['USER_GROUPS']
      *
-     * @param array& $data data for the manageUsers view.
+     * @param array &$data data for the manageUsers view.
      * @param int $user_id user to look up roles for
      */
     public function getUserGroupsData(&$data, $user_id)
diff --git a/src/controllers/components/Component.php b/src/controllers/components/Component.php
index 15ea4bc57..a692a73d1 100644
--- a/src/controllers/components/Component.php
+++ b/src/controllers/components/Component.php
@@ -87,7 +87,7 @@ class Component
      * and to send any localizations needed from PHP to Javascript-land
      * It is used by both Crawl and SocialComponent
      *
-     * @param array& $data an asscoiative array of data to be used by the
+     * @param array &$data an associative array of data to be used by the
      *     view and layout that the wiki editor will be drawn on
      *     This method tacks on to INCLUDE_SCRIPTS to make the layout load
      *     wiki.js.
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 09491daec..a1d32ae78 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -618,7 +618,7 @@ class CrawlComponent extends Component implements CrawlConstants
      * Called from @see manageCrawls to start a new crawl on the machines
      * $machine_urls. Updates $data array with crawl start message
      *
-     * @param array& $data an array of info to supply to AdminView
+     * @param array &$data an array of info to supply to AdminView
      * @param array $request_fields if start crawl fails this is a list of
      *      request fields to preserve in the redirect message
      */
@@ -690,7 +690,7 @@ class CrawlComponent extends Component implements CrawlConstants
      * Reads the parameters for a crawl from an array gotten from a crawl.ini
      * file
      *
-     * @param array& $crawl_params parameters to write to queue_server
+     * @param array &$crawl_params parameters to write to queue_server
      * @param array $seed_info data from crawl.ini file
      */
     public function getCrawlParametersFromSeedInfo(&$crawl_params, $seed_info)
@@ -773,7 +773,7 @@ class CrawlComponent extends Component implements CrawlConstants
      * crawl (or current crawl) to be carried out by the machines
      * $machine_urls. Updates $data array to be supplied to AdminView
      *
-     * @param array& $data an array of info to supply to AdminView
+     * @param array &$data an array of info to supply to AdminView
      * @param array $machine_urls string urls of machines managed by this
      * Yioop name server on which to perform the crawl
      */
@@ -1148,7 +1148,7 @@ class CrawlComponent extends Component implements CrawlConstants
      * Called from @see manageCrawls to read in the file with statistics
      * information about a crawl. This file is computed by @see AnalyticsJob
      *
-     * @param array& $data an array of info to supply to AdminView
+     * @param array &$data an array of info to supply to AdminView
      * @param array $machine_urls machines that are being used in crawl
      * Yioop name server on which to perform the crawl
      */
diff --git a/src/controllers/components/SocialComponent.php b/src/controllers/components/SocialComponent.php
index d414b747f..8c4601a2a 100644
--- a/src/controllers/components/SocialComponent.php
+++ b/src/controllers/components/SocialComponent.php
@@ -1062,7 +1062,7 @@ class SocialComponent extends Component implements CrawlConstants
      * membership in a group if the group is By Request or Public
      * Request
      *
-     * @param array& $data field variables to be drawn to view,
+     * @param array &$data field variables to be drawn to view,
      *      we modify the SCRIPT component of this with a message
      *      regarding success of not of add attempt.
      * @param int $add_id group id to be added
@@ -1138,7 +1138,7 @@ class SocialComponent extends Component implements CrawlConstants
      * $_REQUEST['user_filter']. Information about these roles is added as
      * fields to $data[NUM_USERS_GROUP'] and $data['GROUP_USERS']
      *
-     * @param array& $data data for the manageGroups view.
+     * @param array &$data data for the manageGroups view.
      * @param int $group_id group to look up users for
      */
     public function getGroupUsersData(&$data, $group_id)
@@ -1187,9 +1187,9 @@ class SocialComponent extends Component implements CrawlConstants
      * if the current group is to be modfied, and if so, to call model to
      * handle the update
      *
-     * @param array& $data used to add any information messages for the view
+     * @param array &$data used to add any information messages for the view
      *     about changes or non-changes to the model
-     * @param array& $group current group which might be altered
+     * @param array &$group current group which might be altered
      * @param array $update_fields which fields in the current group might be
      *     changed. Elements of this array are triples, the name of the
      *     group field, name of the request field to use for data, and an
@@ -1376,7 +1376,7 @@ class SocialComponent extends Component implements CrawlConstants
                     if (in_array($group['REGISTER_TYPE'],
                         [C\PUBLIC_BROWSE_REQUEST_JOIN, C\PUBLIC_JOIN])) {
                         $post_url = B\feedsUrl("thread", $parent_item["ID"],
-                            true, "group") . "preserve=true\n";
+                            true, "group", false) . "preserve=true\n";
                     }
                     $subject = tl('social_component_thread_notification',
                         $parent_item['TITLE']);
@@ -1539,7 +1539,7 @@ class SocialComponent extends Component implements CrawlConstants
                     $subject = tl('social_component_new_thread_mail',
                         $group['GROUP_NAME']);
                     $post_url = B\feedsUrl("thread", $thread_id, true,
-                        "group")."preserve=true\n";
+                        "group", false)."preserve=true\n";
                     $body = tl('social_component_new_thread_body',
                         $group['GROUP_NAME'])."\n".
                         "\"".$title."\"\n".
@@ -2301,7 +2301,7 @@ class SocialComponent extends Component implements CrawlConstants
     /**
      * Handles requests to reading, editing, viewing history, reverting, etc
      * wiki pages
-     * @return $data an associative array of form variables used to draw
+     * @return array $data an associative array of form variables used to draw
      *     the appropriate wiki page
      */
     public function wiki()
@@ -3047,7 +3047,7 @@ EOD;
      * page and to update the recent page impressions so that this can be
      * calculated
      *
-     * @param array& $data $data data to be sent to the view, will be modified
+     * @param array &$data data to be sent to the view, will be modified
      *  according to impression info.
      * @param int $user_id id of the user requesting to change the given wiki
      *  page
@@ -3115,7 +3115,7 @@ EOD;
      * of reading a media list or to help find resources in the case of a
      * user using edit mode
      *
-     * @param array& $data data to be sent to the view. The
+     * @param array &$data data to be sent to the view. The
      *  $data["RESOURCES_INFO"]['resources'] array of resources will be
      *  sorted according to the wiki page's settings as given in
      *  $data["HEAD"]['sort']
@@ -3153,7 +3153,7 @@ EOD;
      * refactoring still needs some work. Hence, the awkward parameter list
      * below.
      *
-     * @param array& $data $data data to be sent to the view, will be modified
+     * @param array &$data data to be sent to the view, will be modified
      *  according to the edit action.
      * @param int $user_id id of the user requesting to change the given wiki
      *  page
@@ -3578,7 +3578,7 @@ EOD;
      * needed to display a single media item on a media list. The name of
      * the media item to be display is expected to come from $_REQUEST['n'].
      *
-     * @param array& $data array of field variables for view will be modified
+     * @param array &$data array of field variables for view will be modified
      *  by this function
      * @param int $group_id id of group wiki page belongs to
      * @param int $page_id id of wiki page
@@ -4009,7 +4009,7 @@ EOD;
     /**
      * Used to create Javascript used to toggle a wiki page's settings control
      *
-     * @param array& $data will contain in SCRIPT field neccessary Javascript
+     * @param array &$data will contain in SCRIPT field necessary Javascript
      *  to pass to view.
      */
     private function initializeWikiPageToggle(&$data)
diff --git a/src/controllers/components/StoreComponent.php b/src/controllers/components/StoreComponent.php
index 0b54be2fa..02bb162c6 100644
--- a/src/controllers/components/StoreComponent.php
+++ b/src/controllers/components/StoreComponent.php
@@ -534,7 +534,7 @@ class StoreComponent extends Component
     /**
      * Trim white spaces callback for array_walk
      *
-     * @param string& $value string to remove initial and trailing whitespace
+     * @param string &$value string to remove initial and trailing whitespace
      *      from
      */
     public function trim_value(&$value)
diff --git a/src/examples/StockBot.php b/src/examples/StockBot.php
index 8a32aafcb..49adc6f6a 100644
--- a/src/examples/StockBot.php
+++ b/src/examples/StockBot.php
@@ -24,8 +24,6 @@
  *
  * @author Harika Nukala harikanukala9@gmail.co
  *      (updated after yahoo stock quotes went dark, by Chris Pollett)
- * @package seek_quarry
- * @subpackage examples
  * @license https://www.gnu.org/licenses/ GPL3
  * @link https://www.seekquarry.com/
  * @copyright 2009 - 2020
diff --git a/src/examples/WeatherBot.php b/src/examples/WeatherBot.php
index b930ea989..c58ad966b 100644
--- a/src/examples/WeatherBot.php
+++ b/src/examples/WeatherBot.php
@@ -23,8 +23,6 @@
  *  END LICENSE
  *
  * @author Harika Nukala harika.nukala@sjsu.edu
- * @package seek_quarry
- * @subpackage examples
  * @license https://www.gnu.org/licenses/ GPL3
  * @link https://www.seekquarry.com/
  * @copyright 2009 - 2020
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 22485c591..2bb94b995 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1418,7 +1418,7 @@ class Fetcher implements CrawlConstants
      * Sets parameters for fetching based on provided info struct
      * ($info typically would come from the queue server)
      *
-     * @param array& $info struct with info about the kind of crawl, timestamp
+     * @param array &$info struct with info about the kind of crawl, timestamp
      * of index, crawl order, etc.
      */
     public function setCrawlParamsFromArray(&$info)
@@ -1706,7 +1706,7 @@ class Fetcher implements CrawlConstants
      * for which no content was downloaded so that they can be scheduled
      * to be crawled again.
      *
-     * @param array& $site_pages pages to sort
+     * @param array &$site_pages pages to sort
      * @return an array conisting of two array downloaded pages and
      * not downloaded pages.
      */
@@ -2126,8 +2126,8 @@ class Fetcher implements CrawlConstants
      * Adds thumbs for websites with a self::THUMB_URL field by downloading the
      *  linked to images and making a thumb from it.
      *
-     * @param array& $sites associative array of web sites information to add
-     *  thumbs   for. At least one site in the array should have a
+     * @param array &$sites associative array of web sites information to add
+     *  thumbs for. At least one site in the array should have a
      *  self::THUMB_URL field that we want have the thumb of
      */
     public function getPageThumbs(&$sites)
@@ -2246,7 +2246,7 @@ class Fetcher implements CrawlConstants
      * Then a crude estimate of the information contained in the links test:
      * strlen(gzip(text)) is used to extract the best remaining links.
      *
-     * @param array& $doc_info a string with a CrawlConstants::LINKS subarray
+     * @param array &$doc_info an array with a CrawlConstants::LINKS subarray
      * This subarray in turn contains url => text pairs.
      * @param string $field field for links default is CrawlConstants::LINKS
      * @param int $member_cache_time says how long allowed and disallowed url
@@ -2289,8 +2289,8 @@ class Fetcher implements CrawlConstants
      *
      * @param int $i index to copy to
      * @param array $site web page info to copy
-     * @param array& $summarized_site_pages array of summaries of web pages
-     * @param array& $stored_site_pages array of cache info of web pages
+     * @param array &$summarized_site_pages array of summaries of web pages
+     * @param array &$stored_site_pages array of cache info of web pages
      */
     public function copySiteFields($i, $site,
         &$summarized_site_pages, &$stored_site_pages)
@@ -2334,11 +2334,11 @@ class Fetcher implements CrawlConstants
      * documents to the summaried_size_pages and stored_site_pages
      * arrays constructed during the execution of processFetchPages()
      *
-     * @param int& $i index to begin adding subdocs at
+     * @param int &$i index to begin adding subdocs at
      * @param array $site web page that subdocs were from and from
      *     which some subdoc summary info is copied
-     * @param array& $summarized_site_pages array of summaries of web pages
-     * @param array& $stored_site_pages array of cache info of web pages
+     * @param array &$summarized_site_pages array of summaries of web pages
+     * @param array &$stored_site_pages array of cache info of web pages
      */
     public function processSubdocs(&$i, $site,
         &$summarized_site_pages, &$stored_site_pages)
diff --git a/src/executables/MediaUpdater.php b/src/executables/MediaUpdater.php
index 2172a7b98..bfea35d8c 100644
--- a/src/executables/MediaUpdater.php
+++ b/src/executables/MediaUpdater.php
@@ -41,7 +41,8 @@ use seekquarry\yioop\library\WikiParser;

 if (php_sapi_name() != 'cli' ||
     defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) {
-    echo "BAD REQUEST"; exit();
+    echo "BAD REQUEST";
+    exit();
 }
 /** We do want logging, but crawl model and others will try to turn off
  * if we don't set this
@@ -178,7 +179,11 @@ class MediaUpdater implements CrawlConstants
         L\crawlLog("Done checking Name Server for Media Updater properties");
     }
     /**
-     * @param array $jobs_list
+     * Given a list of MediaUpdate jobs, updates $this->jobs to contain
+     * instantiated objects of the corresponding jobs, requiring
+     * classes that have not been loaded yet as needed.
+     *
+     * @param array $jobs_list list of MediaUpdater jobs
      */
     public function loadJobs($jobs_list)
     {
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 910bbea39..c4dac30c3 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -116,14 +116,21 @@ class QueueServer implements CrawlConstants, Join
      */
     public $channel;
     /**
+     * Controls whether a repeating crawl (negative means no) is being done
+     * and if so its frequency in seconds
      * @var int
      */
     public $repeat_type;
     /**
+     * If a crawl quiescent period is being used with the crawl, then
+     * this stores the time of day at which that period starts
      * @var string
      */
     public $sleep_start;
     /**
+     * If a crawl quiescent period is being used with the crawl, then
+     * this property will be positive and indicate the number of seconds
+     * duration for the quiescent period.
      * @var string
      */
     public $sleep_duration;
@@ -2369,7 +2376,7 @@ class QueueServer implements CrawlConstants, Join
      * pastes more than MAX_FETCH_SIZE many urls into the initial seed sites
      * of a crawl in the  UI.
      *
-     * @param array& $sites array containing to crawl data
+     * @param array &$sites array containing to crawl data
      */
     public function dumpBigScheduleToSmall(&$sites)
     {
@@ -2917,7 +2924,7 @@ class QueueServer implements CrawlConstants, Join
      * This function is used to schedule slots for crawl-delayed host.
      *
      * @param int $index location to begin searching for an empty slot
-     * @param array& $arr list of slots to look in
+     * @param array &$arr list of slots to look in
      * @return int index of first available slot
      */
     public function getEarliestSlot($index, &$arr)
diff --git a/src/index.php b/src/index.php
index 3a797e438..d18f111b1 100644
--- a/src/index.php
+++ b/src/index.php
@@ -89,7 +89,7 @@ function bootstrap($web_site = null, $start_new_session = true)
         to images containing HTML. Also, might help against PRSSI attacks.
         */
         if ($start_new_session) {
-            if (checkCookieConsent($web_site)) {
+            if (checkCookieConsent()) {
                 $options = ['name' => C\SESSION_NAME,
                     'cookie_lifetime' => C\COOKIE_LIFETIME];
                 if (C\nsdefined("SECURE_COOKIE") && C\SECURE_COOKIE) {
@@ -197,9 +197,15 @@ function bootstrap($web_site = null, $start_new_session = true)
     $controller->processRequest();
 }
 /**
+ * Checks if a cookie consent form was obtained.
+ * This function returns true if a session cookie
+ * was received from the browser, or a form variable
+ * saying cookies are okay was received, or the cookie
+ * Yioop profile says the consent mechanism is disabled
  *
+ * @return bool cookie consent (true) else false
  */
-function checkCookieConsent($web_site)
+function checkCookieConsent()
 {
     if (C\PROFILE && intval(C\COOKIE_LIFETIME) > 0 &&
         empty($_COOKIE[C\SESSION_NAME])
@@ -215,7 +221,8 @@ function checkCookieConsent($web_site)
  * Developers can add new routes by creating a Routes class in
  * the app_dir with a static method getRoutes which should return
  * an associating array of incoming_path => handler function
- * @param object $web_site
+ * @param object $web_site used to send error pages if configuration
+ *  fails
  */
 function configureRewrites($web_site)
 {
@@ -317,8 +324,11 @@ function configureRewrites($web_site)
     }
 }
 /**
- * @param array $route_args
- * @return bool
+ * Used to handle routes that will eventually just serve
+ * files from the APP_DIR.
+ * These include files like css, scripts, suggest tries, images, and videos.
+ * @param array $route_args of url parts (split on slash)
+ * @return bool whether was able to compute a route or not
  */
 function routeAppFile($route_args)
 {
@@ -394,7 +404,11 @@ function routeAppFile($route_args)
     return false;
 }
 /**
- *
+ * Used to handle routes that will eventually just serve
+ * files from the BASE_DIR.
+ * These include files like css, scripts, images, and robots.txt.
+ * @param array $route_args of url parts (split on slash).
+ * @return bool whether was able to compute a route or not
  */
 function routeBaseFile($route_args)
 {
@@ -532,12 +546,16 @@ function routeFeeds($route_args)
  * @param bool $with_delim whether it should be terminated with nothing or
  *      ? or &
  * @param string $controller which controller is being used to access the
- *      feed: usuall admin or group
+ *      feed: usually admin or group
+ * @param bool $use_short_base_url whether to create the url as a relative
+ *   url using C\SHORT_BASE_URL or as a full url using  C\BASE_URL
+ *   (the latter is useful for mail notifications)
  * @return string url for the page in question
  */
-function feedsUrl($type, $id, $with_delim = false, $controller = "group")
+function feedsUrl($type, $id, $with_delim = false, $controller = "group",
+    $use_short_base_url = true)
 {
-    $base_url = C\SHORT_BASE_URL;
+    $base_url = ($use_short_base_url) ? C\SHORT_BASE_URL : C\BASE_URL;
     if (C\REDIRECTS_ON && $controller == 'group') {
         $delim = ($with_delim) ? "?" : "";
         $path = ($type == "") ? "group" : "$type/$id";
diff --git a/src/library/BTree.php b/src/library/BTree.php
index 576ed61b9..34bd02579 100644
--- a/src/library/BTree.php
+++ b/src/library/BTree.php
@@ -431,7 +431,7 @@ class BTree
     }
     /**
      * Deletes key-value pair from a leaf node in a B-Tree
-     * @param object& $node is the leaf node containing the key-value pair
+     * @param BTNode &$node is the leaf node containing the key-value pair
      * @param int $pos in node to delete
      */
     public function deleteFromLeaf(&$node, $pos)
@@ -455,7 +455,7 @@ class BTree
     }
     /**
      * Deletes key-value pair from a non-leaf node in a B-Tree
-     * @param object& $node is the non-leaf node containing the key-value pair
+     * @param BTNode &$node is the non-leaf node containing the key-value pair
      * @param int $pos link position in node to delete
      */
     public function deleteFromNonLeaf(&$node, $pos)
@@ -630,9 +630,9 @@ class BTree
      * Gives a child node an extra key by moving a key from the parent to the
      * child node, and by moving a key from the child's right sibling to the
      * parent node
-     * @param object& $parent is the parent node
-     * @param object& $child is the child node
-     * @param object& $next is the $child's right sibling node
+     * @param BTNode &$parent is the parent node
+     * @param BTNode &$child is the child node
+     * @param BTNode &$next is the $child's right sibling node
      * @param int $pos is the link from $parent to $child
      */
     public function adjustChildUsingRightSiblingAndParent(&$parent, &$child,
diff --git a/src/library/BloomFilterBundle.php b/src/library/BloomFilterBundle.php
index c112be516..8fd9d4150 100644
--- a/src/library/BloomFilterBundle.php
+++ b/src/library/BloomFilterBundle.php
@@ -133,7 +133,7 @@ class BloomFilterBundle
      * Removes from the passed array those elements $elt who either are in
      * the filter bundle or whose $elt[$field_name] is in the bundle.
      *
-     * @param array& $arr the array to remove elements from
+     * @param array &$arr the array to remove elements from
      * @param array $field_names if not null an array of field names of $arr
      *     to use to do filtering
      */
diff --git a/src/library/Bzip2BlockIterator.php b/src/library/Bzip2BlockIterator.php
index b9ca7e22f..1608deb5d 100644
--- a/src/library/Bzip2BlockIterator.php
+++ b/src/library/Bzip2BlockIterator.php
@@ -289,8 +289,8 @@ class BZip2BlockIterator
      * Computes a new bzip2 block portions and bits left over after adding
      * $bytes to the passed $block.
      *
-     * @param string& $block the block to add to
-     * @param int& $bits used to hold bits left over
+     * @param string &$block the block to add to
+     * @param int &$bits used to hold bits left over
      * @param string $bytes what to add to the bzip block
      * @param int $num_extra_bits how many extra bits there are
      */
diff --git a/src/library/ComputerVision.php b/src/library/ComputerVision.php
index 91802bd8c..1539190e3 100644
--- a/src/library/ComputerVision.php
+++ b/src/library/ComputerVision.php
@@ -42,16 +42,29 @@ require_once __DIR__ . "/Utility.php";
  */
 require_once __DIR__ . "/LocaleFunctions.php";
 /**
- *
+ * Class used to encapsulate various methods related to computer
+ * vision that might be useful for indexing documents. These
+ * include recognizing text in images
  */
 class ComputerVision
 {
+    /**
+     * Returns whether or not this Yioop system can recognize text in images
+     * Currently, this is done using the tesseract external program, so this
+     * method checks if a path to that program has been defined.
+     * @return bool whether a path to tesseract has been defined.
+     */
     public static function ocrEnabled()
     {
         return C\nsdefined("TESSERACT");
     }
     /**
+     * Given a file path to an image file and set of target languages, returns
+     * the text in those languages that the image contained
      *
+     * @param string $image_path a filepath to an image
+     * @param array $langs locale_tags of languages we want to extract text for
+     * @return string text extracted from image
      */
     public static function recognizeText($image_path,
         $langs = [C\DEFAULT_LOCALE])
diff --git a/src/library/ContextTagger.php b/src/library/ContextTagger.php
new file mode 100644
index 000000000..ed4482074
--- /dev/null
+++ b/src/library/ContextTagger.php
@@ -0,0 +1,413 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2020
+ * @filesource
+ */
+
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+/**
+ * Abstract, base context tagger class.
+ * A context tagger is used to apply a sequence of labels to a sequence of
+ * terms or characters of text based on a surrounding context. Context Taggers
+ * typically make use of n-gram context of a term such as the n/2 - terms
+ * before and after the term and maybe the earlier tags from a same phrase or
+ * sentence to make a prediction
+ *
+ * @author Chris Pollett
+ */
+abstract class ContextTagger
+{
+    /**
+     * Locale tag of language this recognizer is for
+     * @var string
+     */
+    public $lang;
+    /**
+     * The name of the file where the tagging model should be stored and read
+     * from
+     * @var string
+     */
+    public $tagger_file = "tagger.txt.gz";
+    /**
+     * Complete file system path to the file where the tagging model should be
+     * stored and read from
+     * @var string
+     */
+    public $tagger_path = "";
+    /**
+     * 2D weights for features involving the prior two words to the
+     * current word and the next two words after the current word
+     * For a given word position, one has a vector that gives the
+     * value for each term in the complete training term set, unknown term set,
+     * and rule based tag term set, what its weight is
+     * Determined during training
+     * @var array
+     */
+    public $word_feature;
+    /**
+     * The bias vector for features we are training
+     *
+     * Determined during training
+     * @var array
+     */
+    public $bias;
+    /**
+     * The weights for features involving the prior two tags to the
+     * current word whose tag we are trying to determine
+     * Determined during training
+     * @var array
+     */
+    public $tag_feature;
+    /**
+     * Array of strings for each possible tag for a term
+     * associated as [tag => tag index]
+     * @var array
+     */
+    public $tag_set;
+    /**
+     * Minimum allowed value for a weight component
+     * @var float
+     */
+    public $min_w;
+    /**
+     * Maximum allowed value for a weight component
+     * @var float
+     */
+    public $max_w;
+    /**
+     * Tokenizer for the language this tagger tags for
+     * @var Tokenizer
+     */
+    public $tokenizer;
+    /**
+     * Constructor for the ContextTagger.
+     * Sets the language this tagger tags for and sets up the path for
+     * where it should be stored
+     * @param string $lang locale tag of the language this tagger tags for
+     */
+    public function __construct($lang)
+    {
+        $lang = str_replace("-", "_", $lang);
+        $this->lang = $lang;
+        $this->tagger_path = C\LOCALE_DIR . "/$lang/resources/" .
+            $this->tagger_file;
+        $this->tokenizer = PhraseParser::getTokenizer($lang);
+    }
+    /**
+     * Converts training data from the format tagged sentence with terms of the
+     * form term_tag into a pair of arrays [[terms_in_sentence],
+     *  [tags_in_sentence]]
+     * @param array $text_files array of file names of training data files
+     * @param string $term_tag_separator separator used to separate term and tag
+     *  for terms in input sentence
+     * @param function $term_callback callback function applied to a term
+     *  before adding term to sentence term array
+     * @param function $tag_callback callback function applied to a part of
+     *  speech tag  before adding tag to sentence tag array
+     * @return array of separated sentences, each sentence having the format of
+     *  [[terms...], [tags...]]
+     *  Currently, the training data needs to fit Chinese Treebank format:
+     *  term followed by an underscore and followed by the tag
+     *  e.g. "新_VA 的_DEC 南斯拉夫_NR 会国_NN"
+     *  To adapt to other languages, some modifications are needed
+     */
+    public static function processTexts($text_files, $term_tag_separator = "_",
+        $term_callback = null, $tag_callback = null)
+    {
+        $ret = [];
+        foreach($text_files as $text_file) {
+            if (file_exists($text_file)) {
+                $fh = fopen($text_file, "r");
+                while (!feof($fh))  {
+                    $line = fgets($fh);
+                    if(strpos($line, '<') !== false) {
+                        continue;
+                    }
+                    $word_tag_pairs = preg_split("/[\s ]+/u", $line);
+                    if (!count($word_tag_pairs)) {
+                        continue;
+                    }
+                    $ret[] = [];
+                    $ret[count($ret) - 1][0] = [];
+                    $ret[count($ret) - 1][1] = [];
+                    foreach ($word_tag_pairs as $word_tag_pair) {
+                        $t = explode($term_tag_separator, $word_tag_pair);
+                        if (count($t) == 2) {
+                            $ret[count($ret) - 1][0][] =
+                                $term_callback ? $term_callback($t[0]) : $t[0];
+                            $ret[count($ret)-1][1][] =
+                                $tag_callback ? $tag_callback($t[1]) : $t[1];
+                        }
+                    }
+                }
+                fclose($fh);
+            }
+        }
+        return $ret;
+    }
+    /**
+     * Maps a term to a corresponding key if the term matches some simple
+     * pattern such as being a number
+     * @param string $term is the term to be checked
+     * @return mixed either the int key for those matrices or just the term
+     *  itself if the tokenizer does not have the method getPosKey for the
+     *  current language
+     */
+    public function getKey($term)
+    {
+        if (!empty($this->tokenizer) && method_exists($this->tokenizer,
+            "getPosKey")) {
+            return $this->tokenizer::getPosKey($term);
+        }
+        return $term;
+    }
+    /**
+     * Given a sentence (array $terms), find the key for the term at position
+     * $index
+     * @param int $index position of term to get key for
+     * @param array $terms an array of terms typically from and in the order of
+     *  a sentence
+     * @return mixed key position in word_feature weights and bias arrays
+     *  could be either an int, or the term itself, or the simple rule
+     *  based part of speech it belongs to
+     */
+    public function getIndex($index, $terms)
+    {
+        if ($index < 0) {
+            $k = $index - 2;
+        } else if ($index >= count($terms)) {
+            $k = $index - count($terms) - 2;
+        } else {
+            $k = $this->getKey($terms[$index]);
+        }
+        return $k;
+    }
+    /**
+     * Save the trained weights to disk
+     */
+    public function saveWeights()
+    {
+        $out = [];
+        $out["min_w"] = $this->min_w;
+        $out["max_w"] = $this->max_w;
+        $out["w"] = [];
+        foreach(array_keys($this->word_feature) as $key) {
+            $out["w"][$key] = $this->packW($key);
+        }
+        foreach(array_keys($this->tag_feature) as $key) {
+            $out["t"][$key] = $this->packT($key);
+        }
+        $out["b"] = $this->packB();
+        $out["tag_set"] = $this->tag_set;
+        echo "Saving...";
+        file_put_contents($this->tagger_path,
+            gzencode(serialize($out), 9));
+        echo " ok\n";
+    }
+    /**
+     * Load the trained data from disk
+     * @param bool $for_training whether we are continuing to train (true) or
+     *  whether we are using the loaded data for prediction
+     */
+    public function loadWeights($for_training = false)
+    {
+        if (!file_exists($this->tagger_path)) {
+            echo "$this->tagger_path does not exist!";
+            exit();
+        }
+        $f = unserialize(gzdecode(file_get_contents($this->tagger_path)),
+            ['allowed_classes' => false]);
+        $this->word_feature = $f["w"];
+        $this->tag_feature = $f["t"] ?? [];
+        $this->bias = $f["b"];
+        $this->min_w = $f["min_w"];
+        $this->max_w = $f["max_w"];
+        $this->tag_set = $f["tag_set"];
+        if ($for_training) {
+            foreach(array_keys($this->word_feature) as $key) {
+                $this->word_feature[$key] = $this->unpackW($key);
+            }
+            foreach(array_keys($this->tag_feature) as $key) {
+                $this->tag_feature[$key] = $this->unpackT($key);
+            }
+            $this->bias = $this->unpackB();
+        }
+    }
+    /**
+     * Pack the bias vector represented as an array into a string
+     * @return string the bias vector packed as a string
+     */
+    public function packB()
+    {
+        return pack("f*", ...$this->bias);
+    }
+    /**
+     * Unpack the bias represented as a string into an array
+     * @return array the bias vector unpacked from a string
+     */
+    public function unpackB()
+    {
+        return array_merge(unpack("f" . strval(count($this->tag_set)),
+            $this->bias));
+    }
+    /**
+     * Pack the tag_feature represented as an array into a string
+     * @param int $key in tag_feature set corresponding to a part of speech
+     * @return string packed tag_feature vector
+     */
+    public function packT($key)
+    {
+        return pack("f*", ...$this->tag_feature[$key]);
+    }
+    /**
+     * Unpack the tag_feature represented as a string into an array
+     * @param int $key in tag_feature set corresponding to a part of speech
+     * @return array unpacked tag_feature vector
+     */
+    public function unpackT($key)
+    {
+        // reads one float per tag in $this->tag_set; array_merge() renumbers
+        // unpack()'s 1-indexed result so that it starts at index 0
+        return array_merge(unpack("f" . strval(count($this->tag_set)),
+            $this->tag_feature[$key]));
+    }
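+    /**
+     * Pack the weights matrix stored under a key of $this->word_feature into
+     * a string. Each weight is quantized to an unsigned 16-bit value using
+     * $this->min_w and $this->max_w
+     * @param mixed $key key of $this->word_feature (getW() looks entries up
+     *  by term) whose weights should be packed
+     * @return string the packed weights matrix
+     */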
+    public function packW($key)
+    {
+        // Weights are quantized linearly from [min_w, max_w] onto the
+        // unsigned 16-bit range [0, 65535] that pack("S") stores
+        $range = $this->max_w - $this->min_w;
+        $bin_str = "";
+        foreach ($this->word_feature[$key] as $row) {
+            foreach ($row as $weight) {
+                // if all weights are equal the range is 0; store level 0
+                // rather than dividing by zero (unpackW()/getW() then
+                // recover min_w, which is the common weight value)
+                $level = ($range == 0) ? 0 :
+                    intval(65535 * ($weight - $this->min_w) / $range);
+                $bin_str .= pack("S", $level);
+            }
+        }
+        return $bin_str;
+    }
+    /**
+     * Unpack the weight matrix stored under a given key of
+     * $this->word_feature (getW() looks entries up by term). This is a
+     * 5 x tag_set_size matrix whose 5 rows correspond to the locations
+     * -2, -1, 0, 1, 2 in a 5-gram.
+     * The (i, j) entry is the weight associated with the tag of index j when
+     * the keyed term is at location i of the 5-gram
+     * @param mixed $key key in word_feature whose packed weights to unpack
+     * @return array of weights corresponding to that key
+     */
+    public function unpackW($key)
+    {
+        $weights = [];
+        $size = count($this->tag_set);
+        for ($i = 0; $i < 5; $i++) {
+            // row $i holds $size unsigned 16-bit values (2 bytes each), so
+            // it starts at byte 2 * $i * $size. Rows are keyed -2..2 and
+            // array_merge() renumbers unpack()'s 1-indexed result from 0
+            $weights[$i - 2] = array_merge(unpack("S" . strval($size),
+                $this->word_feature[$key], 2 * $i * count($this->tag_set)));
+            for($j = 0; $j < $size; $j++) {
+                // map the stored level in [0, 65535] back to [min_w, max_w]
+                $weights[$i - 2][$j] = ($weights[$i - 2][$j] / 65535) *
+                    ($this->max_w - $this->min_w) + $this->min_w;
+            }
+        }
+        return $weights;
+    }
+    /**
+     * Get the bias value for a tag
+     * @param int $tag_index the index of tag's value within the bias string
+     * @return float bias value for tag
+     */
+    public function getB($tag_index)
+    {
+        // $this->bias is read as a packed string here with 4 bytes per tag;
+        // unpack() results are 1-indexed, hence the [1]
+        return unpack("f", $this->bias, $tag_index * 4)[1];
+    }
+    /**
+     * Set the bias value for tag
+     * @param int $tag_index the index of tag's value within the bias string
+     * @param float $value bias value to associate to tag
+     */
+    public function setB($tag_index, $value)
+    {
+        // overwrite the 4 bytes at offset 4 * $tag_index of the packed bias
+        // string with the packed float $value
+        $this->bias = substr_replace($this->bias, pack("f", $value),
+            $tag_index * 4, 4);
+    }
+    /**
+     * Get the tag feature value for tag
+     * @param int $key in tag_feature set corresponding to a part of speech
+     * @param int $tag_index the index of tag's value within the tag feature
+     *  string
+     * @return float tag feature value for tag
+     */
+    public function getT($key, $tag_index)
+    {
+        // $this->tag_feature[$key] is read as a packed string with 4 bytes
+        // per tag; unpack() results are 1-indexed, hence the [1]
+        return unpack("f", $this->tag_feature[$key], $tag_index * 4)[1];
+    }
+    /**
+     * Get the weight value for term at position for tag
+     * @param string $term to get weight of
+     * @param int $position of term within the current 5-gram
+     * @param int $tag_index index of the particular tag we are trying to see
+     *  the term's weight for
+     * @return float
+     */
+    public function getW($term, $position, $tag_index)
+    {
+        // The packed string for a term holds one row per position -2..2,
+        // each row having count($this->tag_set) unsigned 16-bit values of
+        // 2 bytes, so this is the byte offset of the requested entry
+        $offset = 2 * ($position + 2) * count($this->tag_set) +
+            $tag_index * 2;
+        // unpack() results are 1-indexed, hence the [1]
+        $level = unpack("S", $this->word_feature[$term], $offset)[1];
+        // map the stored level in [0, 65535] back onto [min_w, max_w]
+        return $level / 65535 * ($this->max_w - $this->min_w) +
+            $this->min_w;
+    }
+    /**
+     * Uses text files to train a tagger for terms or chars in a document
+     * @param mixed $text_files with training data. These can be a file or
+     *  an array of file names.
+     * @param string $term_tag_separator separator used to separate term and tag
+     *  for terms in input sentence
+     * @param float $learning_rate learning rate when cycling over data trying
+     *  to minimize the cross-entropy loss in the prediction of the tag of the
+     *  middle term.
+     * @param int $num_epoch number of times to cycle through the
+     *  complete data set. Default value of 1200 seems to avoid overfitting
+     * @param callable $term_callback callback function applied to a term
+     *  before adding term to sentence term array as part of processing and
+     *  training with a sentence.
+     * @param callable $tag_callback callback function applied to a part of
+     *  speech tag before adding tag to sentence tag array as part of
+     *  processing and training with a sentence.
+     * @param bool $resume presumably whether to continue training from
+     *  previously saved weights rather than starting over (not determinable
+     *  from this declaration; confirm against the implementing subclasses)
+     */
+    public abstract function train($text_files, $term_tag_separator = "-",
+        $learning_rate = 0.1, $num_epoch = 1200, $term_callback = null,
+        $tag_callback = null, $resume = false);
+    /**
+     * Predicts a tagging for all elements of $sentence
+     *
+     * @param mixed $sentence is an array of segmented terms/chars
+     *  or a string that will be split on white space
+     * @return array predicted tags. The ith entry in the returned results
+     *  is the tag of ith element of $sentence
+     */
+    public abstract function predict($sentence);
+}
diff --git a/src/library/ContextWeightedNamedEntityRecognizer.php b/src/library/ContextWeightedNamedEntityRecognizer.php
deleted file mode 100644
index e6b11d88e..000000000
--- a/src/library/ContextWeightedNamedEntityRecognizer.php
+++ /dev/null
@@ -1,606 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- * @author Xianghong Sun sxh19911230@gmail.com
- * @license https://www.gnu.org/licenses/ GPL3
- * @link https://www.seekquarry.com/
- * @copyright 2009 - 2019
- * @filesource
- */
-namespace seekquarry\yioop\library;
-
-use seekquarry\yioop\configs as C;
-use seekquarry\yioop\locale\zh_CN\resources as ZH;
-
-/**
- * Machine learning based NER tagger. Typically, ContextWeightedNERTagger.php
- * can train the language with some dataset and predict
- * the tag given a list of word.
- *
- * @author Xianghong Sun
- */
-class ContextWeightedNamedEntityRecognizer
-{
-    /**
-     * Current Language, only tested on Simplified Chinese
-     * Might be extensable for other languages in the furture
-     * @var string
-     */
-    public $lang;
-    /**
-     * The word weight feature
-     * y = wx + b
-     * Generized by training method
-     * @var array
-     */
-    public $word_feature;
-    /**
-     * The tag weight feature
-     * y = wx + b
-     * Generized by training method
-     * @var array
-     */
-    public $tag_feature;
-    /**
-     * The bias
-     * y = wx + b
-     * Generized by training method
-     * @var array
-     */
-    public $bias;
-     /**
-     * All Possiable tag set
-     * Generized by training method
-     * @var associative array [tag => tag index]
-     */
-    private $tag_set;
-     /**
-     * The constructer of the pos tagger
-     * To extend to other languages, some work are needed:
-     * Define $this->getKeyImpl, $this->rule_defined_key
-     * See Chinese example.
-     * @param @string $lang describes current langauge
-     * @param @book $packed describes how weight and bias would look like
-     */
-    public function __construct($lang)
-    {
-        switch($lang) {
-            case("zh_CN"):
-            case("zh-CH"):
-                $this->lang = "zh_CN";
-                break;
-            default:
-                $this->lang = $lang;
-        }
-    }
-
-    /**
-     * A function that process the trainning data
-     * @param @mixed $text_files can be a file or an array of file names
-     * @return @array of seperated sentences, each sentenfce have the format of
-     *               [[words...],[tags...]]
-     * Data format MSRA:
-     * 我们/o 是/o 受到/o 郑振铎/nr 先生/o 、/o 阿英/nr 先生/o 著作/o 的/o
-     * 启示/o ,/o 从/o 个人/o 条件/o 出发/o ,/o 瞄准/o 现代/o 出版/o 史/o
-     * 研究/o 的/o 空白/o ,/o 重点/o 集/o 藏/o 解放区/o 、/o 国民党/nt 毁/o
-     * 禁/o 出版物/o 。/o
-     * To adapt to other language, some modifications are needed
-     */
-    public static function processTexts($text_files, $term_tag_splier="/",
-        $term_process = null, $tag_process = null)
-    {
-        $ret=[];
-        foreach($text_files as $text_file) {
-            if (file_exists($text_file)) {
-                $fn = fopen($text_file,"r");
-                while(! feof($fn))  {
-                    $line = fgets($fn);
-                    if(strpos($line, '<') !== false) {
-                        continue;
-                    }
-                    $word_tag_pairs = preg_split("/[\s ]+/u", $line);
-                    if (!count($word_tag_pairs)) {
-                        continue;
-                    }
-                    $ret[] = [];
-                    $ret[count($ret)-1][0] = [];
-                    $ret[count($ret)-1][1] = [];
-                    foreach ($word_tag_pairs as $word_tag_pair) {
-                        $t = explode("/", $word_tag_pair);
-                        if (count($t) == 2) {
-                            $tag = $tag_process ? $tag_process($t[1]) : $t[1];
-                            foreach(preg_split('//u', $t[0], null,
-                                PREG_SPLIT_NO_EMPTY) as $ch) {
-                                $ret[count($ret)-1][0][] =
-                                    $term_process ? $term_process($ch) : $ch;
-                                $ret[count($ret)-1][1][] = $tag;
-                            }
-                        }
-                    }
-                }
-                fclose($fn);
-            }
-        }
-        return $ret;
-    }
-
-    /**
-    * Function to train a data
-    * Notice: This function might run very long time, depending on training set
-    * @param @mixed $text_files are training data
-    *               can be a file or an array of file names
-    * @param @float $learning_rate
-    * @param @int  $max_epoch 1200 might be a good one,
-    *           the weight will overfit if it's greater than this number
-    * @param @function $term_process is a preporcess on term before training
-    * @param @function $tag_process is a preporcess on tag before training
-    */
-    public function train($text_files, $learning_rate=0.1, $max_epoch = 1200,
-        $term_process = null, $tag_process = null)
-    {
-        if (is_string($text_files)) {
-            $text_files = [$text_files];
-        }
-        echo "Reading files\n";
-        // term_tag_sentences[sentence#]=[[words...],[tags...]]
-        $term_tag_sentences = self::processTexts($text_files,
-            $term_process, $tag_process);
-        $this->word_feature=[];
-        $this->tag_set=[];
-        $tag_index = 0;
-        for ($i = -4; $i <= -1; $i++) {
-            $this->word_feature[$i] = [];
-        }
-        foreach ($term_tag_sentences as $term_tag_pairs) {
-            $terms=$term_tag_pairs[0];
-            $tags=$term_tag_pairs[1];
-            $this->tag_feature["start"]=[];
-            $this->tag_feature["start-start"]=[];
-            for ($i = 0; $i < count($terms); $i++) {
-                if (!isset($this->tag_set[$tags[$i]])) {
-                    $this->tag_set[$tags[$i]] = $tag_index++;
-                }
-                if ($i == 0) {}
-                else if ($i == 1) {
-                    if (!isset($this->tag_feature["start-".$tags[$i-1]])) {
-                        $this->tag_feature["start-".$tags[$i-1]]=[];
-                    }
-                    if (!isset($this->tag_feature[$tags[$i-1]])) {
-                        $this->tag_feature[$tags[$i-1]]=[];
-                    }
-                } else {
-                    if (!isset($this->tag_feature[$tags[$i-2] . "-" .
-                        $tags[$i-1]])) {
-                        $this->tag_feature[$tags[$i-2]."-".$tags[$i-1]] = [];
-                    }
-                    if (!isset($this->tag_feature[$tags[$i-1]])) {
-                        $this->tag_feature[$tags[$i-1]]=[];
-                    }
-                }
-                if (!isset($this->word_feature[$terms[$i]])) {
-                    $this->word_feature[$terms[$i]] = [];
-                }
-            }
-        }
-        foreach (array_keys($this->word_feature) as $key) {
-            for ($i=-2; $i<=2;$i++) {
-                if (!isset($this->word_feature[$key][$i])) {
-                    $this->word_feature[$key][$i] = [];
-                }
-                foreach($this->tag_set as $possiable_tag => $tag_index) {
-                    if (!isset($this->word_feature[$key][$i][$tag_index])) {
-                        $this->word_feature[$key][$i][$tag_index] = 0;
-                    }
-                }
-            }
-        }
-        foreach (array_keys($this->tag_feature) as $key) {
-            foreach($this->tag_set as $possiable_tag => $tag_index) {
-                if (!isset($this->tag_feature[$key][$tag_index])) {
-                    $this->tag_feature[$key][$tag_index] = 0;
-                }
-            }
-        }
-        foreach($this->tag_set as $possiable_tag => $tag_index) {
-            if (!isset($this->bias[$tag_index])) {
-                $this->bias[$tag_index] = 0;
-            }
-        }
-        echo "Training...\n";
-        //train the weight
-        $cross_entropy_loss = 1;
-        $pre_cross_entropy_loss = 2;
-        for ($epoch = 0; ($epoch < $max_epoch) &&
-            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001;
-            $epoch++) {
-            $this->min_w=0;
-            $this->max_w=0;
-            $time = time();
-            $dy_dw = [];
-            $dy_dw_n = [];
-            $pre_cross_entropy_loss = $cross_entropy_loss;
-            $cross_entropy_loss = 0;
-            $cross_entropy_loss_n = 0;
-
-            $dy_db=[];
-            $dy_db_n=[];
-
-            $dy_dt=[];
-            $dy_dt_n=[];
-            for($i = 0; $i < count($this->tag_set); $i++) {
-                $dy_db[$i] = 0;
-                $dy_db_n[$i] = 0;
-            }
-            //for each sentence
-            foreach ($term_tag_sentences as $term_tag_pairs) {
-                $terms=$term_tag_pairs[0];
-                $tags=$term_tag_pairs[1];
-                for ($i = 0; $i < count($terms); $i++) {
-                    $k=[];
-                    for ($j=-2; $j<=2;$j++) {
-                        $k[$j]= $this->getIndex($i+$j,$terms);
-                    }
-                    foreach ($this->tag_set as $possiable_tag => $tag_index) {
-                        $equality = $possiable_tag == $tags[$i] ? 1 : 0;
-                        $sum=0;
-                        //5 words including itself
-                        for ($j=-2; $j<=2;$j++) {
-                            $sum += $this->word_feature[$k[$j]][$j][$tag_index];
-                        }
-                        //previous 2 tags
-                        if ($i == 0) {
-                            $tf1="start";
-                            $tf2="start-start";
-                        } else if ($i == 1) {
-                            $tf1=$tags[$i-1];
-                            $tf2="start-".$tags[$i-1];
-                        } else {
-                            $tf1=$tags[$i-1];
-                            $tf2=$tags[$i-2]."-".$tags[$i-1];
-                        }
-                        $sum += $this->tag_feature[$tf1][$tag_index];
-                        $sum += $this->tag_feature[$tf2][$tag_index];
-                        //bias
-                        $sum += $this->bias[$tag_index];
-                        $sigmoid = 1 / (1 + exp(-1 * $sum));
-                        for ($j=-2; $j<=2;$j++) {
-                            if (!isset($dy_dw[$k[$j]])) {
-                                $dy_dw[$k[$j]] = [];
-                                $dy_dw_n[$k[$j]] = [];
-                            }
-                            if (!isset($dy_dw[$k[$j]][$j])) {
-                                $dy_dw[$k[$j]][$j] = [];
-                                $dy_dw_n[$k[$j]][$j] = [];
-                            }
-                            if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
-                                $dy_dw[$k[$j]][$j][$tag_index] = 0;
-                                $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
-                            }
-
-                            $dy_dw[$k[$j]][$j][$tag_index] +=
-                                ($sigmoid - $equality);
-                            $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
-
-                        }
-                        //dy_dt
-                        if (!isset($dy_dt[$tf1])) {
-                            $dy_dt[$tf1] = [];
-                            $dy_dt_n[$tf1] = [];
-                        }
-                        if (!isset($dy_dt[$tf1][$tag_index])) {
-                            $dy_dt[$tf1][$tag_index] = 0;
-                            $dy_dt_n[$tf1][$tag_index] = 0;
-                        }
-                        if (!isset($dy_dt[$tf2])) {
-                            $dy_dt[$tf2] = [];
-                            $dy_dt_n[$tf2] = [];
-                        }
-                        if (!isset($dy_dt[$tf2][$tag_index])) {
-                            $dy_dt[$tf2][$tag_index] = 0;
-                            $dy_dt_n[$tf2][$tag_index] = 0;
-                        }
-                        $dy_dt[$tf1][$tag_index] += ($sigmoid - $equality);
-                        $dy_dt_n[$tf1][$tag_index] += 1;
-                        $dy_dt[$tf2][$tag_index] += ($sigmoid - $equality);
-                        $dy_dt_n[$tf2][$tag_index] += 1;
-                        //dy_db
-                        $dy_db[$tag_index] += ($sigmoid - $equality);
-                        $dy_db_n[$tag_index] += 1;
-                        $cross_entropy_loss+=
-                            - $equality*log($sigmoid)
-                            - (1-$equality)*log(1-$sigmoid);
-                        $cross_entropy_loss_n++;
-                    }
-                }
-            }
-            $cross_entropy_loss /= $cross_entropy_loss_n;
-            $duration = time() - $time;
-            echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}".
-                " Takes {$duration} seconds\n";
-            foreach ($dy_dw as $i =>$v1) {
-                foreach ($v1 as $j =>$v2) {
-                    foreach ($v2 as $k =>$v3) {
-                        $this->word_feature[$i][$j][$k] -=
-                            $dy_dw[$i][$j][$k] /
-                            $dy_dw_n[$i][$j][$k] *
-                            $learning_rate;
-                        if ($this->word_feature[$i][$j][$k] < $this->min_w) {
-                            $this->min_w = $this->word_feature[$i][$j][$k];
-                        }
-                        if ($this->word_feature[$i][$j][$k] > $this->max_w) {
-                            $this->max_w = $this->word_feature[$i][$j][$k];
-                        }
-                    }
-                }
-            }
-            foreach ($dy_dt as $i => $v1) {
-                foreach ($v1 as $j => $v2) {
-                    $this->tag_feature[$i][$j] -=
-                        $dy_dt[$i][$j] /
-                        $dy_dt_n[$i][$j] *
-                        $learning_rate;
-                }
-            }
-            foreach ($dy_db as $k => $v) {
-                $this->bias[$k]-=
-                    $dy_db[$k] /
-                    $dy_db_n[$k] *
-                    $learning_rate;
-            }
-            if ($epoch % 10 == 9 ) {
-                $this->save_weight();
-            }
-        }
-        $this->save_weight();
-        return true;
-    }
-    /**
-     * The primary function to predit the tag
-     * @param mixed $sentence is an array of segmented words/terms
-     *     or a string needs to be splited by $splitter
-     * @param function $splitter to process $sentence if $sentence
-     *                 is a string
-     * @return @array all predicted named entities with its tag
-     *                ex. [["郑振铎","nr"],["国民党","nt"]]
-     */
-    public function predict($sentence, $delimiter="",$splitter=null)
-    {
-        if (!is_array($sentence)) {
-            if ($sentence == "") {
-                $terms=[];
-            } else {
-                $terms=preg_split("/[\s]+/",$sentence);
-            }
-        } else {
-            $terms=$sentence;
-        }
-        if (!count($terms)) {
-            return [];
-        }
-        if (!$this->word_feature) {
-            $this->load_weight();
-        }
-        $result = [];
-        for($i = 0; $i < count($terms); $i++) {
-            $term = $terms[$i];
-            $score =[];
-            foreach($this->tag_set as $possiable_tag => $tag_index) {
-                $score[$possiable_tag]=0;
-                for ($j=-2; $j <=2; $j++) {
-                    $k=$this->getIndex($i+$j, $terms);
-                    if (isset($this->word_feature[$k])) {
-                        $score[$possiable_tag] +=
-                                $this->getW($k,$j,$tag_index);
-                    }
-                }
-                if ($i == 0) {
-                    $tf1="start";
-                    $tf2="start-start";
-                } else if ($i == 1) {
-                    $tf1=$result[$i-1];
-                    $tf2="start-".$result[$i-1];
-                } else {
-                    $tf1=$result[$i-1];
-                    $tf2=$result[$i-2]."-".$result[$i-1];
-                }
-                $score[$possiable_tag] += $this->getT($tf1,$tag_index);
-                $score[$possiable_tag] += $this->getT($tf2,$tag_index);
-                $score[$possiable_tag] += $this->getB($tag_index);
-            }
-            $result[]=array_keys($score, max($score))[0];
-        }
-        $pre_tag='o';
-        $current_entity=null;
-        $ret=[];
-        for ($i = 0; $i < count($terms); $i++) {
-            if ($pre_tag != $result[$i] && $pre_tag != "o") {
-                if (mb_strlen($current_entity) < 10) {
-                    $ret[]=[$current_entity,$pre_tag];
-                }
-                $current_entity=null;
-            }
-            if ($result[$i] != "o") {
-                if ($current_entity) {
-                    $current_entity.=$delimiter.$terms[$i];
-                } else {
-                    $current_entity=$terms[$i];
-                }
-            }
-            $pre_tag=$result[$i];
-        }
-        return $ret;
-    }
-    /**
-     * A list of private helper functions
-     * Given a setence ($term), find the key at position $index
-     */
-    private function getIndex($index, $terms)
-    {
-        if ($index < 0) $k = $index - 2;
-        else if ($index >= count($terms)) {
-            $k = $index - count($terms) - 2;
-        }
-        else {
-            $k = $terms[$index];
-        }
-        return $k;
-    }
-
-    /**
-     * save the trained weight to disk
-     */
-    private function save_weight()
-    {
-        $out_file = C\LOCALE_DIR . "/{$this->lang}/resources/ner_weight.txt.gz";
-        $out = [];
-        $out["min_w"] = $this->min_w;
-        $out["max_w"] = $this->max_w;
-        $out["w"]=[];
-        foreach(array_keys($this->word_feature) as $key) {
-            $out["w"][$key] = $this->pack_w($key);
-        }
-        foreach(array_keys($this->tag_feature) as $key) {
-            $out["t"][$key] = $this->pack_t($key);
-        }
-        $out["b"] = $this->pack_b();
-        $out["tag_set"] = $this->tag_set;
-        echo "Saving...";
-        file_put_contents($out_file,
-            gzencode(serialize($out),9));
-        echo " ok\n";
-    }
-    /**
-     * load the trained weight from disk
-     */
-    private function load_weight($trainning_load=false)
-    {
-        $dic_file
-            = C\LOCALE_DIR . "/{$this->lang}/resources/ner_weight.txt.gz";
-        if (!file_exists($dic_file)) {
-            echo "$dic_file does not exist!";
-            exit();
-        }
-        $f = unserialize(gzdecode(file_get_contents($dic_file))
-            ,['allowed_classes' => false]);
-        $this->word_feature=$f["w"];
-        $this->tag_feature=$f["t"];
-        $this->bias=$f["b"];
-        $this->min_w=$f["min_w"];
-        $this->max_w=$f["max_w"];
-        $this->tag_set=$f["tag_set"];
-        if ($trainning_load) {
-            foreach(array_keys($this->word_feature) as $key) {
-                $this->word_feature[$key] = $this->unpack_w($key);
-            }
-            foreach(array_keys($this->tag_feature) as $key) {
-                $this->tag_feature[$key] = $this->unpack_t($key);
-            }
-            $this->bias = $this->unpack_b();
-        }
-    }
-    /**
-     * Pack the bias
-     */
-    private function pack_b()
-    {
-        return pack("f*", ...$this->bias);
-    }
-    /**
-     * Unpack the bias
-     */
-    private function unpack_b()
-    {
-        return array_merge(unpack("f" . strval(count($this->tag_set)),
-            $this->bias));
-    }
-    /**
-     * Pack the tag_feature
-     */
-    private function pack_t($key)
-    {
-        return pack("f*", ...$this->tag_feature[$key]);
-    }
-    /**
-     * Unpack the tag_feature
-     */
-    private function unpack_t($key)
-    {
-        return array_merge(unpack("f".strval(count($this->tag_set)),
-            $this->tag_feature[$key]));
-    }
-    /**
-     * Pack the word_feature
-     */
-    private function pack_w($key)
-    {
-        $bin_str = "";
-        foreach($this->word_feature[$key] as $i => $t) {
-            foreach($t as $u) {
-                $v = 65535 * ($u-$this->min_w) / ($this->max_w-$this->min_w);
-                $bin_str .= pack("S", intval($v));
-            }
-        }
-        return $bin_str;
-    }
-    /**
-     * Unpack the word_feature
-     */
-    private function unpack_w($key)
-    {
-        $tmp = [];
-        $size = count($this->tag_set);
-        for ($i = 0; $i < 5; $i++) {
-            $tmp[$i-2] = array_merge(unpack("S".strval($size),
-                $this->word_feature[$key], 2*$i*count($this->tag_set)));
-            for($j = 0; $j < $size; $j++) {
-                $tmp[$i-2][$j] = $tmp[$i-2][$j] / 65535
-                    * ($this->max_w-$this->min_w) + $this->min_w;
-            }
-        }
-        return $tmp;
-    }
-    /**
-     * Get the bias value for tag
-     */
-    private function getB($tag_index)
-    {
-        return unpack("f",$this->bias,$tag_index*4)[1];
-    }
-    /**
-     * Get the bias value for tag
-     */
-    private function getT($key, $tag_index)
-    {
-        return unpack("f",$this->tag_feature[$key],$tag_index*4)[1];
-    }
-    /**
-     * Get the weight value for term at postion for tag
-     */
-    private function getW($term, $position, $tag_index)
-    {
-        $t = unpack("S",$this->word_feature[$term],
-            2*($position+2)*count($this->tag_set)+$tag_index*2)[1]
-            / 65535
-            * ($this->max_w-$this->min_w) + $this->min_w;;
-        return $t;
-    }
-}
diff --git a/src/library/ContextWeightedPosTagger.php b/src/library/ContextWeightedPosTagger.php
deleted file mode 100644
index d354d3fde..000000000
--- a/src/library/ContextWeightedPosTagger.php
+++ /dev/null
@@ -1,601 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- * Copyright (C) 2009 - 2019  Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- * @author Xianghong Sun sxh19911230@gmail.com
- * @license https://www.gnu.org/licenses/ GPL3
- * @link https://www.seekquarry.com/
- * @copyright 2009 - 2019
- * @filesource
- */
-namespace seekquarry\yioop\library;
-
-use seekquarry\yioop\configs as C;
-use seekquarry\yioop\locale\zh_CN\resources as ZH;
-
-/**
- * Machine learning based POS tagger. Typically, ContextWeightedPosTagger.php
- * can train the language with some dataset and predict
- * the tag given a list of word.
- *
- * Instruction to add a new language:
- * Add a switch case in the constructor.
- * Define the following functions:
- * getKeyImpl
- * See the class function 'getKey' for more information
- *
- * @author Xianghong Sun
- */
-class ContextWeightedPosTagger
-{
-    /**
-     * Current Language, only tested on Simplified Chinese
-     * Might be extensable for other languages in the furture
-     * @var string
-     */
-    public $lang;
-    /**
-     * The weight for predicting the pos tag
-     * y = wx + b
-     * Generized by training method
-     * @var array
-     */
-    public $w;
-    /**
-     * The bias for predicting the pos tag
-     * y = wx + b
-     * Generized by training method
-     * @var array
-     */
-    public $b;
-    /**
-     * range of w
-     */
-    private $min_w;
-    private $max_w;
-    /**
-     * All Possiable tag set
-     * Generized by training method
-     * @var associative array [tag => tag index]
-     */
-    private $tag_set;
-    /**
-     * The unknown words should be picked from these tags
-     */
-    private $unknown_word_possiable_tags=[];
-    /**
-     * Check if all the chars in the term is not current language
-     * @param $term is a string that to be checked
-     * @return true if all the chars in $term is not current language
-     *         false otherwise
-     */
-    public function notCurrentLang($term)
-    {
-        return preg_match("/^[^\p{Han}]+$/u", $term);
-    }
-    /**
-     * The constructer of the pos tagger
-     * To extend to other languages, some work are needed:
-     * Define $this->getKeyImpl, $this->rule_defined_key
-     * See Chinese example.
-     * @param @string $lang describes current langauge
-     * @param @book $packed describes how weight and bias would look like
-     */
-    public function __construct($lang, $packed = true)
-    {
-        //$this->packed = $packed;
-        switch($lang) {
-            case("zh_CN"):
-            case("zh-CH"):
-                $this->lang = "zh_CN";
-                /*
-                 * Some Exception of Tags. Some tags are detemined by ruls.
-                 * e.x. There are infinity amount of Arabic numerals.
-                 */
-                $this->getKeyImpl = function($term) {
-                    $key = ZH\Tokenizer::POSGetKey($term);
-                    return $key ? $this->tag_set[$key] : $term;
-                };
-                //Tags from above
-                $this->rule_defined_key = ['PU','CD','OD','NT','FW'];
-                //Unknown word possiable tag
-                $this->unknown_word_possiable_tags=["NN","NR","VV","VA"];
-                break;
-            default:
-                $this->lang = $lang;
-        }
-    }
-    /**
-     * __call  for calling dynamic methods
-     * @param string $method method of this class to call
-     * @param array $args arguments to pass to method
-     * @return mixed result of method calculation
-     */
-    public function __call($method, $args)
-    {
-        return call_user_func_array($this->$method, $args);
-    }
-    /**
-     *  __get  for getting dynamic variables
-     * @param string $var_name variable to retrieve
-     * @return mixed result of retrieval
-     */
-    public function __get($var_name)
-    {
-        return $this->$var_name;
-    }
-    /**
-     *  __set  for assigning dynamic variables
-     * @param string $var_name variable to assign
-     * @param  mixed $value value to assign to it
-     */
-    public function __set($var_name, $value)
-    {
-        $this->$var_name = $value;
-    }
-    /**
-     * check if the term can be determined by algorithm,
-     * usually by regualr expression, because there are infinity
-     * amount of them.
-     * ex. 13th is an ordinal number, 123 is a cardinal number
-     * then use the determined tag to be the weight key
-     * @param @string $term is the term to be checked
-     * @return right key in feature matrix
-     */
-    public function getKey($term)
-    {
-        if (isset($this->getKeyImpl)) {
-            return $this->getKeyImpl($term);
-        }
-        return $term;
-    }
-
-    /**
-     * A function that process the trainning data
-     * @param @mixed $text_files can be a file or an array of file names
-     * @return @array of seperated sentences, each sentenfce have the format of
-     *               [[words...],[tags...]]
-     * Currently, the trainning data needs to fit CTB format:
-     * term followed by a underscore and followed by the tag
-     * e.g. "新_VA 的_DEC 南斯拉夫_NR 会国_NN"
-     * To adapt to other language, some modifications are needed
-     */
-    public static function processTexts($text_files, $term_tag_splier="_",
-        $term_process = null, $tag_process = null)
-    {
-        $ret=[];
-        foreach($text_files as $text_file) {
-            if (file_exists($text_file)) {
-                $fn = fopen($text_file,"r");
-                while(! feof($fn))  {
-                    $line = fgets($fn);
-                    if(strpos($line, '<') !== false) {
-                        continue;
-                    }
-                    $word_tag_pairs = preg_split("/[\s ]+/u", $line);
-                    if (!count($word_tag_pairs)) {
-                        continue;
-                    }
-                    $ret[]=[];
-                    $ret[count($ret)-1][0]=[];
-                    $ret[count($ret)-1][1]=[];
-                    foreach ($word_tag_pairs as $word_tag_pair) {
-                        $t = explode($term_tag_splier, $word_tag_pair);
-
-                        if (count($t) == 2) {
-                            $ret[count($ret)-1][0][] =
-                                $term_process ? $term_process($t[0]) : $t[0];
-                            $ret[count($ret)-1][1][] =
-                                $tag_process ? $tag_process($t[1]) : $t[1];
-                        }
-                    }
-                }
-                fclose($fn);
-            }
-        }
-        return $ret;
-    }
-    /**
-     * Function to train a data
-     * Notice: This function might run very long time, depending on training set
-     * @param @mixed $text_files are training data
-     *  can be a file or an array of file names
-     * @param @float $learning_rate
-     * @param @int  $max_epoch 1200 might be a good one,
-     *  the weight will overfit if it's greater than this number
-     * @param @bool $resume if true, read the weight file and continue training
-     *   if false, start from beginning
-     */
-    public function train($text_files, $term_tag_splier="_", $learning_rate=0.1,
-        $max_epoch = 1200, $term_process = null, $tag_process = null,
-        $resume = false)
-    {
-        if (is_string($text_files)) {
-            $text_files = [$text_files];
-        }
-        echo "Reading files\n";
-        // term_tag_sentences[sentence#]=[[words...],[tags...]]
-        $term_tag_sentences = self::processTexts($text_files, $term_tag_splier,
-            $term_process, $tag_process);
-        if ($resume) {
-            echo "Loading weights... ";
-            $this->load_weight(true);
-            $tag_index = count($this->tag_set);
-            echo "ok\n";
-        } else {
-            $this->w=[];
-            $this->tag_set=[];
-            $tag_index = 0;
-            if (isset($this->rule_defined_key)) {
-                foreach($this->rule_defined_key as $k) {
-                    $this->tag_set[$k] = $tag_index++;
-                }
-            }
-            for ($i = -4; $i <= -1; $i++) {
-                $this->w[$i] = [];
-            }
-        }
-        foreach ($term_tag_sentences as $term_tag_pairs) {
-            $terms=$term_tag_pairs[0];
-            $tags=$term_tag_pairs[1];
-            for ($i = 0; $i < count($terms); $i++) {
-                if (!isset($this->tag_set[$tags[$i]])) {
-                    $this->tag_set[$tags[$i]] = $tag_index++;
-                }
-                $k = $this->getIndex($i,$terms);
-                if (!isset($this->w[$k])) {
-                    $this->w[$k] = [];
-                }
-            }
-        }
-        foreach (array_keys($this->w) as $key) {
-            for ($i=-2; $i<=2;$i++) {
-                if (!isset($this->w[$key][$i])) {
-                    $this->w[$key][$i] = [];
-                }
-                foreach($this->tag_set as $possiable_tag => $tag_index) {
-                    if (!isset($this->w[$key][$i][$tag_index])) {
-                        $this->w[$key][$i][$tag_index] = 0;
-                    }
-                }
-            }
-        }
-        foreach($this->tag_set as $possiable_tag => $tag_index) {
-            if (!isset($this->b[$tag_index])) {
-                $this->b[$tag_index] = 0;
-            }
-        }
-        echo "Training\n";
-        //train the weight
-        $cross_entropy_loss = 1;
-        $pre_cross_entropy_loss = 2;
-        for ($epoch = 0; $epoch < $max_epoch && $pre_cross_entropy_loss -
-            $cross_entropy_loss > 0.000001; $epoch++) {
-            $this->min_w=0;
-            $this->max_w=0;
-            $time = time();
-            $dy_dw = [];
-            $dy_dw_n = [];
-            $pre_cross_entropy_loss = $cross_entropy_loss;
-            $cross_entropy_loss = 0;
-            $cross_entropy_loss_n = 0;
-
-            $dy_db=[];
-            $dy_db_n=[];
-            for($i = 0; $i < count($this->tag_set); $i++) {
-                $dy_db[$i] = 0;
-                $dy_db_n[$i] = 0;
-            }
-            //for each sentence
-            foreach ($term_tag_sentences as $term_tag_pairs) {
-                $terms=$term_tag_pairs[0];
-                $tags=$term_tag_pairs[1];
-                for ($i = 0; $i < count($terms); $i++) {
-                    $k=[];
-                    for ($j=-2; $j<=2;$j++) {
-                        $k[$j]= $this->getIndex($i+$j,$terms);
-                    }
-                    foreach ($this->tag_set as $possiable_tag => $tag_index) {
-                        $equality = $possiable_tag == $tags[$i] ? 1 : 0;
-                        $sum=0;
-                        for ($j=-2; $j<=2;$j++) {
-                            $sum += $this->w[$k[$j]][$j][$tag_index];
-                        }
-                        $sum += $this->b[$tag_index];
-                        $sigmoid = 1 / (1 + exp(-1 * $sum));
-                        for ($j=-2; $j<=2;$j++) {
-                            if (!isset($dy_dw[$k[$j]])) {
-                                $dy_dw[$k[$j]] = [];
-                                $dy_dw_n[$k[$j]] = [];
-                            }
-                            if (!isset($dy_dw[$k[$j]][$j])) {
-                                $dy_dw[$k[$j]][$j] = [];
-                                $dy_dw_n[$k[$j]][$j] = [];
-                            }
-                            if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
-                                $dy_dw[$k[$j]][$j][$tag_index] = 0;
-                                $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
-                            }
-
-                            $dy_dw[$k[$j]][$j][$tag_index] +=
-                                ($sigmoid - $equality);
-                            $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
-
-                        }
-                        //dy_db
-                        $dy_db[$tag_index] += ($sigmoid - $equality);
-                        $dy_db_n[$tag_index] += 1;
-                        $cross_entropy_loss+=
-                            - $equality*log($sigmoid)
-                            - (1-$equality)*log(1-$sigmoid);
-                        $cross_entropy_loss_n++;
-                    }
-                }
-            }
-            $cross_entropy_loss /= $cross_entropy_loss_n;
-            $duration = time() - $time;
-            echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}" .
-                " Takes {$duration} seconds\n";
-            foreach ($dy_dw as $i =>$v1) {
-                foreach ($v1 as $j =>$v2) {
-                    foreach ($v2 as $k =>$v3) {
-                        $this->w[$i][$j][$k] -=
-                            $dy_dw[$i][$j][$k] /
-                            $dy_dw_n[$i][$j][$k] *
-                            $learning_rate;
-                        if ($this->w[$i][$j][$k] < $this->min_w) {
-                            $this->min_w = $this->w[$i][$j][$k];
-                        }
-                        if ($this->w[$i][$j][$k] > $this->max_w) {
-                            $this->max_w = $this->w[$i][$j][$k];
-                        }
-                    }
-                }
-            }
-            foreach ($dy_db as $k =>$v) {
-                $this->b[$k]-=
-                    $dy_db[$k] /
-                    $dy_db_n[$k] *
-                    $learning_rate;
-            }
-            if ($epoch % 10 == 9 ) {
-                $this->save_weight();
-            }
-        }
-        $this->save_weight();
-        return true;
-    }
-    /**
-     * The primary function to predit the tag
-     * @param mixed $sentence is an array of segmented words/terms
-     *     or a string with words/terms seperated by space
-     * @return @array of tags
-     */
-    public function predict($sentence)
-    {
-        if (!is_array($sentence)) {
-            if ($sentence == "") {
-                $terms=[];
-            } else {
-                $terms=preg_split("/[\s]+/",$sentence);
-            }
-        } else {
-            $terms=$sentence;
-        }
-        if (!count($terms)) {
-            return [];
-        }
-        if (!$this->w) {
-            $this->load_weight();
-        }
-        $ret = [];
-        for($i = 0; $i < count($terms); $i++) {
-            $term = $terms[$i];
-            $score =[];
-            $key=$this->getKey($term);
-            foreach($this->tag_set as $possiable_tag => $tag_index) {
-                $score[$possiable_tag]=0;
-                for ($j=-2; $j <=2; $j++) {
-                    $k=$this->getIndex($i+$j, $terms);
-                    if (isset($this->w[$k])) {
-                        $score[$possiable_tag] +=
-                                $this->getW($k,$j,$tag_index);
-                    } else if ($j==0&&!in_array($possiable_tag,
-                        $this->unknown_word_possiable_tags)) {
-                        $score[$possiable_tag] += $this->min_w;
-                    }
-                }
-
-                $score[$possiable_tag] += $this->getB($tag_index);
-
-                //$score[$possiable_tag]
-                //    += 1 / (1 + exp(-1 * $score[$possiable_tag]));
-            }
-            $ret[]=array_keys($score, max($score))[0];
-        }
-        return $ret;
-    }
-    /**
-     * Wrap function for predict
-     * @param $texts to be a @string of texts
-     * @param $return_string is a boolean to determing if the user
-     *   want it to out put to stdout or a return value
-     * @return @string if $return_string is true;
-     *   @boolean true otherwise
-     * e.g. 中国_NR 人民_NN 将_AD 满怀信心_VV
-     *   地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU
-     */
-    public function tag($texts, $return_string=false)
-    {
-        if ($return_string) {
-            $ret = "";
-        }
-        $sentences = preg_split('/\r\n|\r|\n/', $texts);
-        foreach($sentences as $sentence) {
-            $sentence=explode(" ",trim($sentence));
-            $term_pos = $this->predict($sentence);
-            for($i = 0; $i < count($term_pos); $i++) {
-                $term_pos[$i]=$sentence[$i]."_".$term_pos[$i];
-            }
-            $t = join(" ", $term_pos);
-            if ($return_string) {
-                $ret .= $t;
-            } else {
-                echo $t, "\n";
-            }
-        }
-        if ($return_string) {
-            return $ret;
-        } else {
-            return true;
-        }
-    }
-    /**
-     * A list of private helper functions
-     * Given a setence ($term), find the key at position $index
-     */
-    private function getIndex($index, $terms)
-    {
-        if ($index < 0) $k = $index - 2;
-        else if ($index >= count($terms)) {
-            $k = $index - count($terms) - 2;
-        }
-        else {
-            $k = $this->getKey($terms[$index]);
-        }
-        return $k;
-    }
-    /**
-     * Get the bias value for tag
-     */
-    private function getB($tag_index)
-    {
-        return unpack("f",$this->b,$tag_index*4)[1];
-    }
-    /**
-     * Set the bias value for tag
-     */
-    private function setB($tag_index, $value)
-    {
-        $this->b = substr_replace($this->b,pack("f",$value),$tag_index*4,4);
-    }
-    /**
-     * Get the weight value for term at postion for tag
-     */
-    private function getW($term, $position, $tag_index)
-    {
-        $t = unpack("S",$this->w[$term],
-            2*($position+2)*count($this->tag_set)+$tag_index*2)[1]
-            / 65535
-            * ($this->max_w-$this->min_w) + $this->min_w;;
-        return $t;
-    }
-    /**
-     * save the trained weight to disk
-     */
-    private function save_weight()
-    {
-        $out_file = C\LOCALE_DIR . "/{$this->lang}/resources/pos_weight.txt.gz";
-        $out = [];
-        $out["min_w"] = $this->min_w;
-        $out["max_w"] = $this->max_w;
-        $out["w"]=[];
-        foreach(array_keys($this->w) as $key) {
-            $out["w"][$key] = $this->pack_w($key);
-        }
-        $out["b"] = $this->pack_b();
-        $out["tag_set"] = $this->tag_set;
-        echo "Saving...";
-        file_put_contents($out_file,
-            gzencode(serialize($out),9));
-        echo " ok\n";
-    }
-    /**
-     * load the trained weight from disk
-     */
-    private function load_weight($trainning_load=false)
-    {
-        $dic_file = C\LOCALE_DIR .
-            "/{$this->lang}/resources/pos_weight.txt.gz";
-        if (!file_exists($dic_file)) {
-            echo "$dic_file does not exist!";
-            exit();
-        }
-        $f = unserialize(gzdecode(file_get_contents($dic_file))
-            ,['allowed_classes' => false]);
-        $this->w=$f["w"];
-        $this->b=$f["b"];
-        $this->min_w=$f["min_w"];
-        $this->max_w=$f["max_w"];
-        $this->tag_set=$f["tag_set"];
-        if ($trainning_load) {
-            foreach(array_keys($this->w) as $key) {
-                $this->w[$key] = $this->unpack_w($key);
-            }
-            $this->b = $this->unpack_b($this->b);
-        }
-    }
-    /**
-     * Pack the bias
-     */
-    private function pack_b()
-    {
-        return pack("f*", ...$this->b);
-    }
-    /**
-     * Unpack the bias
-     */
-    private function unpack_b()
-    {
-        return array_merge(unpack("f".strval(count($this->tag_set)),$this->b));
-    }
-    /**
-     * Pack the weight
-     */
-    private function pack_w($key)
-    {
-        $bin_str = "";
-        foreach($this->w[$key] as $i => $t) {
-            foreach($t as $u) {
-                $v = 65535 * ($u-$this->min_w) / ($this->max_w-$this->min_w);
-                $bin_str .= pack("S", intval($v));
-            }
-        }
-        return $bin_str;
-    }
-    /**
-     * Unpack the weight
-     */
-    private function unpack_w($key)
-    {
-        $tmp = [];
-        $size = count($this->tag_set);
-        for ($i = 0; $i < 5; $i++) {
-            $tmp[$i-2] = array_merge(unpack("S".strval($size),
-                $this->w[$key], 2*$i*count($this->tag_set)));
-            for($j = 0; $j < $size; $j++) {
-                $tmp[$i-2][$j] = $tmp[$i-2][$j] / 65535
-                    * ($this->max_w-$this->min_w) + $this->min_w;
-            }
-        }
-        return $tmp;
-    }
-}
diff --git a/src/library/CrawlDaemon.php b/src/library/CrawlDaemon.php
index efac71dfe..c7265ab7f 100644
--- a/src/library/CrawlDaemon.php
+++ b/src/library/CrawlDaemon.php
@@ -239,7 +239,7 @@ class CrawlDaemon implements CrawlConstants
      * Sends the message to stardard out if crawlLog not set up; otherwise,
      * sends to crawlLog()
      *
-     * @param string $masg string to log to either standard out or
+     * @param string $msg string to log to either standard out or
      *  to Yioop's crawlLog
      * @param int $exit_type the exit_type used by init() and start()
      *  values of absolute value >2 are only used if crawlLog has
diff --git a/src/library/DoubleIndexBundle.php b/src/library/DoubleIndexBundle.php
index f4cf5c69b..e71165ebd 100644
--- a/src/library/DoubleIndexBundle.php
+++ b/src/library/DoubleIndexBundle.php
@@ -210,7 +210,7 @@ class DoubleIndexBundle implements CrawlConstants
      *
      * @param int $generation field used to select partition
      * @param string $offset_field field used to record offsets after storing
-     * @param array& $pages data to store
+     * @param array &$pages data to store
      * @param int $visited_urls_count number to add to the count of visited urls
      *     (visited urls is a smaller number than the total count of objects
      *     stored in the index).
diff --git a/src/library/FeedArchiveBundle.php b/src/library/FeedArchiveBundle.php
index ac6df08fc..ef41f1674 100644
--- a/src/library/FeedArchiveBundle.php
+++ b/src/library/FeedArchiveBundle.php
@@ -37,18 +37,26 @@ use seekquarry\yioop\configs as C;
  */
 require_once __DIR__ . '/Utility.php';
 /**
+ * Subclass of IndexArchiveBundle with bloom filters to make it easy to check
+ * if a news feed item has been added to the bundle already before adding it
  *
  * @author Chris Pollett
  */
 class FeedArchiveBundle extends IndexArchiveBundle
 {
     /**
-     *
+     * Used to store unique identifiers of feed items that have been stored
+     * in this FeedArchiveBundle. This filter_a is used for checking if items
+     * are already in the archive. When it has URL_FILTER_SIZE/2 items,
+     * filter_b is added to as well as filter_a. When filter_a is of size
+     * URL_FILTER_SIZE, filter_a is deleted, filter_b is renamed to filter_a,
+     * and the process is repeated.
      * @var BloomFilterFile
      */
     public $filter_a;
     /**
-     *
+     * Auxiliary BloomFilterFile used in checking if feed items are in this
+     * archive or not. @see $filter_a
      * @var BloomFilterFile
      */
     public $filter_b;
@@ -96,7 +104,7 @@ class FeedArchiveBundle extends IndexArchiveBundle
      * @param string $offset_field field used to record offsets after storing
      * @param string $key_field field used to store unique identifier for a
      *      each page item.
-     * @param array& $pages data to store
+     * @param array &$pages data to store
      * @param int $visited_urls_count number to add to the count of visited urls
      *     (visited urls is a smaller number than the total count of objects
      *     stored in the index).
@@ -112,7 +120,12 @@ class FeedArchiveBundle extends IndexArchiveBundle
             $visited_urls_count);
     }
     /**
-     *
+     * Adds the key (often GUID) of a feed item to the bloom filter pair
+     * associated with this archive. This always adds to filter a; if
+     * filter a is more than half full it also adds to filter b. If filter a
+     * is full, it is deleted and filter b is renamed filter a, and the process
+     * continues where a new filter b is created when this becomes half full.
+     * @param string $key unique identifier of a feed item
      */
     public function addFilters($key)
     {
@@ -138,7 +151,10 @@ class FeedArchiveBundle extends IndexArchiveBundle
         }
     }
     /**
-     *
+     * Whether the active filter for this feed contains the feed item
+     * of the supplied key
+     * @param string $key the feed item id to check if in archive
+     * @return bool true if it is in the archive, false otherwise
      */
     public function contains($key)
     {
diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php
index 449eebc5a..42074ec88 100755
--- a/src/library/FetchUrl.php
+++ b/src/library/FetchUrl.php
@@ -582,7 +582,7 @@ class FetchUrl implements CrawlConstants
      * Computes a hash of a string containing page data for use in
      * deduplication of pages with similar content
      *
-     * @param string& $page reference to web page data
+     * @param string &$page reference to web page data
      * @return string 8 byte hash to identify page contents
      */
     public static function computePageHash(&$page)
diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php
index cef420567..693bbd1d9 100644
--- a/src/library/IndexArchiveBundle.php
+++ b/src/library/IndexArchiveBundle.php
@@ -176,7 +176,7 @@ class IndexArchiveBundle implements CrawlConstants
      *
      * @param int $generation field used to select partition
      * @param string $offset_field field used to record offsets after storing
-     * @param array& $pages data to store
+     * @param array &$pages data to store
      * @param int $visited_urls_count number to add to the count of visited urls
      *     (visited urls is a smaller number than the total count of objects
      *     stored in the index).
diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php
index 45dcbb435..3c7da6a92 100644
--- a/src/library/IndexDictionary.php
+++ b/src/library/IndexDictionary.php
@@ -1110,15 +1110,15 @@ class IndexDictionary implements CrawlConstants
      * @param int $file_num which prefix file to read from (always reads
      *     a file at the max_tier level)
      * @param int $num_aux_records
-     * @param int& $total_count
+     * @param int &$total_count
      * @param int $threshold
-     * @param array& $info
-     * @param int& $previous_generation
-     * @param int& $num_generations
+     * @param array &$info
+     * @param int &$previous_generation
+     * @param int &$num_generations
      * @param int $offset
      * @param int $num_distinct_generations
-     * @param int& $max_retained_generation
-     * @param array& $id_info
+     * @param int &$max_retained_generation
+     * @param array &$id_info
      */
     public function addAuxInfoRecords($id, $file_num, $num_aux_records,
         &$total_count, $threshold, &$info, &$previous_generation,
@@ -1173,7 +1173,7 @@ class IndexDictionary implements CrawlConstants
      * $max_retained_generation, $info) and filters blank entries from
      * $info and returns the resulting triple
      *
-     * @param int& $total_count
+     * @param int &$total_count
      * @param int $max_retained_generation
      * @param array $info
      * @return array resulting triple
@@ -1197,14 +1197,14 @@ class IndexDictionary implements CrawlConstants
      *     the quadruple array for
      * @param array $record current record from dictionary that we may or may
      *     not add to info
-     * @param array& $info quadruple array we are adding to
-     * @param int& $total_count count of items in $info
-     * @param int& $previous_generation last generation added to $info
-     * @param int& $previous_id last exact if added to $info
-     * @param int& $num_generations
+     * @param array &$info quadruple array we are adding to
+     * @param int &$total_count count of items in $info
+     * @param int &$previous_generation last generation added to $info
+     * @param int &$previous_id last exact if added to $info
+     * @param int &$num_generations
      * @param int $num_distinct_generations
-     * @param int& $max_retained_generation
-     * @param array& $id_info
+     * @param int &$max_retained_generation
+     * @param array &$id_info
      */
     public function addLookedUpEntry($id, $word_id, $record,
         &$info, &$total_count, &$previous_generation, &$previous_id,
@@ -1285,7 +1285,7 @@ class IndexDictionary implements CrawlConstants
      * @param int $file_num which dictionary file (given by first letter prefix)
      *     to read from
      * @param int $bytes byte offset to start reading from
-     * @return &string data fromIndexShard file
+     * @return string &data from IndexShard file
      */
     public function &readBlockDictAtOffset($file_num, $bytes)
     {
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index c841c1871..23e33d2a2 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -245,18 +245,12 @@ class IndexManager implements CrawlConstants
     }
     /**
      * Returns the number of document that a given term or phrase appears in
-     * in the given index
+     * in the given index, where later generations -- those with lower
+     * document rank -- are discounted more
      *
-     * @param string $term_or_phrase what to look up in the indexes dictionary
+     * @param string $term what to look up in the indexes dictionary
      *     no  mask is used for this look up
      * @param string $index_name index to look up term or phrase in
-     * @param int $threshold if set and positive then once threshold many
-     *     documents are found the search for more documents to add to the
-     *     total is stopped
-     * @param int $start_generation what generation in the index to start
-     *      finding occurrence of phrase from
-     * @param int $num_distinct_generations from $start_generation how
-     *      many generation to search forward to
      * @return int number of documents
      */
     public static function discountedNumDocsTerm($term, $index_name)
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index 6c41e8d29..26f1f957e 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -31,12 +31,10 @@
 namespace seekquarry\yioop\library;

 use seekquarry\yioop\configs as C;
-
 /**
  * Load charCopy
  */
 require_once __DIR__ . "/Utility.php";
-
 /**
  * Data structure used to store one generation worth of the word document
  * index (inverted index). This data structure consists of three main
@@ -634,7 +632,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      *
      * @param int $start_offset of the current posting list for query term
      *     used in calculating BM25F.
-     * @param int& $next_offset where to start in word docs
+     * @param int &$next_offset where to start in word docs
      * @param int $last_offset offset at which to stop by
      * @param int $len number of documents desired
      * @param int $direction which direction to iterate through elements
@@ -997,7 +995,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * Computes BM25F relevance and a score for the supplied item based
      * on the supplied parameters.
      *
-     * @param array& $item doc summary to compute a relevance and score for.
+     * @param array &$item doc summary to compute a relevance and score for.
      *     Pass-by-ref so self::RELEVANCE and self::SCORE fields can be changed
      * @param int $occurrences - number of occurences of the term in the item
      * @param int $doc_len number of words in doc item represents
@@ -1036,9 +1034,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      *
      * @param int $current an index into the word_docs strings
      *     corresponds to a start search loc of $current * self::POSTING_LEN
-     * @param int& $posting_start after function call will be
+     * @param int &$posting_start after function call will be
      *     index of start of nearest posting to current
-     * @param int& $posting_end after function call will be
+     * @param int &$posting_end after function call will be
      *     index of end of nearest posting to current
      * @return string the substring of word_docs corresponding to the posting
      */
@@ -1194,7 +1192,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * position $current forward until either $end is reached or a
      * posting with document index bigger than $doc_index is found
      *
-     * @param int& $current current posting offset into posting list
+     * @param int &$current current posting offset into posting list
      * @param int $doc_index document index want bigger than or equal to
      * @param int $end last index of posting list
      * @return int document index bigger than or equal to $doc_index. Since
@@ -2027,7 +2025,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * @param int $bytes byte offset to start reading from
      * @param bool $cache whether to cache disk blocks that have been read to
      *     RAM
-     * @return &string data fromIndexShard file
+     * @return mixed data from the IndexShard file if found, false otherwise
      */
     public function readBlockShardAtOffset($bytes, $cache = true)
     {
@@ -2141,9 +2139,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * Load an IndexShard from a file or string
      *
      * @param string $fname the name of the file to the IndexShard from/to
-     * @param string& $data stringified shard data to load shard from. If null
+     * @param string &$data stringified shard data to load shard from. If null
      *     then the data is loaded from the $fname if possible
-     * @return object the IndexShard loaded
+     * @return IndexShard the IndexShard loaded
      */
     public static function load($fname, &$data = null)
     {
@@ -2216,7 +2214,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      * Callback function for load method. splits a word_key . word_info string
      * into an entry in the passed shard $shard->words[word_key] = $word_info.
      *
-     * @param string& $value  the word_key . word_info string
+     * @param string &$value  the word_key . word_info string
      * @param int $key index in array - we don't use
      * @param object $shard IndexShard to add the entry to word table for
      */
diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php
index b514c7c71..5450dda77 100755
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@@ -22,9 +22,6 @@
  *
  * END LICENSE
  *
- * This file contains global functions connected to localization that
- * are used throughout the web site part of Yioop!
- *
  * @author Chris Pollett chris@pollett.org
  * @license https://www.gnu.org/licenses/ GPL3
  * @link https://www.seekquarry.com/
@@ -33,10 +30,15 @@
  */
 namespace seekquarry\yioop\library;

+/**
+ * This file contains global functions connected to localization that
+ * are used throughout the web site part of Yioop!
+ */
 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\models\LocaleModel;
-
-/** For Yioop global defines */
+/**
+ * For Yioop global defines
+ */
 require_once __DIR__."/../configs/Config.php";
 /**
  * Returns an array of locales that have a stop words list and a stop words
@@ -50,7 +52,9 @@ function localesWithStopwordsList()
         'vi-VN', 'zh-CN'];
 }
 /**
- *
+ * Converts a $locale_tag (major-minor) to an ISO 639-2 language tag
+ * @param string $locale_tag locale tag one wants to convert
+ * @return string corresponding ISO 639-2 language tag
  */
 function localeTagToIso639_2Tag($locale_tag)
 {
@@ -277,7 +281,7 @@ function guessEncodingHtmlXml($html, $return_loc_info = false)
  * Converts page data in a site associative array to UTF-8 if it is not
  * already in UTF-8
  *
- * @param array& $site an associative of info about a web site
+ * @param array &$site an associative of info about a web site
  * @param string $page_field the field in the associative array that
  *  contains the $site's web page as a string.
  * @param string $encoding_field the  field in the associative array that
diff --git a/src/library/MailServer.php b/src/library/MailServer.php
index 9d8f45c37..d71308155 100644
--- a/src/library/MailServer.php
+++ b/src/library/MailServer.php
@@ -220,7 +220,9 @@ class MailServer implements MediaConstants
         $data = "";
         while($line = fgets($this->connection)) {
             $data .= $line;
-            if ($line[self::SMTP_CODE_LEN] == ' ') { break; }
+            if ($line[self::SMTP_CODE_LEN] == ' ') {
+                break;
+            }
         }
         $this->messages .= $data;
         return substr($data, 0, self::SMTP_CODE_LEN);
diff --git a/src/library/NamedEntityContextTagger.php b/src/library/NamedEntityContextTagger.php
new file mode 100644
index 000000000..94a2cba73
--- /dev/null
+++ b/src/library/NamedEntityContextTagger.php
@@ -0,0 +1,360 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * @author Xianghong Sun sxh19911230@gmail.com
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2019
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * Machine learning based named entity recognizer.
+ * NamedEntityContextTagger is used by @see StochasticTermSegmenter
+ * to help in segmenting sentences in which no term separators such as spaces
+ * are used.
+ *
+ * @author Xianghong Sun (Principal),
+ *  Chris Pollett (mainly simplifications, and documentation)
+ */
+class NamedEntityContextTagger extends ContextTagger
+{
+    /**
+     * Constructor for the NamedEntityContextTagger.
+     * Sets the name of the tagger file (nect_weights.txt.gz) and then calls
+     * the parent ContextTagger constructor with the language
+     * @param string $lang locale tag of the language this tagger is for
+     */
+    public function __construct($lang)
+    {
+        $this->tagger_file = "nect_weights.txt.gz";
+        parent::__construct($lang);
+    }
+    /**
+     * Uses text files containing tagged sentences to learn weights so that
+     * from a term, the two terms before it and the two terms after it,
+     * together with the tags of the two terms before it, the odds that a
+     * named entity has been found can be calculated
+     *
+     * @param mixed $text_files with training data. These can be a file or
+     *  an array of file names.
+     * @param string $term_tag_separator separator used to separate term and tag
+     *  for terms in input sentence
+     * @param float $learning_rate learning rate used while cycling over the
+     *  data to minimize the cross-entropy loss of the predicted tags
+     * @param int $num_epoch maximum number of times to cycle through the
+     *  complete data set; cycling stops sooner once an epoch fails to
+     *  lower the cross-entropy loss by more than 0.000001.
+     *  Default value of 1200 seems to avoid overfitting
+     * @param function $term_callback callback function passed to processTexts()
+     *  to be applied to a term before it is added to a sentence's term array
+     * @param function $tag_callback callback function passed to processTexts()
+     *  to be applied to a tag before it is added to a sentence's tag array
+     * @param bool $resume currently not read anywhere in this method
+     */
+    public function train($text_files, $term_tag_separator = "-",
+        $learning_rate = 0.1, $num_epoch = 1200, $term_callback = null,
+        $tag_callback = null, $resume = false)
+    {
+        if (is_string($text_files)) {
+            $text_files = [$text_files];
+        }
+        echo "Reading files\n";
+        // term_tag_sentences[sentence#]=[[words...],[tags...]]
+        $term_tag_sentences = self::processTexts($text_files,
+            $term_tag_separator, $term_callback, $tag_callback);
+        $this->word_feature = [];
+        $this->tag_set = [];
+        $tag_index = 0;
+        for ($i = -4; $i <= -1; $i++) {
+            $this->word_feature[$i] = [];
+        }
+        foreach ($term_tag_sentences as $term_tag_pairs) {
+            $terms = $term_tag_pairs[0];
+            $tags = $term_tag_pairs[1];
+            $this->tag_feature["start"] = [];
+            $this->tag_feature["start-start"] = [];
+            for ($i = 0; $i < count($terms); $i++) {
+                if (!isset($this->tag_set[$tags[$i]])) {
+                    $this->tag_set[$tags[$i]] = $tag_index++;
+                }
+                if ($i == 0) {}
+                else if ($i == 1) {
+                    if (!isset($this->tag_feature["start-" . $tags[$i-1]])) {
+                        $this->tag_feature["start-".$tags[$i - 1]] = [];
+                    }
+                    if (!isset($this->tag_feature[$tags[$i - 1]])) {
+                        $this->tag_feature[$tags[$i - 1]] = [];
+                    }
+                } else {
+                    if (!isset($this->tag_feature[$tags[$i - 2] . "-" .
+                        $tags[$i - 1]])) {
+                        $this->tag_feature[$tags[$i - 2] . "-" .
+                            $tags[$i - 1]] = [];
+                    }
+                    if (!isset($this->tag_feature[$tags[$i - 1]])) {
+                        $this->tag_feature[$tags[$i - 1]] = [];
+                    }
+                }
+                if (!isset($this->word_feature[$terms[$i]])) {
+                    $this->word_feature[$terms[$i]] = [];
+                }
+            }
+        }
+        foreach (array_keys($this->word_feature) as $key) {
+            for ($i = -2; $i <= 2 ;$i++) {
+                if (!isset($this->word_feature[$key][$i])) {
+                    $this->word_feature[$key][$i] = [];
+                }
+                foreach($this->tag_set as $possible_tag => $tag_index) {
+                    if (!isset($this->word_feature[$key][$i][$tag_index])) {
+                        $this->word_feature[$key][$i][$tag_index] = 0;
+                    }
+                }
+            }
+        }
+        foreach (array_keys($this->tag_feature) as $key) {
+            foreach($this->tag_set as $possible_tag => $tag_index) {
+                if (!isset($this->tag_feature[$key][$tag_index])) {
+                    $this->tag_feature[$key][$tag_index] = 0;
+                }
+            }
+        }
+        foreach($this->tag_set as $possible_tag => $tag_index) {
+            if (!isset($this->bias[$tag_index])) {
+                $this->bias[$tag_index] = 0;
+            }
+        }
+        echo "Training...\n";
+        //train the weight
+        $cross_entropy_loss = 1;
+        $pre_cross_entropy_loss = 2;
+        for ($epoch = 0; ($epoch < $num_epoch) &&
+            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001;
+            $epoch++) {
+            $this->min_w = 0;
+            $this->max_w = 0;
+            $time = time();
+            $dy_dw = [];
+            $dy_dw_n = [];
+            $pre_cross_entropy_loss = $cross_entropy_loss;
+            $cross_entropy_loss = 0;
+            $cross_entropy_loss_n = 0;
+            $dy_db = [];
+            $dy_db_n = [];
+            $dy_dt = [];
+            $dy_dt_n = [];
+            for($i = 0; $i < count($this->tag_set); $i++) {
+                $dy_db[$i] = 0;
+                $dy_db_n[$i] = 0;
+            }
+            //for each sentence
+            foreach ($term_tag_sentences as $term_tag_pairs) {
+                $terms=$term_tag_pairs[0];
+                $tags=$term_tag_pairs[1];
+                for ($i = 0; $i < count($terms); $i++) {
+                    $k=[];
+                    for ($j=-2; $j <= 2;$j++) {
+                        $k[$j]= $this->getIndex($i + $j,$terms);
+                    }
+                    foreach ($this->tag_set as $possible_tag => $tag_index) {
+                        $equality = $possible_tag == $tags[$i] ? 1 : 0;
+                        $sum=0;
+                        //5 words including itself
+                        for ($j=-2; $j <= 2; $j++) {
+                            $sum += $this->word_feature[$k[$j]][$j][$tag_index];
+                        }
+                        //previous 2 tags
+                        if ($i == 0) {
+                            $tf1 = "start";
+                            $tf2 = "start-start";
+                        } else if ($i == 1) {
+                            $tf1 = $tags[$i - 1];
+                            $tf2 = "start-" . $tags[$i-1];
+                        } else {
+                            $tf1 = $tags[$i - 1];
+                            $tf2 = $tags[$i - 2] . "-" . $tags[$i - 1];
+                        }
+                        $sum += $this->tag_feature[$tf1][$tag_index];
+                        $sum += $this->tag_feature[$tf2][$tag_index];
+                        //bias
+                        $sum += $this->bias[$tag_index];
+                        $sigmoid = 1 / (1 + exp(-1 * $sum));
+                        for ($j=-2; $j<=2;$j++) {
+                            if (!isset($dy_dw[$k[$j]])) {
+                                $dy_dw[$k[$j]] = [];
+                                $dy_dw_n[$k[$j]] = [];
+                            }
+                            if (!isset($dy_dw[$k[$j]][$j])) {
+                                $dy_dw[$k[$j]][$j] = [];
+                                $dy_dw_n[$k[$j]][$j] = [];
+                            }
+                            if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
+                                $dy_dw[$k[$j]][$j][$tag_index] = 0;
+                                $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
+                            }
+                            $dy_dw[$k[$j]][$j][$tag_index] +=
+                                ($sigmoid - $equality);
+                            $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
+                        }
+                        //dy_dt
+                        if (!isset($dy_dt[$tf1])) {
+                            $dy_dt[$tf1] = [];
+                            $dy_dt_n[$tf1] = [];
+                        }
+                        if (!isset($dy_dt[$tf1][$tag_index])) {
+                            $dy_dt[$tf1][$tag_index] = 0;
+                            $dy_dt_n[$tf1][$tag_index] = 0;
+                        }
+                        if (!isset($dy_dt[$tf2])) {
+                            $dy_dt[$tf2] = [];
+                            $dy_dt_n[$tf2] = [];
+                        }
+                        if (!isset($dy_dt[$tf2][$tag_index])) {
+                            $dy_dt[$tf2][$tag_index] = 0;
+                            $dy_dt_n[$tf2][$tag_index] = 0;
+                        }
+                        $dy_dt[$tf1][$tag_index] += ($sigmoid - $equality);
+                        $dy_dt_n[$tf1][$tag_index] += 1;
+                        $dy_dt[$tf2][$tag_index] += ($sigmoid - $equality);
+                        $dy_dt_n[$tf2][$tag_index] += 1;
+                        //dy_db
+                        $dy_db[$tag_index] += ($sigmoid - $equality);
+                        $dy_db_n[$tag_index] += 1;
+                        $cross_entropy_loss -= ($equality * log($sigmoid)
+                            + (1 - $equality) * log(1 - $sigmoid));
+                        $cross_entropy_loss_n++;
+                    }
+                }
+            }
+            $cross_entropy_loss /= $cross_entropy_loss_n;
+            $duration = time() - $time;
+            echo "Epoch {$epoch} cross_entropy {$cross_entropy_loss}".
+                " took {$duration} seconds\n";
+            foreach ($dy_dw as $i => $v1) {
+                foreach ($v1 as $j => $v2) {
+                    foreach ($v2 as $k => $v3) {
+                        $this->word_feature[$i][$j][$k] -= $dy_dw[$i][$j][$k] /
+                            $dy_dw_n[$i][$j][$k] * $learning_rate;
+                        if ($this->word_feature[$i][$j][$k] < $this->min_w) {
+                            $this->min_w = $this->word_feature[$i][$j][$k];
+                        }
+                        if ($this->word_feature[$i][$j][$k] > $this->max_w) {
+                            $this->max_w = $this->word_feature[$i][$j][$k];
+                        }
+                    }
+                }
+            }
+            foreach ($dy_dt as $i => $v1) {
+                foreach ($v1 as $j => $v2) {
+                    $this->tag_feature[$i][$j] -= $dy_dt[$i][$j] /
+                        $dy_dt_n[$i][$j] * $learning_rate;
+                }
+            }
+            foreach ($dy_db as $k => $v) {
+                $this->bias[$k] -= $dy_db[$k] / $dy_db_n[$k] * $learning_rate;
+            }
+            if ($epoch % 10 == 9) {
+                $this->saveWeights();
+            }
+        }
+        $this->saveWeights();
+    }
+    /**
+     * Predicts named entities that exist in a sentence. Runs of consecutive
+     * terms with the same predicted non-"o" tag form one entity; entities of
+     * 10 or more characters are dropped.
+     * @param mixed $sentence is an array of segmented words/terms
+     *  or a string that will be split on white space
+     * @return array of [entity, tag] pairs, tag indicating kind of entity
+     *  ex. [["郑振铎","nr"],["国民党","nt"]]
+     */
+    public function predict($sentence)
+    {
+        if (!is_array($sentence)) {
+            if ($sentence == "") {
+                $terms = [];
+            } else {
+                $terms = preg_split("/[\s]+/u", $sentence);
+            }
+        } else {
+            $terms = $sentence;
+        }
+        if (!count($terms)) {
+            return [];
+        }
+        if (!$this->word_feature) {
+            $this->loadWeights();
+        }
+        $result = [];
+        for($i = 0; $i < count($terms); $i++) {
+            // previous tag features only depend on earlier predictions
+            if ($i == 0) {
+                $tf1 = "start";
+                $tf2 = "start-start";
+            } else if ($i == 1) {
+                $tf1 = $result[$i - 1];
+                $tf2 = "start-" . $result[$i - 1];
+            } else {
+                $tf1 = $result[$i - 1];
+                $tf2 = $result[$i - 2] . "-" . $result[$i-1];
+            }
+            $score = [];
+            foreach($this->tag_set as $possible_tag => $tag_index) {
+                $score[$possible_tag] = 0;
+                for ($j = -2; $j <= 2; $j++) {
+                    $k = $this->getIndex($i + $j, $terms);
+                    if (isset($this->word_feature[$k])) {
+                        $score[$possible_tag] +=
+                            $this->getW($k, $j, $tag_index);
+                    }
+                }
+                $score[$possible_tag] += $this->getT($tf1, $tag_index);
+                $score[$possible_tag] += $this->getT($tf2, $tag_index);
+                $score[$possible_tag] += $this->getB($tag_index);
+            }
+            $result[] = array_keys($score, max($score))[0];
+        }
+        // sentinel "o" so an entity ending on the last term gets flushed
+        $result[] = "o";
+        $pre_tag = "o";
+        $current_entity = "";
+        $ret = [];
+        for ($i = 0; $i < count($result); $i++) {
+            if ($pre_tag != $result[$i] && $pre_tag != "o") {
+                if (mb_strlen($current_entity) < 10) {
+                    $ret[] = [$current_entity, $pre_tag];
+                }
+                $current_entity = "";
+            }
+            if ($result[$i] != "o") {
+                // always append; a truthiness test loses an entity "0"
+                $current_entity .= $terms[$i];
+            }
+            $pre_tag = $result[$i];
+        }
+        return $ret;
+    }
+}
diff --git a/src/library/PageRuleParser.php b/src/library/PageRuleParser.php
index 8b71a2538..70c3073af 100644
--- a/src/library/PageRuleParser.php
+++ b/src/library/PageRuleParser.php
@@ -201,7 +201,7 @@ class PageRuleParser implements CrawlConstants
      * Executes either the internal $rule_trees or the passed $rule_trees
      * on the provided $page_data associative array
      *
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record (will be changed by this operation)
      * @param array $rule_trees an array of annotated syntax trees to
      *     for rules used to update $page_data
@@ -223,7 +223,7 @@ class PageRuleParser implements CrawlConstants
      * Used to execute a single command rule on $page_data
      *
      * @param array $tree annotated syntax tree of a function call rule
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record (will be changed by this operation)
      */
     public function executeFunctionRule($tree, &$page_data)
@@ -250,7 +250,7 @@ class PageRuleParser implements CrawlConstants
      * Used to execute a single assignment rule on $page_data
      *
      * @param array $tree annotated syntax tree of an assignment rule
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record (will be changed by this operation)
      */
     public function executeAssignmentRule($tree, &$page_data)
@@ -304,7 +304,7 @@ class PageRuleParser implements CrawlConstants
      * of meta words for this page
      *
      * @param $field the key in $page_data to use
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function addMetaWord($field, &$page_data)
@@ -329,7 +329,7 @@ class PageRuleParser implements CrawlConstants
      * which when clicked would perform a Yioop search on madonna.
      *
      * @param $field the key in $page_data to use
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function addKeywordLink($field, &$page_data)
@@ -348,7 +348,7 @@ class PageRuleParser implements CrawlConstants
      * Set field variable to be used as a stack
      *
      * @param $field what field variable to use for current stack
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function setStack($field, &$page_data)
@@ -367,7 +367,7 @@ class PageRuleParser implements CrawlConstants
      * stack
      *
      * @param $field what field  to get data to push onto fcurrent stack
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function pushStack($field, &$page_data)
@@ -390,7 +390,7 @@ class PageRuleParser implements CrawlConstants
      * stack
      *
      * @param $field what field  to get data to push onto fcurrent stack
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function popStack($field, &$page_data)
@@ -406,7 +406,7 @@ class PageRuleParser implements CrawlConstants
      *
      * @param $dir output directory in which to write data.txt files containing
      *     the contents of some fields after writeOutput commands
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function setOutputFolder($dir, &$page_data)
@@ -417,7 +417,7 @@ class PageRuleParser implements CrawlConstants
      * Set output format
      *
      * @param $format can be either csv or sql
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function setOutputFormat($format, &$page_data)
@@ -430,7 +430,7 @@ class PageRuleParser implements CrawlConstants
      * Set output table
      *
      * @param $table table to use if output format is sql
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function setOutputTable($table, &$page_data)
@@ -444,7 +444,7 @@ class PageRuleParser implements CrawlConstants
      *
      *
      * @param $field the key in $page_data to use
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function toArray($field, &$page_data)
@@ -463,7 +463,7 @@ class PageRuleParser implements CrawlConstants
      * and stores the result back into $page_data[$field]
      *
      * @param $field the key in $page_data to use
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function toString($field, &$page_data)
@@ -479,7 +479,7 @@ class PageRuleParser implements CrawlConstants
      * it just sets it to the empty string
      *
      * @param $field the key in $page_data to use
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function unsetVariable($field, &$page_data)
@@ -496,7 +496,7 @@ class PageRuleParser implements CrawlConstants
      * format. If the field is not set nothing is written
      *
      * @param $field the key in $page_data to use
-     * @param array& $page_data an associative array of containing summary
+     * @param array &$page_data an associative array of containing summary
      *     info of a web page/record
      */
     public function writeOutput($field, &$page_data)
diff --git a/src/library/PartOfSpeechContextTagger.php b/src/library/PartOfSpeechContextTagger.php
new file mode 100644
index 000000000..10fa5eec9
--- /dev/null
+++ b/src/library/PartOfSpeechContextTagger.php
@@ -0,0 +1,284 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2020  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * @author Xianghong Sun sxh19911230@gmail.com
+ * @license https://www.gnu.org/licenses/ GPL3
+ * @link https://www.seekquarry.com/
+ * @copyright 2009 - 2019
+ * @filesource
+ */
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+
+/**
+ * Machine learning based Part of Speech tagger.
+ * A PartOfSpeechContextTagger can be used to train a tagger for a language
+ * according to some dataset. Once training is complete it can be used to
+ * predict the tags for terms in a string or array of terms.
+ *
+ * @author Xianghong Sun (Principal),
+ *  Chris Pollett (mainly simplifications, and documentation)
+ */
+class PartOfSpeechContextTagger extends ContextTagger
+{
+    /**
+     * Constructor for the part of speech tagger.
+     * Records the file name used for this tagger's weights and then lets
+     * the parent ContextTagger constructor finish set up for $lang
+     * @param string $lang locale tag of the language this tagger is for
+     */
+    public function __construct($lang)
+    {
+        $this->tagger_file = "pos_weights.txt.gz";
+        parent::__construct($lang);
+    }
+    /**
+     * Uses text files containing tagged sentences to learn weights so that
+     * from a term and its context (the two terms before it and the two
+     * terms after it), a score for each of its possible parts of speech
+     * can be calculated
+     *
+     * @param mixed $text_files with training data. These can be a file or
+     *  an array of file names. For now these files are assumed to be in
+     *  Chinese Treebank format.
+     * @param string $term_tag_separator separator used to separate term and tag
+     *  for terms in input sentence
+     * @param float $learning_rate learning rate when cycling over data trying
+     *  to minimize the cross-entropy loss in the prediction of the tag of the
+     *  middle term.
+     * @param int $num_epoch maximum number of times to cycle through the
+     *  complete data set. Default value of 1200 seems to avoid overfitting
+     * @param function $term_callback callback function applied to a term
+     *  before adding term to sentence term array as part of processing and
+     *  training with a sentence.
+     * @param function $tag_callback callback function applied to a part of
+     *  speech tag before adding tag to sentence tag array as part of
+     *  processing and training with a sentence.
+     * @param bool $resume if true, read the weight file and continue training
+     *   if false, start from beginning
+     */
+    public function train($text_files, $term_tag_separator = "-",
+        $learning_rate = 0.1, $num_epoch = 1200, $term_callback = null,
+        $tag_callback = null, $resume = false)
+    {
+        if (is_string($text_files)) {
+            $text_files = [$text_files];
+        }
+        echo "Reading files\n";
+        // term_tag_sentences[sentence#] = [[words...], [tags...]]
+        $term_tag_sentences = self::processTexts($text_files,
+            $term_tag_separator, $term_callback, $tag_callback);
+        if ($resume) {
+            echo "Loading weights... ";
+            $this->loadWeights(true);
+            $tag_index = count($this->tag_set);
+            echo "ok\n";
+        } else {
+            $this->word_feature = [];
+            $this->tag_set = [];
+            $tag_index = 0;
+            if (!empty($this->tokenizer) && method_exists($this->tokenizer,
+                "getPosKeyList")) {
+                $pos_key_list = $this->tokenizer::getPosKeyList();
+                foreach($pos_key_list as $k) {
+                    $this->tag_set[$k] = $tag_index++;
+                }
+            }
+            for ($i = -4; $i <= -1; $i++) {
+                $this->word_feature[$i] = [];
+            }
+        }
+        foreach ($term_tag_sentences as $term_tag_pairs) {
+            $terms = $term_tag_pairs[0];
+            $tags = $term_tag_pairs[1];
+            for ($i = 0; $i < count($terms); $i++) {
+                if (!isset($this->tag_set[$tags[$i]])) {
+                    $this->tag_set[$tags[$i]] = $tag_index++;
+                }
+                $k = $this->getIndex($i, $terms);
+                if (!isset($this->word_feature[$k])) {
+                    $this->word_feature[$k] = [];
+                }
+            }
+        }
+        foreach (array_keys($this->word_feature) as $key) {
+            for ($i = -2; $i <= 2; $i++) {
+                if (!isset($this->word_feature[$key][$i])) {
+                    $this->word_feature[$key][$i] = [];
+                }
+                foreach($this->tag_set as $possible_tag => $tag_index) {
+                    if (!isset($this->word_feature[$key][$i][$tag_index])) {
+                        $this->word_feature[$key][$i][$tag_index] = 0;
+                    }
+                }
+            }
+        }
+        foreach($this->tag_set as $possible_tag => $tag_index) {
+            if (!isset($this->bias[$tag_index])) {
+                $this->bias[$tag_index] = 0;
+            }
+        }
+        echo "Training\n";
+        // gradient descent; stop early once loss improves by <= 0.000001
+        $cross_entropy_loss = 1;
+        $pre_cross_entropy_loss = 2;
+        for ($epoch = 0; $epoch < $num_epoch &&
+            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; $epoch++){
+            $this->min_w = 0;
+            $this->max_w = 0;
+            $time = time();
+            $dy_dw = [];
+            $dy_dw_n = [];
+            $pre_cross_entropy_loss = $cross_entropy_loss;
+            $cross_entropy_loss = 0;
+            $cross_entropy_loss_n = 0;
+            $dy_db = [];
+            $dy_db_n = [];
+            for($i = 0; $i < count($this->tag_set); $i++) {
+                $dy_db[$i] = 0;
+                $dy_db_n[$i] = 0;
+            }
+            // accumulate gradients over every term of every sentence
+            foreach ($term_tag_sentences as $term_tag_pairs) {
+                $terms = $term_tag_pairs[0];
+                $tags = $term_tag_pairs[1];
+                for ($i = 0; $i < count($terms); $i++) {
+                    $k = [];
+                    for ($j = -2; $j <= 2; $j++) {
+                        $k[$j] = $this->getIndex($i + $j, $terms);
+                    }
+                    foreach ($this->tag_set as $possible_tag => $tag_index) {
+                        $equality = ($possible_tag == $tags[$i]) ? 1 : 0;
+                        $sum = 0;
+                        for ($j = -2; $j <= 2; $j++) {
+                            $sum += $this->word_feature[$k[$j]][$j][$tag_index];
+                        }
+                        $sum += $this->bias[$tag_index];
+                        $sigmoid = 1 / (1 + exp(-1 * $sum));
+                        for ($j = -2; $j <= 2; $j++) {
+                            if (!isset($dy_dw[$k[$j]])) {
+                                $dy_dw[$k[$j]] = [];
+                                $dy_dw_n[$k[$j]] = [];
+                            }
+                            if (!isset($dy_dw[$k[$j]][$j])) {
+                                $dy_dw[$k[$j]][$j] = [];
+                                $dy_dw_n[$k[$j]][$j] = [];
+                            }
+                            if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
+                                $dy_dw[$k[$j]][$j][$tag_index] = 0;
+                                $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
+                            }
+                            $dy_dw[$k[$j]][$j][$tag_index] +=
+                                ($sigmoid - $equality);
+                            $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
+                        }
+                        // bias gradient; loss below avoids 0 * log(0) = NAN
+                        $dy_db[$tag_index] += ($sigmoid - $equality);
+                        $dy_db_n[$tag_index] += 1;
+                        $cross_entropy_loss -= ($equality ? log($sigmoid) :
+                            log(1 - $sigmoid));
+                        $cross_entropy_loss_n++;
+                    }
+                }
+            }
+            $cross_entropy_loss /= max(1, $cross_entropy_loss_n);
+            $duration = time() - $time;
+            echo "Epoch {$epoch} cross_entropy {$cross_entropy_loss}" .
+                " took {$duration} seconds\n";
+            foreach ($dy_dw as $i => $v1) {
+                foreach ($v1 as $j => $v2) {
+                    foreach ($v2 as $k => $v3) {
+                        $this->word_feature[$i][$j][$k] -=
+                            $dy_dw[$i][$j][$k] /
+                            $dy_dw_n[$i][$j][$k] *
+                            $learning_rate;
+                        if ($this->word_feature[$i][$j][$k] < $this->min_w) {
+                            $this->min_w = $this->word_feature[$i][$j][$k];
+                        }
+                        if ($this->word_feature[$i][$j][$k] > $this->max_w) {
+                            $this->max_w = $this->word_feature[$i][$j][$k];
+                        }
+                    }
+                }
+            }
+            foreach ($dy_db as $k => $v) {
+                $this->bias[$k] -= $dy_db[$k] / $dy_db_n[$k] * $learning_rate;
+            }
+            if ($epoch % 10 == 9 ) {
+                $this->saveWeights();
+            }
+        }
+        $this->saveWeights();
+    }
+    /**
+     * Predicts the part of speech tag for each term in a sentence
+     * @param mixed $sentence is an array of segmented words/terms
+     *  or a string with words/terms separated by space
+     * @return array of tags for these terms
+     */
+    public function predict($sentence)
+    {
+        if (is_array($sentence)) {
+            $terms = $sentence;
+        } else if ($sentence == "") {
+            $terms = [];
+        } else {
+            $terms = preg_split("/[\s]+/", $sentence);
+        }
+        if (!count($terms)) {
+            return [];
+        }
+        if (!$this->word_feature) {
+            $this->loadWeights();
+        }
+        $ret = [];
+        $pos_unknown_tags_list = [];
+        if (!empty($this->tokenizer) && method_exists($this->tokenizer,
+            "getPosUnknownTagsList")) {
+            $pos_unknown_tags_list = $this->tokenizer::getPosUnknownTagsList();
+        }
+        for($i = 0; $i < count($terms); $i++) {
+            $score = [];
+            // context keys are tag independent, so look them up once per term
+            $k = [];
+            for ($j = -2; $j <= 2; $j++) {
+                $k[$j] = $this->getIndex($i + $j, $terms);
+            }
+            foreach($this->tag_set as $possible_tag => $tag_index) {
+                $score[$possible_tag] = 0;
+                for ($j = -2; $j <= 2; $j++) {
+                    if (isset($this->word_feature[$k[$j]])) {
+                        $score[$possible_tag] +=
+                            $this->getW($k[$j], $j, $tag_index);
+                    } else if ($j == 0 && !in_array($possible_tag,
+                        $pos_unknown_tags_list)) {
+                        $score[$possible_tag] += $this->min_w;
+                    }
+                }
+                $score[$possible_tag] += $this->getB($tag_index);
+            }
+            $ret[] = array_keys($score, max($score))[0];
+        }
+        return $ret;
+    }
+}
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index c16882e1b..86ffeebdb 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -270,7 +270,7 @@ class PhraseParser
      * a format that does not involved punctuation that will be stripped
      * as we extract phrases.
      *
-     * @param string& $string a string of words, etc which might involve such
+     * @param string &$string a string of words, etc which might involve such
      *      terms
      * @param $lang a language tag to use as part of the canonicalization
      *     process not used right now
@@ -315,7 +315,7 @@ class PhraseParser
      * Given a string, hyphenates words in the string which appear in
      * a bloom filter for the given locale as phrases.
      *
-     * @param string& $string a string of words, etc which might involve such
+     * @param string &$string a string of words, etc which might involve such
      *      terms
      * @param $lang a language tag to use as part of the canonicalization
      *     process
@@ -949,7 +949,7 @@ class PhraseParser
      * index for (server:apache) even if the document itself did not contain
      * them.
      *
-     * @param array& $site associated array containing info about a downloaded
+     * @param array &$site associated array containing info about a downloaded
      *     (or read from archive) document.
      * @return array of meta words to be associate with this document
      */
@@ -1165,7 +1165,10 @@ class PhraseParser
      * @param string $link_text text of the anchor tag link came from
      * @param string $site_url url of the page link was on
      * @param array $url_info key value pairs which may have been generated
-     *      as part of the page processor
+     *  as part of the page processor
+     * @param array $link_word_lists list of words used in anchor text
+     *  associated with this link and their positions in the anchor text
+     * @return array meta words associated with the link
      */
     public static function calculateLinkMetas($url, $link_host, $link_text,
         $site_url, $url_info = [], $link_word_lists = [])
diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index bab52ca31..b6f096d90 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -28,26 +28,12 @@
  */
 namespace seekquarry\yioop\library;

-use seekquarry\yioop\locale\zh_CN\resources as ZH;
 use seekquarry\yioop\configs as C;
+
 /**
- * A Stochastic Finite-State Word-Segmenter.
- * This class contains necessary tools to segment terms
- * from sentences.
- *
- * Currently only supports Chinese.
- * Instruction to add a new language:
- * Add a switch case in the constructor.
- * Define the following function:
- * isExceptionImpl
- * See the class function 'isException' for more information
- * isPunctuationImpl
- * See the class function 'isPunctuation' for more information
- * isNotCurrentLangImpl
- * See the class function 'notCurrentLang' for more information
- * Chinese example is provided in the constructor
+ * Class for segmenting terms using Stochastic Finite State Word Segmentation
  *
- * @author Xianghong Sun
+ * @author Xianghong Sun and Chris Pollett (tweaks to adding new language)
  */
 class StochasticTermSegmenter
 {
@@ -61,22 +47,22 @@ class StochasticTermSegmenter
      * In the test of Chinese Segmentation on pku dataset,
      * the speed is 43.803s vs. 1.540s
      * Default value = 0.06
-     * The time and Peak Memory are 5.094 s and 98.97MB
+     * The time and Peak Memory are 5.094s and 98.97MB
      * @var number from 0 - 1.0
      */
     private $cache_pct;
     /**
-     * Cache. Will have runtime data for the segmentation
+     * Cache of sub trie of dictionary trie used to speed up look up
      * @var array
      */
-    private $cache=[];
+    private $cache = [];
     /**
-     * The language currently being used  e.g. zh_CN, ja
+     * The language currently being used  e.g. zh-CN, ja
      * @var string
      */
     public $lang;
     /**
-     * regular expression to determine if the non of the char in this
+     * Regular expression to determine if none of the chars in this
      * term is in current language
      * Recommanded expression for:
      * Chinese:  \p{Han}
@@ -91,123 +77,97 @@ class StochasticTermSegmenter
      */
     public $unknown_term_score;
     /**
-     * A dictionary file that contains the statistic infomation of
-     * the terms
+     * A dictionary that contains statistical information on terms for a
+     * language. A non-empty dictionary should have two fields:
+     * N, the number of terms in the dictionary; dic,
+     * a trie implemented using nested php arrays that implements the
+     * dictionary. The leaves of the trie have frequency counts for terms
+     * stored in the trie.
      * @var array
      */
-    public $dictionary_file;
+    public $dictionary;
+    /**
+     * Path on disk to where segmenter dictionary should be stored
+     * @var string
+     */
+    public $dictionary_path;
     /**
-     * Construct an instance of this class used for segmenting string with
+     * Constructs an instance of this class used for segmenting string with
      * respect to words in a locale using a probabilistic approach to evaluate
      * segmentation possibilities.
-     * @param string $lang is a string to indicate the language
+     * @param string $lang locale this instance will do segmentation for
+     * @param float $cache_pct percentage of whole trie that can be
+     *  cached for faster look-up
      */
     function __construct($lang, $cache_pct = 0.06)
     {
+        $lang = str_replace("-", "_", $lang);
+        $this->lang = $lang;
+        $this->dictionary_path = C\LOCALE_DIR .
+         "/$lang/resources/term_weights.txt.gz";
         $this->cache_pct = $cache_pct;
-        /* Add different attribute for different languages
-         * Currently only Chinese
+        $this->tokenizer = PhraseParser::getTokenizer($lang);
+        if (!is_object($this->tokenizer)) {
+            return;
+        }
+        /*
+         * To use a StochasticTermSegmenter, a locale's Tokenizer should
+         * implement isCardinalNumber, isOrdinalNumber, isDate,
+         * isPunctuation, isNotCurrentLang and optionally getNamedEntityTagger
          */
-        switch($lang)
-        {
-            case "zh_CN":
-            case "zh-CN":
-                $this->lang = "zh_CN";
-                /*
-                 * Check if the term passed in is an exception term
-                 */
-                $this->isExceptionImpl = function($term) {
-                    return ZH\Tokenizer::isCardinalNumber($term)
-                    || ZH\Tokenizer::isOrdinalNumber($term)
-                    || ZH\Tokenizer::isDate($term);
-                };
-                /*
-                 * Check if the term passed in is a punctuation
-                 */
-                $this->isPunctuationImpl = function($term)
-                {
-                    return ZH\Tokenizer::isPunctuation($term);
-                };
-                /*
-                 * Check if all the chars in the term is NOT current language
-                 */
-                $this->isNotCurrentLangImpl = function($term)
-                {
-                    return ZH\Tokenizer::isNotCurrentLang($term);
-                };
-                /*
-                 * named entity recognizer;
-                 */
-                $this->NER = ZH\Tokenizer::getNER();
-                break;
-            default:
-                $this->lang = $lang;
+        if (method_exists($this->tokenizer, "getNamedEntityTagger")) {
+            /*
+             * Named entity recognizer;
+             */
+            $this->named_entity_tagger =
+                $this->tokenizer::getNamedEntityTagger();
         }
     }
-    /**
-     * __call  for calling dynamic methods
-     * @param string $method method of this class to call
-     * @param array $args arguments to pass to method
-     * @return mixed result of method calculation
-     */
-    public function __call($method, $args)
-    {
-        return call_user_func_array($this->$method, $args);
-    }
-    /**
-     *  __get  for getting dynamic variables
-     * @param string $var_name variable to retrieve
-     * @return mixed result of retrieval
-     */
-    public function __get($var_name)
-    {
-        return $this->$var_name;
-    }
-    /**
-     *  __set  for assigning dynamic variables
-     * @param string $var_name variable to assign
-     * @param  mixed $value value to assign to it
-     */
-    public function __set($var_name, $value)
-    {
-        $this->$var_name = $value;
-    }
     /**
      * Check if the term passed in is an exception term
      * Not all valid terms should be indexed.
      * e.g. there are infinite combinations of numbers in the world.
      * isExceptionImpl should be defined in constructor if needed
-     * @param $term is a string that to be checked
+     * @param string $term is a string that to be checked
      * @return true if $term is an exception term, false otherwise
      */
     public function isException($term)
     {
-        if (isset($this->isExceptionImpl))
-            return $this->isExceptionImpl($term);
+        $tok = $this->tokenizer; // may be a non-object, see constructor
+        if (!empty($tok) && method_exists($tok, "isCardinalNumber") &&
+            method_exists($tok, "isOrdinalNumber") &&
+            method_exists($tok, "isDate")) {
+            return $tok::isCardinalNumber($term) ||
+                $tok::isOrdinalNumber($term) || $tok::isDate($term);
+        }
         return false;
     }
     /**
-     * Check if the term passed in is a punctuation
+     * Check if the term passed in is a punctuation character
      * isPunctuationImpl should be defined in constructor if needed
-     * @param $term is a string that to be checked
-     * @return true if $term is a punctuation, false otherwise
+     * @param string $term is a string that to be checked
+     * @return true if $term is some kind of punctuation, false otherwise
      */
     public function isPunctuation($term)
     {
-        if (isset($this->isPunctuationImpl))
-            return $this->isPunctuationImpl($term);
+        if (!empty($this->tokenizer) &&
+            method_exists($this->tokenizer, "isPunctuation")) {
+            return $this->tokenizer::isPunctuation($term);
+        }
         return false;
     }
     /**
-     * Check if all the chars in the term is NOT current language
-     * @param $term is a string that to be checked
-     * @return bool true if all the chars in $term is NOT current language
-     *         false otherwise
+     * Check if all the chars in the term are NOT from the current language
+     * @param string $term is a string that to be checked
+     * @return bool true if all the chars in $term are NOT from the current
+     *  language false otherwise
      */
     public function notCurrentLang($term)
     {
-        if (isset($this->isNotCurrentLangImpl))
-            return $this->isNotCurrentLangImpl($term);
+        if (!empty($this->tokenizer) &&
+            method_exists($this->tokenizer, "isNotCurrentLang")) {
+            return $this->tokenizer::isNotCurrentLang($term);
+        }
         return false;
     }
     /**
@@ -219,20 +179,18 @@ class StochasticTermSegmenter
      */
     public function train($text_files, $format = "default")
     {
-        $ctb_fmt=false;
+        $ctb_fmt = false;
         switch ($format) {
             case("default"):
                 break;
             case("CTB"):
-                $ctb_fmt=true;
+                $ctb_fmt = true;
                 break;
             default:
                 echo "Unrecognized format";
                 exit();
         }
-        $out_file = C\LOCALE_DIR .
-            "/{$this->lang}/resources/term_weight.txt.gz";
-        echo "Saving file to: $out_file\n";
+        echo "Saving file to: {$this->dictionary_path}\n";
         $dictionary = [];
         $N = 0;
         if (is_string($text_files)) {
@@ -261,31 +219,31 @@ class StochasticTermSegmenter
                 fclose($fh);
             }
         }
-        $this->dictionary_file = [];
-        $this->dictionary_file["N"] = 0;
-        $this->dictionary_file["dic"] = [];
-        ksort ($dictionary);
+        $this->dictionary = [];
+        $this->dictionary["N"] = 0;
+        $this->dictionary["dic"] = [];
+        ksort($dictionary);
         $start_char = null;
-        $tmp_array=[];
+        $tmp_array = [];
         foreach ($dictionary as $key => $value) {
-            if (mb_substr($key,0,1)!=$start_char) {
-                $this->dictionary_file["dic"][$start_char]
-                    = json_encode($tmp_array[$start_char]);
-                $tmp_array=[];
-                $start_char=mb_substr($key,0,1);
+            if (mb_substr($key, 0, 1) != $start_char) {
+                $this->dictionary["dic"][$start_char] =
+                    json_encode($tmp_array[$start_char]);
+                $tmp_array = [];
+                $start_char = mb_substr($key, 0, 1);
             }
             $this->add($key, $value, $tmp_array);
-            $this->dictionary_file["N"]++;
+            $this->dictionary["N"]++;
         }
         $this->unknown_term_score = $this->getScore(1);
-        file_put_contents($out_file,
-            gzencode(json_encode($this->dictionary_file), 9));
+        file_put_contents($this->dictionary_path,
+            gzencode(json_encode($this->dictionary), 9));
         return true;
     }

     /**
-     * This function is used to segment a list of files
-     * @param $text_files can be a file name or a list of file names
+     * Segments the text in a list of files
+     * @param mixed $text_files can be a file name or a list of file names
      *        to be segmented
      * @param bool $return_string return segmented string if true,
      *        print to stdout otherwise
@@ -325,11 +283,11 @@ class StochasticTermSegmenter
         return true;
     }
     /**
-     * Segment texts. Words are seperated by space
-     * @param string $text  to be segmented
+     * Segments text into words separated by space
+     * @param string $text to be segmented
      * @param bool $return_string return segmented string if true,
      *        print otherwise
-     * @return string segmented words with space or true/false;
+     * @return mixed segmented words with space or true/false;
      */
     public function segmentText($text, $return_string = false)
     {
@@ -340,10 +298,12 @@ class StochasticTermSegmenter
         foreach ($sentences as $line) {
             if (mb_strlen($line)) {
                 $t = $this->segmentSentence($line);
-                if ($return_string) {
-                    $result .= join( " ", $t) . "\n";
-                } else {
-                    echo join( " ", $t) . "\n";
+                if (!empty($t)) {
+                    if ($return_string) {
+                        $result .= join( " ", $t) . "\n";
+                    } else {
+                        echo join( " ", $t) . "\n";
+                    }
                 }
             }
         }
@@ -353,35 +313,37 @@ class StochasticTermSegmenter
         return true;
     }
     /**
-     * Segment a sentence into arrays of words.
-     * Need NOT contain any new line characters.
+     * Segments a single sentence into an array of words.
+     * Must NOT contain any new line characters.
      * @param string $sentence is a string without newline to be segmented
      * @return array of segmented words
      */
     public function segmentSentence($sentence)
     {
-        $t=preg_split("/[\s ]+/u", trim($sentence));
+        $t = preg_split("/[\s ]+/u", trim($sentence));
         if(count($t) > 1) {
             $ret = [];
             foreach($t as $s) {
-                $ret=array_merge($ret,$this->segmentSentence($s));
+                $segments = $this->segmentSentence($s);
+                if (is_array($segments)) {
+                    $ret = array_merge($ret, $segments);
+                }
             }
             return $ret;
         }
-        if (!$this->dictionary_file) {
-            $dic_file = C\LOCALE_DIR .
-                "/{$this->lang}/resources/term_weight.txt.gz";
-            if (!file_exists($dic_file)) {
-                crawlLog("$dic_file does not exist!");
+        if (!$this->dictionary) {
+            if (!file_exists($this->dictionary_path)) {
+                crawlLog("{$this->dictionary_path} does not exist!");
                 return null;
             }
-            $this->dictionary_file =
-                json_decode(gzdecode(file_get_contents($dic_file)), true);
+            $this->dictionary =
+                json_decode(gzdecode(file_get_contents(
+                    $this->dictionary_path)), true);
             gc_collect_cycles();
             $this->unknown_term_score = $this->getScore(1);
         }
         $cache_size =
-            floor(count($this->dictionary_file['dic']) * $this->cache_pct);
+            floor(count($this->dictionary['dic']) * $this->cache_pct);
         if ($cache_size == 0) {
             $cache_size = 1;
         }
@@ -390,11 +352,12 @@ class StochasticTermSegmenter
         if (!count($characters)) {
             return [];
         }
-        $ner_dict=[];
-        if (isset($this->NER)) {
-            $named_entities=$this->NER->predict($characters);
+        $net_dict = [];
+        if (isset($this->named_entity_tagger)) {
+            $named_entities = $this->named_entity_tagger->predict(
+                $characters);
             foreach($named_entities as $e) {
-                $this->add($e[0],1,$ner_dict);
+                $this->add($e[0], 1, $net_dict);
             }
         }
         $score = [];
@@ -407,7 +370,7 @@ class StochasticTermSegmenter
                 && !$this->isPunctuation($characters[$index])) {
                 $current_char = $characters[$index];
                 for($j = $index + 1; $j < count($characters); $j++) {
-                    if ($this->notCurrentLang($current_char.$characters[$j])
+                    if ($this->notCurrentLang($current_char . $characters[$j])
                         && !$this->isPunctuation($characters[$j])) {
                         $current_char .= $characters[$j];
                     } else {
@@ -424,7 +387,7 @@ class StochasticTermSegmenter
             //If date or number
             if ($this->isException($characters[$index]) ) {
                 $current_char = $characters[$index];
-                for($j = $index+1; $j<count($characters); $j++) {
+                for($j = $index+1; $j < count($characters); $j++) {
                     if (!$this->isException(
                         $current_char . $characters[$j])) {
                         break;
@@ -468,11 +431,11 @@ class StochasticTermSegmenter
                 $path[$index] = $index - 1;
             }
             //if entry exists, look for the term
-            if (isset($this->dictionary_file["dic"][$characters[$index]])) {
+            if (isset($this->dictionary["dic"][$characters[$index]])) {
                 if (!isset($this->cache[$characters[$index]])) {
                     $this->cache = [$characters[$index] =>
                         json_decode(
-                        $this->dictionary_file["dic"][$characters[$index]],
+                        $this->dictionary["dic"][$characters[$index]],
                         true)] + $this->cache;
                     while (count($this->cache) > $cache_size) {
                         array_pop($this->cache);
@@ -493,9 +456,9 @@ class StochasticTermSegmenter
                     }
                 }
             }
-            //check NER dictionary
-            if (isset($ner_dict[$characters[$index]])) {
-                $subdic = $ner_dict;
+            //Check Named Entity Tagger dictionary
+            if (isset($net_dict[$characters[$index]])) {
+                $subdic = $net_dict;
                 for ($j = $index; $j < count($characters); $j++) {
                     if (!isset($subdic[$characters[$j]])) {
                         break;
@@ -520,9 +483,9 @@ class StochasticTermSegmenter
         }
         $result = [];
         $t = 0;
-        foreach(array_reverse($tmp) as $nextnode) {
+        foreach(array_reverse($tmp) as $next_node) {
             $result_word = "";
-            while($t <= $nextnode) {
+            while($t <= $next_node) {
               $result_word .= $characters[$t];
               $t++;
             }
@@ -531,40 +494,42 @@ class StochasticTermSegmenter
         return $result;
     }
     /**
-     * This is the function to calculate scores for each word
+     * Calculates a score for a term based on its frequency versus that
+     * of the whole trie.
      * @param int $frequency is an integer tells the frequency of a word
      * @return float the score of the term.
      */
-    private function getScore($frequency)
+    public function getScore($frequency)
     {
-        if (!empty($this->dictionary_file["N"]) &&
-            is_numeric($this->dictionary_file["N"])) {
-            return -log($frequency / $this->dictionary_file["N"]);
+        if (!empty($this->dictionary["N"]) &&
+            is_numeric($this->dictionary["N"])) {
+            return -log($frequency / $this->dictionary["N"]);
         } else {
             return 0;
         }
     }
     /**
-     * Adds a term to the dictionary
+     * Adds a (term, frequency) pair to an array based trie
      *
-     * @param string $key the term to be inserted
-     * @param string $value the frequency to be inserted
-     * @param array $array for insertion
+     * @param string $term the term to be inserted
+     * @param string $frequency the frequency to be inserted
+     * @param array & $trie array based trie we want to insert the key value
+     *      pair into
      */
-    private function add($key, $value, & $array)
+    public function add($term, $frequency, & $trie)
     {
-        $trie_array = & $array;
-        for ($i = 0; $i < mb_strlen($key,"utf-8"); $i++) {
-            $character = mb_substr($key, $i, 1, "utf-8");
+        $sub_trie = & $trie;
+        for ($i = 0; $i < mb_strlen($term, "utf-8"); $i++) {
+            $character = mb_substr($term, $i, 1, "utf-8");
             $enc_char = $character;
             // If letter doesnt exist then create one by
             // assigning new array
-            if (!isset($trie_array[$enc_char])) {
-                $trie_array[$enc_char] = [];
+            if (!isset($sub_trie[$enc_char])) {
+                $sub_trie[$enc_char] = [];
             }
-            $trie_array = & $trie_array[$enc_char];
+            $sub_trie = & $sub_trie[$enc_char];
         }
         // Set end of term marker
-        $trie_array['$'] = $value;
+        $sub_trie['$'] = $frequency;
     }
 }
diff --git a/src/library/SuffixTree.php b/src/library/SuffixTree.php
index f0bcdb837..25eb36fd0 100644
--- a/src/library/SuffixTree.php
+++ b/src/library/SuffixTree.php
@@ -177,7 +177,7 @@ class SuffixTree
      * The number of elements out of $this->text that this node is currently
      * responsible for
      *
-     * @param array& $node the node to compute the length of
+     * @param array &$node the node to compute the length of
      */
     public function edgeLength(&$node)
     {
@@ -288,7 +288,7 @@ class SuffixTree
      * @param int $index a node in the suffix tree
      * @param string $path from root to current node
      * @param int $len number of nodes from root to current node in suffix tree
-     * @param array& $maximal assoc array of phrase => (cond_max => pos of
+     * @param array &$maximal assoc array of phrase => (cond_max => pos of
      *     conditional maximal subphrase, [0] => pos_1st_occurrence of phrase,
      *     [1]=>pos_2nd_occurrence of phrase, etc)
      */
diff --git a/src/library/UpgradeFunctions.php b/src/library/UpgradeFunctions.php
index ea463e0d9..d4389ea89 100644
--- a/src/library/UpgradeFunctions.php
+++ b/src/library/UpgradeFunctions.php
@@ -93,7 +93,7 @@ function upgradeLocales()
 /**
  * Used to force push the default Public and Wiki pages into the current
  * database
- * @param object& $db datasource to use to upgrade
+ * @param resource &$db datasource to use to upgrade
  */
 function upgradePublicHelpWiki(&$db)
 {
@@ -242,7 +242,7 @@ function getWikiHelpPages()
  * Inserting at an ID rather than at the end is useful since activities are
  * displayed in admin panel in order of increasing id.
  *
- * @param resource& $db database handle where Yioop database stored
+ * @param resource &$db database handle where Yioop database stored
  * @param string $string_id message identifier to give translations for
  *     for activity
  * @param string  $method_name admin_controller method to be called to perform
@@ -288,7 +288,7 @@ function addActivityAtId(&$db, $string_id, $method_name, $activity_id)
  * Adds or replaces a translation for a database message string for a given
  * IANA locale tag.
  *
- * @param resource& $db database handle where Yioop database stored
+ * @param resource &$db database handle where Yioop database stored
  * @param string $string_id message identifier to give translation for
  * @param string $locale_tag  the IANA language tag to update the strings of
  * @param string $translation the translation for $string_id in the language
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 4e80d00cf..bc3796db9 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -180,7 +180,7 @@ function getIniAssignMatch($matches)
  * bytes to destination string
  *
  * @param string $source  string to copy from
- * @param string& $destination string to copy to
+ * @param string &$destination string to copy to
  * @param int $start starting offset
  * @param int $length number of bytes to copy
  * @param string $timeout_msg for long copys message to print if taking more
@@ -228,7 +228,7 @@ function vByteEncode($pos_int)
 /**
  * Decodes from a string using variable byte coding an integer.
  *
- * @param string& $str string to use for decoding
+ * @param string &$str string to use for decoding
  * @param int $offset byte offset into string when var int stored
  * @return int the decoded integer
  */
@@ -278,7 +278,7 @@ function packPosting($doc_index, $position_list, $delta = true)
  *
  * @param string $posting a string containing
  *     a doc index position list pair coded encoded using modified9
- * @param int& $offset a offset into the string where the modified9 posting
+ * @param int &$offset a offset into the string where the modified9 posting
  *     is encoded
  * @param bool $dedelta if true then assumes the list is a sequence of
  *     differences (a delta list) and undoes the difference to get
@@ -309,7 +309,7 @@ function unpackPosting($posting, &$offset, $dedelta = true)
  * Given a string of postings adds $add_offset add to each offset to the
  * document map in each posting.
  *
- * @param string& $postings a string of index shard postings
+ * @param string &$postings a string of index shard postings
  * @param int $add_offset an fixed amount to add to each postings doc map offset
  *
  * @return string $new_postings where each doc offset has had $add_offset added
@@ -525,8 +525,8 @@ function packListModified9($continue_bits, $cnt, $pack_list)
  * Returns the next complete posting string from $input_string being at offset.
  * Does not do any decoding.
  *
- * @param string& $input_string a string of postings
- * @param int& $offset an offset to this string which will be updated after call
+ * @param string &$input_string a string of postings
+ * @param int &$offset an offset to this string which will be updated after call
  * @return string undecoded posting
  */
 function nextPostString(&$input_string, &$offset)
@@ -562,7 +562,7 @@ function nextPostString(&$input_string, &$offset)
  * encoded using Modified 9
  *
  * @param string $input_string string to decode from
- * @param int& $offset where to string in the string, after decode
+ * @param int &$offset where to string in the string, after decode
  *     points to where one was after decoding.
  * @return array sequence of positive integers that were decoded
  * @see encodeModified9
@@ -950,28 +950,6 @@ function intToMetric($num)
     }
     return $num;
 }
-/**
- *
- */
-function binomial($n, $k, $p = 1)
-{
-    //modified from wikipedia
-    if ($k < 0 || $k > $n) {
-        return 0;
-    }
-    if ($k == 0 || $k == $n) {
-        return 1;
-    }
-    $k = min($k, $n - $k); // symmetry
-    $res = 1;
-    for ($i = 0; $i < $k; $i++) {
-        $res *= ($p*($n - $i))/($i + 1);
-    }
-    if ($p != 1) {
-        $res *= pow(1 - $p, $n - $k);
-    }
-    return $res;
-}
 /**
  * Logs a message to a logfile or the screen
  *
@@ -1604,11 +1582,15 @@ function microTimestamp()
     return vsprintf('%d.%06d', gettimeofday());
 }
 /**
+ * Checks that a timestamp is within the time interval given by a
+ * start time (HH:mm) and a duration
  *
- * @param string $start_time
- * @param string $duration
- * @param int $time
- * @return int
+ * @param string $start_time string of the form (HH:mm)
+ * @param string $duration string containing an int in seconds
+ * @param int $time a Unix timestamp.
+ * @return int -1 if the time of day of $time is not within the given interval.
+ *      Otherwise, the Unix timestamp at which the interval will be over for
+ *      the same day as $time.
  */
 function checkTimeInterval($start_time, $duration, $time = -1)
 {
@@ -2213,7 +2195,7 @@ function computeLCS($lines1, $lines2, $offset = 0)
  * @param int $offset a number to add to each line number output into $lcs.
  *     This is useful if we have trimmed off the initially common lines from
  *     our two strings we are trying to compute the LCS of
- * @param array& $lcs an array of triples
+ * @param array &$lcs an array of triples
  *     (index_string1, index_string2, line)
  *     the indexes indicate the line number in each string, line is the line
  *     in common the two strings
diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php
index 692a3ea79..55909598b 100755
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@@ -158,7 +158,7 @@ class WebArchive
      *
      * @param resource $fh resource for the web archive file. If null
      *     the web archive is open first and close when the data is written
-     * @param array& $data data to write into the info block of the archive
+     * @param array &$data data to write into the info block of the archive
      */
     public function writeInfoBlock($fh = null, &$data = null)
     {
@@ -215,7 +215,7 @@ class WebArchive
      *
      * @param string $offset_field field in objects to return the byte offset
      *     at which they were stored
-     * @param array& $objects references to objects that will be stored
+     * @param array &$objects references to objects that will be stored
      *     the offset field in these references will be adjusted if
      * @param array $data data to write in the WebArchive's info block
      * @param string $callback name of a callback
diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php
index 8d748a693..272c3f9c0 100755
--- a/src/library/WebArchiveBundle.php
+++ b/src/library/WebArchiveBundle.php
@@ -166,7 +166,7 @@ class WebArchiveBundle
      * the resulting offsets given by $offset_field.
      *
      * @param string $offset_field field used to record offsets after storing
-     * @param array& $pages data to store
+     * @param array &$pages data to store
      * @return int the write_partition the pages were stored in
      */
     public function addPages($offset_field, &$pages)
diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php
index 8234b2b55..b06ef43ee 100755
--- a/src/library/WebQueueBundle.php
+++ b/src/library/WebQueueBundle.php
@@ -521,7 +521,7 @@ class WebQueueBundle implements Notifier
     }
     /**
      * Removes all url objects from $url_array which have been seen
-     * @param array& $url_array objects to check if have been seen
+     * @param array &$url_array objects to check if have been seen
      * @param array $field_names an array of components of a url_array element
      * which contain a url to check if seen
      */
diff --git a/src/library/WebSite.php b/src/library/WebSite.php
index 8d59ab521..04e38c9d1 100644
--- a/src/library/WebSite.php
+++ b/src/library/WebSite.php
@@ -1,9 +1,9 @@
 <?php
 /**
- * seekquarry\yioop\Website -- a small web server and web routing engine
+ * seekquarry\yioop\Website --
+ * a small web server and web routing engine
  *
- *
- * Copyright (C) 2018  Chris Pollett chris@pollett.org
+ * Copyright (C) 2018-2020  Chris Pollett chris@pollett.org
  *
  * LICENSE:
  *
@@ -46,6 +46,8 @@ use seekquarry\yioop\configs as C;
  * PHP superglobals like $_GET, $_POST, $_REQUEST, $_COOKIE, $_SESSION,
  * $_FILES, etc and endeavors to make it easy to code apps in a rapid PHP
  * style.
+ *
+ * @author Chris Pollett
  */
 class WebSite
 {
diff --git a/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php b/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php
index d0442ec56..8c61cacdd 100644
--- a/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php
+++ b/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php
@@ -193,7 +193,7 @@ class OdpRdfArchiveBundleIterator extends TextArchiveBundleIterator
      * document
      *
      * @param object $dom document object for one Topic tag tag
-     * @param array& $site a reference to an array of header and page info
+     * @param array &$site a reference to an array of header and page info
      *     for an html page
      */
     public function processTopic($dom, &$site)
@@ -229,7 +229,7 @@ class OdpRdfArchiveBundleIterator extends TextArchiveBundleIterator
      * document
      *
      * @param object $dom document object for one Topic tag tag
-     * @param array& $site a reference to an array of header and page info
+     * @param array &$site a reference to an array of header and page info
      *     for an html page
      */
     public function processExternalPage($dom, &$site)
diff --git a/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php b/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php
index 2349a3e5c..bef2ad6e3 100644
--- a/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php
+++ b/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php
@@ -316,7 +316,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
      * Helper function for nextChunk to advance the parition if we are
      * at the end of the current archive file
      *
-     * @param array& $info a struct with data about current chunk. will up start
+     * @param array &$info a struct with data about current chunk. will up start
      *     partition flag
      */
     public function updatePartition(&$info)
@@ -724,7 +724,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
     {
         $info = $this->getNextTagsData([$tag]);
         if (!isset($info[1])) {
-            return $info;
+            return $info;
         }
         return $info[0];
     }
diff --git a/src/library/classifiers/Classifier.php b/src/library/classifiers/Classifier.php
index 63cb9d23b..1d217de31 100644
--- a/src/library/classifiers/Classifier.php
+++ b/src/library/classifiers/Classifier.php
@@ -984,8 +984,8 @@ class Classifier implements CrawlConstants
      * @param array $summary page summary to classify, passed by reference
      * @param array $classifiers list of Classifier instances, each prepared
      * for classifying (via the prepareToClassify method)
-     * @param array& $active_classifiers
-     * @param array& $active_rankers
+     * @param array &$active_classifiers
+     * @param array &$active_rankers
      */
     public static function labelPage(&$summary, $classifiers,
         &$active_classifiers, &$active_rankers)
diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php
index 75122a164..15399e50a 100755
--- a/src/library/index_bundle_iterators/DocIterator.php
+++ b/src/library/index_bundle_iterators/DocIterator.php
@@ -96,9 +96,17 @@ class DocIterator extends IndexBundleIterator
      * Creates a word iterator with the given parameters.
      * @param string $index_name time_stamp of the to use
      * @param SearchfiltersModel $filter Model responsible for keeping
-     *      track of edited and deleted search results
+     *  track of edited and deleted search results
+     * @param int $results_per_block number of results in a block of results
+     *  returned in one go from the iterator
+     * @param int $direction when results are accessed from $index_name in
+     *  which order they should be presented. self::ASCENDING is from first
+     *  added to last added, self::DESCENDING is from last added to first
+     *  added. Note: this value is not saved permanently. So you
+     *  could in theory open two read only versions of the same bundle but
+     *  reading the results in different directions
      * @param int $results_per_block the maximum number of results that can
-     *      be returned by a findDocsWithWord call
+     *  be returned by a findDocsWithWord call
      */
     public function __construct($index_name, $filter = null,
         $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK,
@@ -225,7 +233,9 @@ class DocIterator extends IndexBundleIterator
         return $results;
     }
     /**
-     *
+     * Get the document offset prior to the current $doc_offset
+     * @param int $doc_offset an offset into the document map of an IndexShard
+     * @return int previous doc_offset
      */
     public function getPreviousDocOffset($doc_offset)
     {
diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php
index 6c9f18b8f..e303f37a7 100644
--- a/src/library/index_bundle_iterators/GroupIterator.php
+++ b/src/library/index_bundle_iterators/GroupIterator.php
@@ -227,7 +227,7 @@ class GroupIterator extends IndexBundleIterator
      * have been remembered in grouped_keys and will be ignored in the return
      * result of this function.
      *
-     * @param array& $pages pages to group
+     * @param array &$pages pages to group
      * @return array $pre_out_pages pages after grouping
      */
     public function groupByHashUrl(&$pages)
@@ -267,7 +267,7 @@ class GroupIterator extends IndexBundleIterator
      * that group as its representative. The function then modifies the
      * supplied argument array to make it an array of group representatives.
      *
-     * @param array& $pre_out_pages documents previously grouped by hash of url
+     * @param array &$pre_out_pages documents previously grouped by hash of url
      */
     public function groupByHashAndAggregate(&$pre_out_pages)
     {
@@ -319,7 +319,7 @@ class GroupIterator extends IndexBundleIterator
      * of single summarized documents for each group. These single summarized
      * documents have aggregated scores.
      *
-     * @param array& $pre_out_pages array of groups of pages for which out pages
+     * @param array &$pre_out_pages array of groups of pages for which out pages
      *     are to be generated.
      * @return array $out_pages array of single summarized documents
      */
@@ -379,7 +379,7 @@ class GroupIterator extends IndexBundleIterator
      * @param string $hash_url the crawlHash of the url of the page we are
      *      scoring which will be compared with that of the host to see if
      *      the current page has the url of a hostname.
-     * @param array& $pre_hash_page pages to compute scores for
+     * @param array &$pre_hash_page pages to compute scores for
      */
     public function aggregateScores($hash_url, &$pre_hash_page)
     {
diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php
index c2230b60e..3c7ee844b 100644
--- a/src/library/index_bundle_iterators/IndexBundleIterator.php
+++ b/src/library/index_bundle_iterators/IndexBundleIterator.php
@@ -149,6 +149,8 @@ abstract class IndexBundleIterator implements CrawlConstants
      *
      * @param array $gen_doc1  first ordered pair
      * @param array $gen_doc2  second ordered pair
+     * @param int $direction whether the comparison should be done for
+     *  a self::ASCENDING or a self::DESCENDING search
      * @return int -1,0,1 depending on which is bigger
      */
      public function genDocOffsetCmp($gen_doc1, $gen_doc2, $direction =
@@ -185,7 +187,13 @@ abstract class IndexBundleIterator implements CrawlConstants
         return 0;
     }
     /**
-     *
+     * Returns the direction of an IndexBundleIterator. Depending on the
+     * iterator, this could be either forward from the start of an index
+     * (self::ASCENDING) or backward from the end of the index
+     * (self::DESCENDING). For this base class, the function always returns
+     * self::ASCENDING, but subclasses might return different values.
+     * @return int either CrawlConstants::ASCENDING or
+     *  CrawlConstants::DESCENDING
      */
     public function getDirection()
     {
diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php
index 97c1c21d8..d6012b2e0 100644
--- a/src/library/index_bundle_iterators/IntersectIterator.php
+++ b/src/library/index_bundle_iterators/IntersectIterator.php
@@ -266,8 +266,8 @@ class IntersectIterator extends IndexBundleIterator
      * @param mixed $next_pos * or int if * next_pos must be >= $cur_pos
      *     +len_search_term. $next_pos represents the position the next
      *     quoted term should be at
-     * @param $qp $position_list_index => $len_of_list_term pairs
-     * @return -1 on failure, 0 on backtrack, 1 on success
+     * @param array $qp $position_list_index => $len_of_list_term pairs
+     * @return int -1 on failure, 0 on backtrack, 1 on success
      */
     public function checkQuote(&$position_lists, $cur_pos, $next_pos, $qp)
     {
@@ -308,10 +308,10 @@ class IntersectIterator extends IndexBundleIterator
      * Given the position_lists of a collection of terms computes
      * a score for how close those words were in the given document
      *
-     * @param array& $word_position_lists a 2D array item
+     * @param array &$word_position_lists a 2D array item
      *      number => position_list (locations in doc where item occurred) for
      *      that item.
-     * @param array& $word_len_lists length for each item of its position list
+     * @param array &$word_len_lists length for each item of its position list
      * @param bool $is_doc whether this is the position list of a document
      *     or a link
      * @return sum of inverse of all covers computed by plane sweep algorithm
diff --git a/src/library/index_bundle_iterators/NetworkIterator.php b/src/library/index_bundle_iterators/NetworkIterator.php
index fbbd9d9fa..7839255f2 100644
--- a/src/library/index_bundle_iterators/NetworkIterator.php
+++ b/src/library/index_bundle_iterators/NetworkIterator.php
@@ -351,9 +351,18 @@ class NetworkIterator extends IndexBundleIterator
         return $pages;
     }
     /**
+     * If we want the top $num_results results (a block) and we have
+     * $num_machines, this computes how many results we should request
+     * of each machine.
      * Buttcher, Clark, Cormack give an exact formula to compute this,
      * but it is slow to compute
-     * We instead compute a (1/$num_machines^{3/4})* $num_results +5;
+     * We instead compute a (1/$num_machines^{3/4})* $num_results + 5;
+     * @param int $num_machines number of machines each having a portion
+     *  of the results
+     * @param int $num_results the value k such that we want the top k
+     *  best overall results.
+     * @return int number of best results we should ask from each machine
+     *  to ensure we get the top k best results overall
      */
     public static function serverAdjustedResultsPerBlock($num_machines,
         $num_results)
diff --git a/src/library/indexing_plugins/IndexingPlugin.php b/src/library/indexing_plugins/IndexingPlugin.php
index 2a018bdb1..d6a312680 100644
--- a/src/library/indexing_plugins/IndexingPlugin.php
+++ b/src/library/indexing_plugins/IndexingPlugin.php
@@ -173,11 +173,13 @@ abstract class IndexingPlugin
      * them in the getAdditionalMetaWords function for this plugin, or they
      * will not be recognized in queries.
      *
-     * @param array& $summary the summary data produced by the relevant page
+     * @param array &$summary the summary data produced by the relevant page
      *     processor's handle method; modified in-place.
      * @param string $url the url where the summary contents came from
      */
-    public function pageSummaryProcessing(&$summary, $url) {return null;}
+    public function pageSummaryProcessing(&$summary, $url) {
+        return null;
+    }
     /**
      * This method is called by the queue_server with the name of
      * a completed index. This allows the indexing plugin to
diff --git a/src/library/indexing_plugins/WordfilterPlugin.php b/src/library/indexing_plugins/WordfilterPlugin.php
index 28b6e66ed..8570c2c06 100644
--- a/src/library/indexing_plugins/WordfilterPlugin.php
+++ b/src/library/indexing_plugins/WordfilterPlugin.php
@@ -57,7 +57,7 @@ require_once C\BASE_DIR. "/library/LocaleFunctions.php";
  * NOYDIR, NONE or can be the word NOPROCESS, JUSTFOLLOW, NOTCONTAIN.
  * The preconditions is checked in the function checkFilter. Details on
  * what constitutes are legal precondition are described in the
- * @see $filter_rules and @see $rules_string documentation.
+ * See $filter_rules and $rules_string documentation.
  * Usually, if checkFilter returns true then pageSummaryProcessing adds the
  * meta tags to the document summary and returns. If one of the actions
  * was NOTCONTAIN, then only if checkFilter returned false are the meta tags
@@ -215,7 +215,7 @@ EOD;
      * whether the summary title and description satisfy various rules
      * in $this->filter_rules
      *
-     * @param array& $summary the summary data produced by the relevant page
+     * @param array &$summary the summary data produced by the relevant page
      *     processor's handle method; modified in-place.
      * @param string $url the url where the summary contents came from
      */
@@ -272,7 +272,7 @@ EOD;
     /**
      * Used to check if $precondition is met by a supplied string.
      *
-     * @see $filter_terms to see what constitutes a valid precondition.
+     * See $filter_terms to see what constitutes a valid precondition.
      *
      * @param string $preconditions the terms and their
      *      frequencies to search for
@@ -350,7 +350,7 @@ EOD;
      * it. It then modifies $data so that if the plugin's configuration view
      * is drawn it makes use of the current plugin configuration info.
      *
-     * @param array& $data info to be used by the admin view to draw itself.
+     * @param array &$data info to be used by the admin view to draw itself.
      */
     public function configureHandler(&$data)
     {
@@ -469,7 +469,7 @@ EOD;
     /**
      * Used to draw the HTML configure screen for the word filter plugin.
      *
-     * @param array& $data contains configuration data to be used in drawing
+     * @param array &$data contains configuration data to be used in drawing
      *     the view
      */
     public function configureView(&$data)
diff --git a/src/library/media_jobs/AnalyticsJob.php b/src/library/media_jobs/AnalyticsJob.php
index 86f6d9287..a9eea2b1a 100644
--- a/src/library/media_jobs/AnalyticsJob.php
+++ b/src/library/media_jobs/AnalyticsJob.php
@@ -141,7 +141,7 @@ class AnalyticsJob extends MediaJob
      * for which statistics have been requested but not yet computed.
      * If these queries take too long it saves partial results and returns.
      *
-     * @param array& $data associative array which will have all the statistics
+     * @param array &$data associative array which will have all the statistics
      *     data collected.
      */
     public function computeCrawlStatistics()
diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php
index 2171d3511..6e54fd832 100644
--- a/src/library/media_jobs/FeedsUpdateJob.php
+++ b/src/library/media_jobs/FeedsUpdateJob.php
@@ -61,14 +61,18 @@ class FeedsUpdateJob extends MediaJob
      */
     public $db;
     /**
-     * @var IndexArchiveBundle
+     * The FeedArchiveBundle to put feed items into periodically
+     * @var FeedArchiveBundle
      */
     public $index_archive;
     /**
+     * News Feed Items found from the current feed
      * @var array
      */
     public $found_items;
     /**
+     * Used to keep track of image urls of thumbnails to download
+     * for feed items
      * @var array
      */
     public $media_urls;
@@ -189,7 +193,12 @@ class FeedsUpdateJob extends MediaJob
         $this->media_urls = [];
     }
     /**
-     * @param array $thumb_sites
+     * Download images and create thumbnails for a list of image urls.
+     *
+     * @param array $thumb_sites array of arrays. The sub-array should contain
+     *  a field CrawlConstants::THUMB_URL with url to download.
+     *  After download the thumbnail is saved in the file
+     *  CrawlConstants::FILE_NAME.
      */
     private function getThumbs($thumb_sites)
     {
@@ -828,7 +837,7 @@ class FeedsUpdateJob extends MediaJob
      * Updates trending term counts based on the string from the current
      * feed item.
      *
-     * @param array& $term_counts lang => [term => occurrences]
+     * @param array &$term_counts lang => [term => occurrences]
      * @param string $source_phrase original non-stemmed phrase from feed
      *      item to adjust $term_counts with. Used to remember non-stemmed
      *      terms. We assume we have already extracted position lists from
@@ -1204,6 +1213,10 @@ class FeedsUpdateJob extends MediaJob
             }
         }
     }
+    /**
+     * Sets the value of $this->index_archive to point to
+     * the FeedArchiveBundle associated to feeds on this instance of Yioop
+     */
     public function getFeedArchive()
     {
         $dir = C\CRAWL_DIR . '/cache/' . self::feed_index_data_base_name;
diff --git a/src/library/processors/ImageProcessor.php b/src/library/processors/ImageProcessor.php
index 3f54d9b1e..9bc716e08 100755
--- a/src/library/processors/ImageProcessor.php
+++ b/src/library/processors/ImageProcessor.php
@@ -58,7 +58,12 @@ class ImageProcessor extends PageProcessor
         return null;
     }
     /**
+     * Used to save a temporary file with the data downloaded for a url
+     * while carrying out image processing
      *
+     * @param string $page contains data about an image that one needs to save
+     * @param string $url where $page data came from
+     * @param string $file_extension to be associated with the $page data
      */
     public function saveTempFile($page, $url, $file_extension)
     {
diff --git a/src/library/processors/PageProcessor.php b/src/library/processors/PageProcessor.php
index 0f9468f8d..a6826e29f 100644
--- a/src/library/processors/PageProcessor.php
+++ b/src/library/processors/PageProcessor.php
@@ -173,7 +173,7 @@ abstract class PageProcessor implements CrawlConstants
     /**
      * Should be implemented to compute a summary based on a
      * text string of a document. This method is called from
-     * @see handle($page, $url)
+     * @see PageProcessor::handle
      *
      * @param string $page string of a document
      * @param string $url location the document came from
diff --git a/src/library/processors/TextProcessor.php b/src/library/processors/TextProcessor.php
index c08f29093..fca5817fa 100755
--- a/src/library/processors/TextProcessor.php
+++ b/src/library/processors/TextProcessor.php
@@ -194,7 +194,7 @@ class TextProcessor extends PageProcessor
      * If an end of file is reached before closed tags are seen, this methods
      * closes these tags in the correct order.
      *
-     * @param string& $page a reference to an xml or html document
+     * @param string &$page a reference to an xml or html document
      */
     public static function closeDanglingTags(&$page)
     {
diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php
index 3c2890eee..032c23b2b 100644
--- a/src/library/summarizers/Summarizer.php
+++ b/src/library/summarizers/Summarizer.php
@@ -41,8 +41,8 @@ use seekquarry\yioop\library\processors\PageProcessor;
  * document and produces a summary of that document up to
  * PageProcessor::$max_description_len many characters. Summarizers
  * also contain various methods to generate word cloud from such a summary
- * @see wordCloudFromSummary and/or document centroids
- * @see wordCloudFromTermVector.
+ * @see Summarizer::wordCloudFromSummary and/or document centroids
+ * wordCloudFromTermVector.
  *
  * @author Charles Bocage charles.bocage@sjsu.edu
  *   Chris Pollett chris@pollett.org
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index 31fe5bc3f..2f32b9e65 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -277,7 +277,7 @@ class Tokenizer
      * This methods tries to handle punctuation in terms specific to the
      * English language such as abbreviations.
      *
-     * @param string& $string a string of words, etc which might involve such
+     * @param string &$string a string of words, etc which might involve such
      *      terms
      */
     public function canonicalizePunctuatedTerms(&$string)
@@ -569,7 +569,7 @@ class Tokenizer
      * sentence, create a phrase string for each of the next nodes
      * which belong to part of speech group $type.
      *
-     * @param array& $cur_node node within parse tree
+     * @param array &$cur_node node within parse tree
      * @param array $tagged_phrase parse tree for phrase
      * @param string $type self::$noun_type, self::$verb_type, etc
      * @return string phrase string involving only terms of that $type
@@ -1661,7 +1661,7 @@ class Tokenizer
      * @param array $tagged_tokens array pairs as might come from tagTokenize
      * @param bool $with_tokens whether to include the terms and the tags
      *      in the output string or just the part of speech tags
-     * @return $tagged_phrase a phrase with terms in the format token~tag
+     * @return string $tagged_phrase a phrase with terms in the format token~tag
      *      ($with_token == true) or space separated tags (!$with_token).
      */
     private static function taggedPartOfSpeechTokensToString($tagged_tokens,
diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php
index faf51af23..095f7ab48 100755
--- a/src/locale/hi/resources/Tokenizer.php
+++ b/src/locale/hi/resources/Tokenizer.php
@@ -335,7 +335,7 @@ class Tokenizer
      * sentence, create a phrase string for each of the next nodes
      * which belong to part of speech group $type.
      *
-     * @param array& $cur_node node within parse tree
+     * @param array &$cur_node node within parse tree
      * @param array $tagged_phrase parse tree for phrase
      * @param string $type self::$noun_type, self::$verb_type, etc
      * @return string phrase string involving only terms of that $type
diff --git a/src/locale/it/resources/Tokenizer.php b/src/locale/it/resources/Tokenizer.php
index b70eea3c8..d0f839753 100755
--- a/src/locale/it/resources/Tokenizer.php
+++ b/src/locale/it/resources/Tokenizer.php
@@ -45,7 +45,7 @@ class Tokenizer
     /**
      * A list of frequently occurring terms for this locale which should
      * be excluded from certain kinds of queries
-     * @array
+     * @var array
      */
     public static $stop_words = [
         'http', 'https',
@@ -189,7 +189,7 @@ class Tokenizer
      *
      * @param $parent_string is the string in which we wish to find the suffix
      * @param $substring is the suffix we wish to check
-     * @return $pos as the starting position of the suffix $substring in
+     * @return int $pos as the starting position of the suffix $substring in
      * $parent_string if it exists, else false
      */
     private static function checkForSuffix($parent_string,$substring)
@@ -221,8 +221,8 @@ class Tokenizer
     /**
      * Computes the starting index for region R1
      *
-     * @param $string is the string for which we wish to find the index
-     * @return $r1_start as the starting index for R1 for $string
+     * @param string $string for which we wish to find the index
+     * @return int $r1_start as the starting index for R1 for $string
      */
     private static function r1($string)
     {
@@ -249,8 +249,8 @@ class Tokenizer
     /**
      * Computes the starting index for region R2
      *
-     * @param $string is the string for which we wish to find the index
-     * @return $r2_start as the starting index for R1 for $string
+     * @param string $string for which we wish to find the index
+     * @return int $r2_start as the starting index for R2 for $string
      */
     private static function r2($string)
     {
@@ -283,8 +283,8 @@ class Tokenizer
     /**
      * Computes the starting index for region RV
      *
-     * @param $string is the string for which we wish to find the index
-     * @return $rv_start as the starting index for RV for $string
+     * @param string $string for which we wish to find the index
+     * @return int $rv_start as the starting index for RV for $string
      */
     private static function rv($string)
     {
@@ -351,7 +351,7 @@ class Tokenizer
     /**
      * Checks if a character is a vowel or not
      *
-     * @param $char is the character to be checked
+     * @param string $char is the character to be checked
      * @return bool if $char is a vowel
      */
     private static function isVowel($char)
@@ -376,9 +376,9 @@ class Tokenizer
      * Computes the longest suffix for a given string from a given set of
      * suffixes
      *
-     * @param $string is the for which the maximum suffix is to be found
-     * @param $suffixes is an array of suffixes
-     * @return $max_suffix is the longest suffix for $string
+     * @param string $string for which the maximum suffix is to be found
+     * @param array $suffixes an array of suffixes
+     * @return int $max_suffix is the longest suffix for $string
      */
     private static function maxSuffix($string, $suffixes)
     {
@@ -404,9 +404,9 @@ class Tokenizer
      * Replaces all acute accents in a string by grave accents and also handles
      * accented characters
      *
-     * @param $string is the string from in which the acute accents are to be
+     * @param string $string in which the acute accents are to be
      * replaced
-     * @return $string with changes
+     * @return string with changes
      */
     private static function acuteByGrave($string)
     {
diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php
index 72665cf7e..3548d8eed 100755
--- a/src/locale/zh_CN/resources/Tokenizer.php
+++ b/src/locale/zh_CN/resources/Tokenizer.php
@@ -45,7 +45,7 @@ class Tokenizer
      * A list of frequently occurring terms for this locale which should
      * be excluded from certain kinds of queries. This is also used
      * for language detection
-     * @array
+     * @var array
      */
     public static $stop_words = ['一', '人', '里', '会', '没', '她', '吗', '去',
         '也', '有', '这', '那', '不', '什', '个', '来', '要', '就', '我', '你',
@@ -65,19 +65,19 @@ class Tokenizer
     public static $non_char_preg = "/^[^\p{Han}]+$/u";
     /**
      * The dictionary of characters can be used as Chinese Numbers
-     * @string
+     * @var string
      */
     public static $num_dict =
        "1234567890○〇零一二两三四五六七八九十百千万亿".
        "0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億";
     /**
      * Dots used in Chinese Numbers
-     * @string
+     * @var string
      */
     public static $dot = "\..点";
     /**
      * A list of characters can be used at the end of numbers
-     * @string
+     * @var string
      */
     public static $num_end = "%%";
     /**
@@ -86,56 +86,58 @@ class Tokenizer
      * ex. "十分" in most of time means "very", but it will
      * be determined to be "10 minutes" by the function so we
      * need to remove it
-     * @array of string
+     * @var array of string
      */
     public static $exception_list= ["十分","一","一点","千万",
     "万一", "一一", "拾", "一时", "千千", "万万", "陆"];
     /**
      * A list of characters can be used as Chinese punctuations
-     * @string
+     * @var string
      */
     public static $punctuation_preg =
     "/^([\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" .
     "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" .
     "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" .
     "\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u";
+
     /**
      * Any unique identifier corresponding to the component of a triplet which
      * can be answered using a question answer list
-     * @string
+     * @var string
      */
     public static $question_token = "qqq";
     /**
      * Words array that determine if a sentence passed in is a question
-     * @array
+     * @var array
      */
-    public static $question_words=[
-        "any"=>["谁"=>"who",
-                "哪儿|哪里"=>"where",
-                "哪个"=>"which",
-                "哪些"=>"list",
-                "哪"=>["after"=>[   "1|一"=>"which",
+    public static $question_words = [
+        "any" => ["谁" => "who",
+                "哪儿|哪里" => "where",
+                "哪个" => "which",
+                "哪些" => "list",
+                "哪" => ["after" => [   "1|一"=>"which",
                                     "[2-9]|[1-9][0-9]+"=>"list"
                                 ],
                        "other"=>"where"
                         ],
-                "什么|啥|咋"=>[ "after"=>[    "地方"=>"where",
+                "什么|啥|咋" => [ "after" => [    "地方"=>"where",
                                             "地点"=>"where",
                                             "时\w*"=>"when"
                                        ],
-                                "other"=>"what"],
-                "怎么|怎样|怎么样|如何"=>"how",
-                "为什么"=>"why",
-                "多少"=>"how many",
-                "几\w*"=>["any"=>["吗|\?|?"=>"how many"], "other"=>false],
-                "多久"=>"how long",
-                "多大"=>"how big"
+                                "other" => "what"],
+                "怎么|怎样|怎么样|如何" => "how",
+                "为什么" => "why",
+                "多少" => "how many",
+                "几\w*" => ["any" => ["吗|\?|?" => "how many"],
+                    "other" => false],
+                "多久" => "how long",
+                "多大" => "how big"
                 ],
-        "other"=>[  "any"=>[    "吗"=>"yesno",
-                                "呢"=>"what about"
+        "other" => [  "any" => [    "吗"=>"yesno",
+                                "呢" => "what about"
                            ],
-                    "other"=>[  "other"=>false,
-                                "any"=>["\?|?"=>"yesno"]
+                    "other" => [  "other" => false,
+                                "any" => ["\?|?" => "yesno"]
                              ]
                  ]
         ];
@@ -143,12 +145,12 @@ class Tokenizer
      * List of adjective-like parts of speech that might appear in lexicon file
      * Predicative adjective: VA
      * other noun-modifier: JJ
-     * @array
+     * @var array
      */
-    public static $adjective_type = ["VA","JJ"];
+    public static $adjective_type = ["VA", "JJ"];
     /**
      * List of adverb-like parts of speech that might appear in lexicon file
-     * @array
+     * @var array
      */
     public static $adverb_type = ["AD"];
     /**
@@ -156,9 +158,9 @@ class Tokenizer
      * file
      * Coordinating conjunction: CC
      * Subordinating conjunction: CS
-     * @array
+     * @var array
      */
-    public static $conjunction_type = ["CC","CS"];
+    public static $conjunction_type = ["CC", "CS"];
     /**
      * List of determiner-like parts of speech that might appear in lexicon
      * file
@@ -175,7 +177,7 @@ class Tokenizer
      * Temporal Noun: NT
      * Other Noun: NN
      * Pronoun: PN
-     * @array
+     * @var array
      */
     public static $noun_type = ["NR", "NT", "NN", "PN"];
     /**
@@ -185,32 +187,32 @@ class Tokenizer
      * Other verb: VV
      * Short passive voice: SB
      * Long passive voice: LB
-     * @array
+     * @var array
      */
     public static $verb_type = ["VC", "VE", "VV", "SB", "LB"];
     /**
      * List of particle-like parts of speech that might appear in lexicon file
      * No meaning words that can appear anywhere
-     * @array
+     * @var array
      */
     public static $particle_type = [
         "AS", "ETC", "DEC", "DEG", "DEV", "MSP",
         "DER", "SP", "IJ", "FW"];
     /**
      * Stochastic Term Segmenter instance
-     * @object
+     * @var object
      */
-    private static $stochasticTermSegmenter;
+    private static $stochastic_term_segmenter;
     /**
-     * named Entity Recognizer instance
-     * @object
+     * Named Entity tagger instance
+     * @var object
      */
-    private static $namedEntityRecognizer;
+    private static $named_entity_tagger;
     /**
      * PosTagger instance
-     * @object
+     * @var object
      */
-    private static $posTagger;
+    private static $pos_tagger;
     /**
      * Removes the stop words from the page (used for Word Cloud generation
      * and language detection)
@@ -237,16 +239,16 @@ class Tokenizer
      * @param string $method  indicates which method to use
      * @return string with words separated by space
      */
-    public static function segment($pre_segment, $method="STS")
+    public static function segment($pre_segment, $method = "STS")
     {
         switch($method) {
-            case("RMM"):
+            case "RMM":
                 return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN",
                 ['/^\d+$/', '/^[a-zA-Z]+$/']);
                 break;
-            case("STS"):
+            case "STS":
                 return self::getStochasticTermSegmenter()
-                        ->segmentText($pre_segment,true);
+                        ->segmentText($pre_segment, true);
                 break;
         }
     }
@@ -283,7 +285,7 @@ class Tokenizer
             "]+(年|年代|月|日|时|小时|時|小時|" .
             "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term);
     }
-    /*
+    /**
      * Check if the term is a punctuation
      */
     public static function isPunctuation($term)
@@ -300,31 +302,26 @@ class Tokenizer
     {
         return preg_match(self::$non_char_preg, $term);
     }
-    /*
-     * Create stochastic term segmenter
+    /**
+     * Get the segmenter instance
+     * @return StochasticTermSegmenter
      */
-    public static function createStochasticTermSegmenter($cache_pct=0.06)
+    public static function getStochasticTermSegmenter()
     {
-        self::$stochasticTermSegmenter
-            = new L\StochasticTermSegmenter("zh_CN", $cache_pct);
+        if (!self::$stochastic_term_segmenter) {
+            self::$stochastic_term_segmenter
+                = new L\StochasticTermSegmenter("zh-CN");
+        }
+        return self::$stochastic_term_segmenter;
     }
-    /*
-     * Destory stochastic term segmenter
+    /**
+     * Determines the part of speech tag of a term using simple rules if
+     * possible
+     * @param string $term to see if can get a part of speech for via a rule
+     * @return string part of speech tag or $term if can't be determined
      */
-    public static function destoryStochasticTermSegmenter()
+    public static function getPosKey($term)
     {
-        self::$stochasticTermSegmenter = null;
-    }
-    /*
-     * Get the segmenter instance
-     */
-    public static function getStochasticTermSegmenter() {
-        if (!self::$stochasticTermSegmenter) {
-            self::createStochasticTermSegmenter();
-        }
-        return self::$stochasticTermSegmenter;
-    }
-    public static function POSGetKey($term) {
         if (self::isPunctuation($term)) {
             return 'PU';
         } else if (self::isCardinalNumber($term)) {
@@ -336,55 +333,47 @@ class Tokenizer
         } else if (self::isNotCurrentLang($term)) {
             return 'FW';
         }
-        return null;
+        return $term;
     }
-    /*
-     * Create named entity recognizer instance
+    /**
+     * Possible tags a term can have that can be determined by a simple rule
+     * @return array
      */
-    public static function createNER()
+    public static function getPosKeyList()
     {
-        self::$namedEntityRecognizer
-            = new L\ContextWeightedNamedEntityRecognizer("zh_CN");
+        return ['PU','CD','OD','NT','FW'];
     }
-    /*
-     * Destory named entity recognizer instance
+    /**
+     * Return list of possible tags that an unknown term can have
+     * @return array
      */
-    public static function destoryNER()
+    public static function getPosUnknownTagsList()
     {
-        self::$namedEntityRecognizer = null;
+        return ["NN","NR","VV","VA"];
     }
-    /*
-     * Get the named entity recognizer instance
-     */
-    public static function getNER() {
-        if (!self::$namedEntityRecognizer) {
-            self::createNER();
-        }
-        return self::$namedEntityRecognizer;
-    }
-    /*
-     * Create POSTagger instance
+    /**
+     * Get the named entity tagger instance
+     * @return NamedEntityContextTagger for Chinese
      */
-    public static function createPosTagger()
+    public static function getNamedEntityTagger()
     {
-        self::$posTagger
-            = new L\ContextWeightedPosTagger("zh_CN");
+        if (!self::$named_entity_tagger) {
+            self::$named_entity_tagger
+                = new L\NamedEntityContextTagger("zh-CN");
+        }
+        return self::$named_entity_tagger;
     }
-    /*
-     * Destory POSTagger instance
+    /**
+     * Get the part of speech tagger instance
+     * @return PartOfSpeechContextTagger for Chinese
      */
-    public static function destoryPosTagger()
+    public static function getPosTagger()
     {
-        self::$posTagger = null;
-    }
-    /*
-     * Get PosTagger instance
-     */
-    public static function getPosTagger() {
-        if (!self::$posTagger) {
-            self::createPosTagger();
+        if (!self::$pos_tagger) {
+            self::$pos_tagger
+                = new L\PartOfSpeechContextTagger("zh-CN");
         }
-        return self::$posTagger;
+        return self::$pos_tagger;
     }
     /**
      * Scans a word list for phrases. For phrases found generate
@@ -405,7 +394,8 @@ class Tokenizer
         $triplet_types = ['CONCISE', 'RAW'];
         foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
             // strip parentheticals
-            $word_and_phrase = preg_replace("/[\{\[\(【(][^\}\])】\)]+[\}\]\))】]/u",
+            $word_and_phrase = preg_replace(
+                "/[\{\[\(【(][^\}\])】\)]+[\}\]\))】]/u",
                 "", $word_and_phrase);
             $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase);
             $parse_tree = ['cur_node' => 0];
@@ -418,7 +408,8 @@ class Tokenizer
                 if (isset($parse_tree['NP'])) {
                     $pre_sub = $parse_tree['NP'];
                 }
-                $extracted_triplets_set[] = self::rearrangeTripletsByType($triplets);
+                $extracted_triplets_set[] = self::rearrangeTripletsByType(
+                    $triplets);
                 // next partial sentence
                 while($parse_tree['cur_node'] < count($tagged_phrase)
                     && $tagged_phrase[$parse_tree['cur_node']]["tag"] != "PU") {
@@ -434,7 +425,8 @@ class Tokenizer
                         foreach ($questions as $question) {
                             $question_list[$question] = $position_list;
                         }
-                        $question_answer_list = array_merge($question_answer_list,
+                        $question_answer_list = array_merge(
+                            $question_answer_list,
                             $triplets['QUESTION_ANSWER_LIST']);
                     }
                 }
@@ -458,10 +450,10 @@ class Tokenizer
         $segmented = self::getStochasticTermSegmenter()->segmentSentence($text);
         $tags = self::getPosTagger()->predict($segmented);
         $result=[];
-        for($i=0; $i<count($segmented); $i++) {
-            $result[$i]=[];
-            $result[$i]["token"]=$segmented[$i];
-            $result[$i]["tag"]=$tags[$i];
+        for($i = 0; $i < count($segmented); $i++) {
+            $result[$i] = [];
+            $result[$i]["token"] = $segmented[$i];
+            $result[$i]["tag"] = $tags[$i];
         }
         return $result;
     }
@@ -470,7 +462,7 @@ class Tokenizer
      * sentence, create a phrase string for each of the next nodes
      * which belong to part of speech group $type.
      *
-     * @param array& $cur_node node within parse tree
+     * @param array &$cur_node node within parse tree
      * @param array $tagged_phrase parse tree for phrase
      * @param string $type self::$noun_type, self::$verb_type, etc
      * @return string phrase string involving only terms of that $type
@@ -643,23 +635,25 @@ class Tokenizer
         $index = 1)
     {
         $cur_node = $tree['cur_node'];
-        // There are two forms of prepostion.
-        // First one has lc only
-        // 之前(lc) 他在看书
+        /* There are two forms of preposition.
+           The first one has lc only
+           之前(lc) 他在看书 */
         if (isset($tagged_phrase[$cur_node]['tag']) &&
             trim($tagged_phrase[$cur_node]['tag']) == "LC") {
             $tree["LC"] = $tagged_phrase[$cur_node]['token'];
             $tree['cur_node']+=1;
             return $tree;
         }
-        // Second form:
-        // format: prep [anything] [locolizer|punctuation]
-        // 在(p)今天早上,(pu) 他 在(p) 车 里(lc) 睡觉。
-        // In the morning today, he was sleeping in the car.
+        /* Second form:
+           format: prep [anything] [localizer|punctuation]
+           在(p)今天早上,(pu) 他 在(p) 车 里(lc) 睡觉。
+           In the morning today, he was sleeping in the car.
+         */
         if (isset($tagged_phrase[$cur_node]['tag']) &&
             trim($tagged_phrase[$cur_node]['tag']) == "P") {
             /* can have multiple prep's in a row, for example,
-               it is known in over 20 countries*/
+               it is known in over 20 countries
+              */
             $preposition_string = self::parseTypeList($cur_node, $tagged_phrase,
                 ["P"]);
             if (!empty($preposition_string)) {
@@ -667,7 +661,7 @@ class Tokenizer
             }
             while(isset($tagged_phrase[$cur_node]) &&
                 isset($tagged_phrase[$cur_node]['tag']) &&
-                !in_array($tagged_phrase[$cur_node]['tag'],["PU","LC"])) {
+                !in_array($tagged_phrase[$cur_node]['tag'],["PU", "LC"])) {
                 $tree["P"] .= $tagged_phrase[$cur_node]['token'];
                 $cur_node++;
             }
@@ -796,34 +790,34 @@ class Tokenizer
      * Given a part-of-speeech tagged phrase array generates a parse tree
      * for the phrase using a recursive descent parser.
      *
-     * @param array $tagged_phrase
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
+     * @param array $tagged_phrase an array of pairs of the form
+     *  ("token" => token_for_term, "tag"=> part_of_speech_tag_for_term)
      * @param $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase]
+     *  current parse position in $tagged_phrase]
      * @param $tree_np_pre subject found from previous sub-sentence
      * @return array used to represent a tree. The array has up to three fields
-     *      $tree["cur_node"] index of how far we parsed our$tagged_phrase
-     *      $tree["NP"] contains a subtree for a noun phrase
-     *      $tree["VP"] contains a subtree for a verb phrase
+     *  $tree["cur_node"] index of how far we parsed our$tagged_phrase
+     *  $tree["NP"] contains a subtree for a noun phrase
+     *  $tree["VP"] contains a subtree for a verb phrase
      */
-    public static function parseWholePhrase($tagged_phrase, $tree, $tree_np_pre=[])
+    public static function parseWholePhrase($tagged_phrase, $tree,
+        $tree_np_pre = [])
     {
         //remove heading adverbs
         $cur_node = $tree['cur_node'];
         do {
-            $start_node=$cur_node;
+            $start_node = $cur_node;
             self::parseTypeList($cur_node, $tagged_phrase,
                 self::$adverb_type);
             self::parseTypeList($cur_node, $tagged_phrase,
                 self::$particle_type);
-        } while($start_node!=$cur_node);
+        } while ($start_node != $cur_node);
         $tree_np = self::parseNounPhrase($tagged_phrase,
             ["cur_node" => $cur_node]);
         if ($tree_np['cur_node'] == $cur_node) {
             if (!empty($tree_np_pre)) {
-                $tree_np['NP']=$tree_np_pre;
-                $tree_np["cur_node"]=$cur_node;
+                $tree_np['NP'] = $tree_np_pre;
+                $tree_np["cur_node"] = $cur_node;
             } else {
                 return $tree;
             }
@@ -1054,7 +1048,7 @@ class Tokenizer
         $keywords = self::isQuestion($question);
         if ($keywords) {
             $generated_questions=self::parseQuestion(
-                        $tagged_question, 1, $keywords);
+                $tagged_question, 1, $keywords);
         }
         return $generated_questions;
     }
@@ -1070,7 +1064,7 @@ class Tokenizer
         $terms=self::getStochasticTermSegmenter()->segmentSentence($phrase);
         $qt=self::questionType($terms, self::$question_words);
         if (in_array($qt["types"], ["who","what","which","where","when",
-            "whose","how", "how many", "how long", "how big"])) {
+            "whose", "how", "how many", "how long", "how big"])) {
             return $qt["ques_words"];
         }
         return false;
@@ -1084,24 +1078,27 @@ class Tokenizer
      * @param string $question_word is the question word need to be replaced
      * @return array parsed triplet
      */
-    public static function parseQuestion($tagged_question, $index, $question_word)
+    public static function parseQuestion($tagged_question, $index,
+        $question_word)
     {
         $generated_questions = [];
         $tree = ["cur_node" => 0];
-        $parse_tree = self::parseWholePhrase($tagged_question,
-                $tree);
+        $parse_tree = self::parseWholePhrase($tagged_question, $tree);
         $triplets = self::extractTripletsParseTree($parse_tree);
         $triplet_types = ['CONCISE', 'RAW'];
         foreach ($triplet_types as $type) {
             if (!empty($triplets['subject'][$type])
                 && !empty($triplets['predicate'][$type])
                 && !empty($triplets['object'][$type])) {
-                $sub=trim($triplets['subject'][$type]);
-                $sub=preg_replace("/^.*".$question_word.".*$/",self::$question_token, $sub);
-                $pre=trim($triplets['predicate'][$type]);
-                $pre=preg_replace("/^.*".$question_word.".*$/",self::$question_token, $pre);
-                $obj=trim($triplets['object'][$type]);
-                $obj=preg_replace("/^.*".$question_word.".*$/",self::$question_token, $obj);
+                $sub = trim($triplets['subject'][$type]);
+                $sub = preg_replace("/^.*".$question_word.".*$/u",
+                    self::$question_token, $sub);
+                $pre = trim($triplets['predicate'][$type]);
+                $pre = preg_replace("/^.*".$question_word.".*$/u",
+                    self::$question_token, $pre);
+                $obj = trim($triplets['object'][$type]);
+                $obj = preg_replace("/^.*".$question_word.".*$/u",
+                    self::$question_token, $obj);
                 $generated_questions[$type][] = $obj . " " . $pre . " " . $sub;
                 $generated_questions[$type][] = $sub . " " . $pre . " " . $obj;
             }
@@ -1119,52 +1116,52 @@ class Tokenizer
         if (!isset($type_list["any"])) {
             return ["ques_words"=>"","types"=>""];
         }
-        $types="";
-        $ques_words="";
-        for($i=0; $i < count($term_array); $i++ ) {
+        $types = "";
+        $ques_words = "";
+        for($i = 0; $i < count($term_array); $i++ ) {
             foreach($type_list["any"] as $key => $value) {
                 if (preg_match('/^('.$key.')$/u',$term_array[$i])) {
                     if (is_array($value)) {
                         if(isset($value["after"])) {
-                            $found_after=false;
+                            $found_after = false;
                             if (array_key_exists($i+1,$term_array)) {
                                 foreach($value["after"] as $key2 => $value2) {
-                                    if (preg_match('/^('.$key2.')$/u',
-                                        $term_array[$i+1])) {
-                                        $ques_words=$term_array[$i].
-                                            " ".$term_array[$i+1];
-                                        $types=$value2;
-                                        $found_after=true;
+                                    if (preg_match('/^(' . $key2 . ')$/u',
+                                        $term_array[$i + 1])) {
+                                        $ques_words = $term_array[$i].
+                                            " " . $term_array[$i + 1];
+                                        $types = $value2;
+                                        $found_after = true;
                                         break;
                                     }
                                 }
                             }
-                            if (!$found_after && isset($type_list["other"])
-                                    && $value["other"]) {
-                                $ques_words=$term_array[$i];
-                                $types=$value["other"];
+                            if (!$found_after && isset($type_list["other"]) &&
+                                $value["other"]) {
+                                $ques_words = $term_array[$i];
+                                $types = $value["other"];
                             }
-                        } elseif(isset($value["any"])) {
-                            $t=self::questionType($term_array,$value);
-                            $ques_words[]=$term_array[$i];
-                            $types=$t["types"];
+                        } elseif (isset($value["any"])) {
+                            $t = self::questionType($term_array,$value);
+                            $ques_words[] = $term_array[$i];
+                            $types = $t["types"];
                         }
                     } elseif ($value) {
-                        $ques_words=$term_array[$i];
-                        $types=$value;
+                        $ques_words = $term_array[$i];
+                        $types = $value;
                     }
                 }
             }
         }
-        if ($types == "" && isset($type_list["other"])){
+        if ($types == "" && isset($type_list["other"])) {
             if (is_array($type_list["other"])) {
-                $t=self::questionType($term_array, $type_list["other"]);
-                $ques_words=$t["ques_words"];
-                $types=$t["types"];
-            } elseif ( $type_list["other"]) {
-                $types=$type_list["other"];
+                $t = self::questionType($term_array, $type_list["other"]);
+                $ques_words = $t["ques_words"];
+                $types = $t["types"];
+            } elseif ($type_list["other"]) {
+                $types = $type_list["other"];
             }
         }
-        return ["ques_words"=>$ques_words,"types"=>$types];
+        return ["ques_words" => $ques_words, "types" => $types];
     }
 }
diff --git a/src/locale/zh_CN/resources/ner_weight.txt.gz b/src/locale/zh_CN/resources/nect_weights.txt.gz
similarity index 100%
rename from src/locale/zh_CN/resources/ner_weight.txt.gz
rename to src/locale/zh_CN/resources/nect_weights.txt.gz
diff --git a/src/locale/zh_CN/resources/pos_weight.txt.gz b/src/locale/zh_CN/resources/pos_weights.txt.gz
similarity index 100%
rename from src/locale/zh_CN/resources/pos_weight.txt.gz
rename to src/locale/zh_CN/resources/pos_weights.txt.gz
diff --git a/src/locale/zh_CN/resources/term_weight.txt.gz b/src/locale/zh_CN/resources/term_weights.txt.gz
similarity index 100%
rename from src/locale/zh_CN/resources/term_weight.txt.gz
rename to src/locale/zh_CN/resources/term_weights.txt.gz
diff --git a/src/models/GroupModel.php b/src/models/GroupModel.php
index fe9abf374..2b9d6992f 100644
--- a/src/models/GroupModel.php
+++ b/src/models/GroupModel.php
@@ -40,8 +40,8 @@ use seekquarry\yioop\library\processors\ImageProcessor;
 use seekquarry\yioop\models\ImpressionModel;

 /**
- * This is class is used to handle
- * db results related to Group Administration. Groups are collections of
+ * This class is used to handle db results related to Group Administration.
+ * Groups are collections of
  * users who might access a common blog/news feed and set of pages. This
  * method also controls adding and deleting entries to a group feed and
  * does limited access control checks of these operations.
@@ -777,10 +777,10 @@ class GroupModel extends Model implements MediaConstants
      * @param int $type flag saying what kind of group item this is. One of
      *      STANDARD_GROUP_ITEM, WIKI_GROUP_ITEM (used for threads discussing
      *      a wiki page)
-     * @param string $url a url associated with this group item (mainly for
-     *      search group)
      * @param int $post_time timstamp for when this group item was created
      *      default to the current time
+     * @param string $url a url associated with this group item (mainly for
+     *      search group)
      * @return int $id of item added
      */
     public function addGroupItem($parent_id, $group_id, $user_id, $title,
@@ -1107,7 +1107,9 @@ class GroupModel extends Model implements MediaConstants
         return $row['NUM'] ?? false;
     }
     /**
-     *
+     * Returns the most recent post posted to a group
+     * @param int $group_id id of the group to get the most recent post for
+     * @return array associative array of post details
      */
     public function getMostRecentGroupPost($group_id)
     {
@@ -1141,7 +1143,9 @@ class GroupModel extends Model implements MediaConstants
         return $db->fetchArray($result);
     }
     /**
-     *
+     * Returns the number of distinct threads in a group's feed
+     * @param int $group_id id of the group to get thread count for
+     * @return int number of threads
      */
     public function getGroupThreadCount($group_id)
     {
@@ -1157,7 +1161,9 @@ class GroupModel extends Model implements MediaConstants
         return $row['NUM'] ?? 0;
     }
     /**
-     *
+     * Returns the number of posts to a group
+     * @param int $group_id id of the group to get post count for
+     * @return int number of posts
      */
     public function getGroupPostCount($group_id)
     {
diff --git a/src/models/LocaleModel.php b/src/models/LocaleModel.php
index e2c31491c..db54aa209 100755
--- a/src/models/LocaleModel.php
+++ b/src/models/LocaleModel.php
@@ -138,7 +138,7 @@ class LocaleModel extends Model
      * @param string $locale one getRows row corresponding to a given locale
      * @param mixed $args additional arguments that might be used for this
      *     method (none used for this sub-class)
-     * @return $locale row with PERCENT_WITH_STRINGS field added
+     * @return array $locale row with PERCENT_WITH_STRINGS field added
      */
     public function rowCallback($locale, $args)
     {
diff --git a/src/models/Model.php b/src/models/Model.php
index 5754563c1..8d886bfc2 100755
--- a/src/models/Model.php
+++ b/src/models/Model.php
@@ -594,7 +594,7 @@ class Model implements CrawlConstants
      *
      * @param int $limit starting row from the potential results to return
      * @param int $num number of rows after start row to return
-     * @param int& $total gets set with the total number of rows that
+     * @param int &$total gets set with the total number of rows that
      *     can be returned by the given database query
      * @param array $search_array each element of this is a
      *     quadruple name of a field, what comparison to perform, a value to
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 8f5afa79d..a7b6c44e9 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -529,7 +529,7 @@ class PhraseModel extends ParallelModel
      * phrases, the weight that should be put on these query results, and
      * which archive to use.
      *
-     * @param string& $phrase string to extract struct from, if the phrase
+     * @param string &$phrase string to extract struct from, if the phrase
      *  semantics is guessed or an if condition is processed the value of
      *  phrase will be altered. (Helps for feeding to network queries)
      * @return array struct representing the conjunctive query
@@ -1446,8 +1446,8 @@ class PhraseModel extends ParallelModel
      * are HTTP Location redirect page's then looks these up in turn.
      * This method handles robot meta tags which might forbid indexing.
      *
-     * @param array& $pages of page data without text summaries
-     * @param array& $queue_servers array of queue server to find data on
+     * @param array &$pages of page data without text summaries
+     * @param array &$queue_servers array of queue server to find data on
      * @param int $raw only lookup locations if 0
      * @param bool $groups_with_docs whether to return only groups that
      *     contain at least one doc as opposed to a groups with only links
@@ -1660,7 +1660,7 @@ class PhraseModel extends ParallelModel
      *     when making iterator get sub-iterators to advance to gen doc_offset
      *     stored with respect to save_timestamp if exists.
      *
-     * @return &object an iterator for iterating through results to the
+     * @return object an iterator for iterating through results to the
      * query
      */
     public function getQueryIterator($word_structs, $filter, $raw,
diff --git a/src/models/SearchverticalsModel.php b/src/models/SearchverticalsModel.php
index 413c14533..4ec2bb695 100644
--- a/src/models/SearchverticalsModel.php
+++ b/src/models/SearchverticalsModel.php
@@ -143,14 +143,14 @@ class SearchverticalsModel extends GroupModel
      * the ordered pair is used later for the PARENT_ID and USER_ID in the
      * (both of which have indexes) Search group look-up.
      *
-     *  @param string an input to be hash to a pair of integers
-     *  @param bool $compute_hash flag to chheck if a crawlHash is done
-     *      before converting the result to an ordered pair. In some situations
-     *      the url or host of the url has already been hashed so don't want
-     *      to hash it again.
-     *  @return array [int, int] that corresponds to the hash of the input
-     *      to keep postgres happy (no unsigned ints) we make the value
-     *      of this function signed
+     * @param string $input to be hashed to a pair of integers
+     * @param bool $compute_hash flag to check if a crawlHash is done
+     *  before converting the result to an ordered pair. In some situations
+     *  the url or host of the url has already been hashed so don't want
+     *  to hash it again.
+     * @return array [int, int] that corresponds to the hash of the input
+     *  to keep postgres happy (no unsigned ints) we make the value
+     *  of this function signed
      */
     public function hashIntPair($input, $compute_hash = true)
     {
@@ -164,7 +164,14 @@ class SearchverticalsModel extends GroupModel
         return [$front, $back];
     }
     /**
+     * Given a $query and a $locale_tag returns an ordered set
+     * of urls to put at the top of the search results for that query
+     * if such a map has been defined.
      *
+     * @param string $query user supplied query
+     * @param string $locale_tag language that the lookup of urls should
+     *  be done for
+     * @return array of urls that correspond to the query
      */
     public function getQueryMap($query, $locale_tag)
     {
@@ -187,7 +194,13 @@ class SearchverticalsModel extends GroupModel
         return $map_urls;
     }
     /**
-     *
+     * Stores a query map into the public database.
+     * A query map associates a $query in a $locale_tag language to a set of
+     * urls desired to be at the top of the search results.
+     * @param string $query that triggers the mapping
+     * @param array $map_urls urls that should appear at the top of the search
+     *      results
+     * @param string $locale_tag for the language the map should apply to
      */
     public function setQueryMap($query, $map_urls, $locale_tag)
     {
@@ -211,13 +224,13 @@ class SearchverticalsModel extends GroupModel
      * This is used in wiki read mode for search result verticals or edit mode
      * (the wiki info is not pre-parsed) for editing the knowledge wiki page.
      *
-     *  @param string $query to get knowledge wiki results for
-     *  @param string $locale the locale tag language that one want the results
-     *      for
-     *  @param bool $edit_mode whether the wiki page should be pre-parsed
-     *      (suitable for display in query results) or left unparsed (suitable
-     *      for editing).
-     *  @return array knowledge wiki page info
+     * @param string $query to get knowledge wiki results for
+     * @param string $locale_tag the locale tag language that one wants the
+     *  results for
+     * @param bool $edit_mode whether the wiki page should be pre-parsed
+     *  (suitable for display in query results) or left unparsed (suitable
+     *  for editing).
+     * @return array knowledge wiki page info
      */
     public function getKnowledgeWiki($query, $locale_tag, $edit_mode = false)
     {
diff --git a/src/models/SourceModel.php b/src/models/SourceModel.php
index 2d8e38435..e29c3b90b 100644
--- a/src/models/SourceModel.php
+++ b/src/models/SourceModel.php
@@ -384,7 +384,12 @@ class SourceModel extends ParallelModel
         $db->execute($sql, [$locale_string]);
     }
     /**
+     * Used to delete any feed data  (IndexDataFeed bundle) and trending
+     * data in this Yioop installation.
      *
+     * @param array $machine_urls a list of machines which are running
+     * MediaUpdaters for this instance of Yioop. If empty, assume it is just
+     * the Name Server
      */
     public function clearFeedData($machine_urls = null)
     {
diff --git a/src/models/TrendingModel.php b/src/models/TrendingModel.php
index 64e5cfe08..9084f7425 100644
--- a/src/models/TrendingModel.php
+++ b/src/models/TrendingModel.php
@@ -120,6 +120,8 @@ class TrendingModel extends Model implements MediaConstants
      * to category supplied.
      *
      * @param string $locale_tag language to get random trending terms for
+     * @param string $category category to compute trending terms for
+     * @param int $num_terms number of trending terms to return
      * @return array terms which are trending
      */
     public function randomTrends($locale_tag, $category = 'news', $num_terms =
diff --git a/src/views/AdminView.php b/src/views/AdminView.php
index 21500e37f..c4d53dd01 100755
--- a/src/views/AdminView.php
+++ b/src/views/AdminView.php
@@ -31,6 +31,7 @@
 namespace seekquarry\yioop\views;

 /**
+ * View used to draw activity list and current activity for a logged in user
  *
  * @author Chris Pollett
  */
diff --git a/src/views/ComponentView.php b/src/views/ComponentView.php
index b13f72dd8..185097677 100755
--- a/src/views/ComponentView.php
+++ b/src/views/ComponentView.php
@@ -37,7 +37,8 @@ use seekquarry\yioop\library\UrlParser;
 use seekquarry\yioop\views\elements\Element;

 /**
- *
+ * Base class for views created by adding elements to top, sub-top, same,
+ *  opposite, center columns, or bottom positions
  *
  * @author Chris Pollett
  */
@@ -53,7 +54,8 @@ class ComponentView extends View
      */
      private $containers = [];
     /**
-     *
+     * Method used to draw the components of this ComponentView
+     * @param array $data containing fields to render the elements on this view
      */
     public function renderView($data)
     {
diff --git a/src/views/CrawlstatusView.php b/src/views/CrawlstatusView.php
index 355329368..19bda48b0 100755
--- a/src/views/CrawlstatusView.php
+++ b/src/views/CrawlstatusView.php
@@ -32,7 +32,6 @@ namespace seekquarry\yioop\views;

 use seekquarry\yioop as B;
 use seekquarry\yioop\configs as C;
-
 /**
  * This view is used to display information about
  * crawls that have been made by this seek_quarry instance
@@ -47,7 +46,7 @@ class CrawlstatusView extends View
      * about the currently active crawl.The $data is supplied by the crawlStatus
      * method of the AdminController.
      *
-     * @param array $data   info about the current crawl status
+     * @param array $data info about the current crawl status
      */
     public function renderView($data)
     {
@@ -353,7 +352,9 @@ class CrawlstatusView extends View
         <?php
     }
     /**
-     *
+     * Draws the form used to start a new crawl
+     * @param array $data containing CSRF_TOKEN field and other field used
+     *  to draw this form
      */
     public function renderCrawlForm($data)
     {?>
diff --git a/src/views/elements/AdminElement.php b/src/views/elements/AdminElement.php
index 79132dd08..ab2b47848 100644
--- a/src/views/elements/AdminElement.php
+++ b/src/views/elements/AdminElement.php
@@ -37,6 +37,7 @@ use seekquarry\yioop\library\UrlParser;
 use seekquarry\yioop\views\elements\Element;

 /**
+ * Element used to render the admin interface for a logged in user of Yioop
  *
  * @author Chris Pollett
  */
diff --git a/src/views/elements/AdminbarElement.php b/src/views/elements/AdminbarElement.php
index 7e51841fd..3abb6d97e 100644
--- a/src/views/elements/AdminbarElement.php
+++ b/src/views/elements/AdminbarElement.php
@@ -43,7 +43,7 @@ class AdminbarElement extends Element
      * Used to draw the navigation bar on the admin portion
      * of the yioop website
      *
-     * @param array $data contains antiCSRF token, as well as data on
+     * @param array $data contains anti-CSRF token, as well as data on
      *     used to render what the current admin activity is
      */
     public function render($data)
@@ -82,6 +82,13 @@ class AdminbarElement extends Element
         </div>
         <?php
     }
+    /**
+     * Used to draw the hamburger menu symbol and associated link to the
+     * settings menu
+     *
+     * @param bool $logged_in whether or not the user is logged in. If so,
+     *  the hamburger menu symbol draws the user's name
+     */
     public function renderSettingsToggle($logged_in)
     { ?>
         <div class="settings" id="settings-toggle"
diff --git a/src/views/elements/BotstoryElement.php b/src/views/elements/BotstoryElement.php
index f428d7c17..2bc3e1389 100644
--- a/src/views/elements/BotstoryElement.php
+++ b/src/views/elements/BotstoryElement.php
@@ -139,7 +139,9 @@ class BotstoryElement extends Element
         <?php
     }
     /**
-     *
+     * Used to draw the form to add or update a bot story
+     * @param array $data containing field values that have already
+     *  been filled in and the anti-CSRF attack token
      */
      public function renderBotStoryForm($data)
      {
diff --git a/src/views/elements/GroupfeedElement.php b/src/views/elements/GroupfeedElement.php
index 263ad0501..9a4936670 100644
--- a/src/views/elements/GroupfeedElement.php
+++ b/src/views/elements/GroupfeedElement.php
@@ -134,7 +134,7 @@ class GroupfeedElement extends Element implements CrawlConstants
      *
      * @param string $paging_query stem for all links
      *      drawn in view
-     * @param array& $data fields used to draw the queue
+     * @param array &$data fields used to draw the queue
      */
     public function renderGroupedView($paging_query, &$data)
     {
@@ -174,7 +174,7 @@ class GroupfeedElement extends Element implements CrawlConstants
      * @param string $base_query url that serves as the stem for all links
      *      drawn in view
      * @param string $paging_query base_query concatenated with limit and num
-     * @param array& $data fields used to draw the queue
+     * @param array &$data fields used to draw the queue
      * @return array $page last feed item processed
      */
     public function renderUngroupedView($logged_in, $base_query, $paging_query,
@@ -455,9 +455,13 @@ class GroupfeedElement extends Element implements CrawlConstants
         return $page;
     }
     /**
+     * Used to slightly clean up hypertext links before drawing them
+     * (get rid of empty queries, avoid double encoding)
      *
+     * @param string $url to clean up
+     * @return string cleaned url
      */
-    public function formatHref($url)
+    private function formatHref($url)
     {
         return rtrim(html_entity_decode($url), '?');
     }
diff --git a/src/views/elements/ManageclassifiersElement.php b/src/views/elements/ManageclassifiersElement.php
index d36756d91..f588f4dc5 100644
--- a/src/views/elements/ManageclassifiersElement.php
+++ b/src/views/elements/ManageclassifiersElement.php
@@ -58,6 +58,7 @@ class ManageclassifiersElement extends Element
         <?php
     }
     /**
+     * Draws the table of currently defined classifiers for the Yioop system
      * @param array $data info about current users and current mixes, CSRF token
      */
     public function renderClassifiersTable($data)
diff --git a/src/views/elements/ManagecreditsElement.php b/src/views/elements/ManagecreditsElement.php
index b8c9e4b31..a8b535672 100644
--- a/src/views/elements/ManagecreditsElement.php
+++ b/src/views/elements/ManagecreditsElement.php
@@ -43,7 +43,8 @@ class ManagecreditsElement extends Element
 {
     /**
      * Draws create advertisement form and existing campaigns
-     * @param array $data
+     * @param array $data containing field values that have already been
+     *  filled in, data about existing campaigns and the anti-CSRF attack token
      */
     public function render($data)
     {
@@ -102,7 +103,10 @@ class ManagecreditsElement extends Element
         <?php
     }
     /**
-     *
+     * Draws the form used to create or edit a keyword
+     * advertisement
+     * @param array $data containing field values that have already
+     *  been filled in and the anti-CSRF attack token
      */
     public function renderCreditsForm($data)
     { ?>
diff --git a/src/views/elements/ManagerolesElement.php b/src/views/elements/ManagerolesElement.php
index bea8b7002..f09080a3d 100644
--- a/src/views/elements/ManagerolesElement.php
+++ b/src/views/elements/ManagerolesElement.php
@@ -49,13 +49,15 @@ class ManagerolesElement extends Element
      *     available roles or which activity has what role
      */
     public function render($data)
-    { ?>
+    {?>
         <div class="current-activity">
         <?= $this->renderRoleTable($data); ?>
         </div>
         <?php
     }
     /**
+     * Draws the table to display the currently available roles
+     * and their properties in this Yioop system
      * @param array $data info about current users and current roles, CSRF token
      */
     public function renderRoleTable($data)
diff --git a/src/views/elements/ManageusersElement.php b/src/views/elements/ManageusersElement.php
index 573809eef..eb867cba0 100644
--- a/src/views/elements/ManageusersElement.php
+++ b/src/views/elements/ManageusersElement.php
@@ -62,6 +62,8 @@ class ManageusersElement extends Element
         <?php
     }
     /**
+     * Draws the table that displays the users and their properties for
+     * the Yioop system
      * @param array $data info about current users and current roles, CSRF token
      */
     public function renderUserTable($data)
@@ -534,32 +536,31 @@ class ManageusersElement extends Element
                         ?>
                         <div class="center">
                         <?php
-                            $action_url = $base_url. "&amp;user_name=".
-                                $data['CURRENT_USER']['user_name'].
-                                "&amp;role_filter=".$data['ROLE_FILTER'].
-                                "&amp;group_filter=".$data['GROUP_FILTER'];
-                            if ($limit >= C\NUM_RESULTS_PER_PAGE ) {
-                                ?><a href='<?=
-                                "$action_url&amp;arg=edituser&amp;$context" .
-                                "&amp;group_limit=".
-                                ($limit - C\NUM_RESULTS_PER_PAGE) ?>'
-                                >&lt;&lt;</a><?php
-                            }
-                            ?>
+                        $action_url = $base_url. "&amp;user_name=".
+                            $data['CURRENT_USER']['user_name'].
+                            "&amp;role_filter=".$data['ROLE_FILTER'].
+                            "&amp;group_filter=".$data['GROUP_FILTER'];
+                        if ($limit >= C\NUM_RESULTS_PER_PAGE ) {
+                            ?><a href='<?=
+                            "$action_url&amp;arg=edituser&amp;$context" .
+                            "&amp;group_limit=".
+                            ($limit - C\NUM_RESULTS_PER_PAGE) ?>'
+                            >&lt;&lt;</a><?php
+                        } ?>
                         <input class="very-narrow-field center"
                             name="group_filter" type="text" maxlength="<?=
                             C\SHORT_TITLE_LEN ?>" value='<?=
                             $data['GROUP_FILTER'] ?>' />
                         <?php
-                            if ($data['NUM_USER_GROUPS'] > $limit +
-                                C\NUM_RESULTS_PER_PAGE) {
-                                ?><a href='<?=
-                                "$action_url&amp;arg=edituser&amp;$context" .
-                                "&amp;group_limit=".
-                                ($limit + C\NUM_RESULTS_PER_PAGE)
-                                ?>'>&gt;&gt;</a>
+                        if ($data['NUM_USER_GROUPS'] > $limit +
+                            C\NUM_RESULTS_PER_PAGE) {
+                            ?><a href='<?=
+                            "$action_url&amp;arg=edituser&amp;$context" .
+                            "&amp;group_limit=".
+                            ($limit + C\NUM_RESULTS_PER_PAGE)
+                            ?>'>&gt;&gt;</a>
                             <?php
-                            }
+                        }
                         ?><br />
                         <button type="submit" name="change_filter"
                             value="group"><?= tl('manageusers_element_filter')
diff --git a/src/views/elements/MediajobsElement.php b/src/views/elements/MediajobsElement.php
index 4dba0780f..2084ce8e7 100644
--- a/src/views/elements/MediajobsElement.php
+++ b/src/views/elements/MediajobsElement.php
@@ -34,14 +34,19 @@ use seekquarry\yioop as B;
 use seekquarry\yioop\configs as C;

 /**
+ * Element used to draw toggles indicating which jobs the Media Updater
+ * will run and letting the user turn these jobs on/off.
  *
  * @author Chris Pollett
  */
 class MediajobsElement extends Element
 {
     /**
+     * Draws interface to allow users to say which jobs will run in the
+     * MediaUpdater. Also used to draw the nameserver/distributed mode toggle
      *
-     * @param array $data holds data on
+     * @param array $data with field containing the nonstatic values needed
+     *  to draw this element
      */
     public function render($data)
     {
diff --git a/src/views/elements/MixcrawlsElement.php b/src/views/elements/MixcrawlsElement.php
index ede130eb2..d0ce407a3 100644
--- a/src/views/elements/MixcrawlsElement.php
+++ b/src/views/elements/MixcrawlsElement.php
@@ -56,6 +56,7 @@ class MixcrawlsElement extends Element
         <?php
     }
     /**
+     * Draw the table that displays the currently defined crawl mixes
      * @param array $data info about current users and current mixes, CSRF token
      */
     public function renderMixesTable($data)
diff --git a/src/views/elements/PaginationElement.php b/src/views/elements/PaginationElement.php
index 6332075c4..62c99e740 100644
--- a/src/views/elements/PaginationElement.php
+++ b/src/views/elements/PaginationElement.php
@@ -32,13 +32,19 @@ namespace seekquarry\yioop\views\elements;

 use seekquarry\yioop\configs as C;
 /**
- *
+ * Element responsible for drawing the sequence of available pages for
+ * search results.
  * @author Chris Pollett
  */
 class PaginationElement extends Element
 {
     /**
-     *
+     * Draws the sequence of available pages for
+     * search results. (next and prev links, and a group of pages)
+     * @param array $data containing fields with info about the total
+     *  number of search results for the query, the subsearch the query is
+     *  for, the desired number of results per page, which page we are on,
+     *  etc.
      */
     public function render($data)
     {
diff --git a/src/views/elements/SearchElement.php b/src/views/elements/SearchElement.php
index e9c541d49..d92c8dbe4 100644
--- a/src/views/elements/SearchElement.php
+++ b/src/views/elements/SearchElement.php
@@ -391,7 +391,9 @@ class SearchElement extends Element implements CrawlConstants
         <?php
     }
     /**
-     *
+     * Draws the landing page for this instance of Yioop when the default
+     * big search bar (rather than the Main public wiki page) is used
+     * @param array $data containing fields used to draw landing page
      */
     public function renderSearchLanding($data)
     {
diff --git a/src/views/elements/SearchsourcesElement.php b/src/views/elements/SearchsourcesElement.php
index 9c2310bb4..012c07e62 100644
--- a/src/views/elements/SearchsourcesElement.php
+++ b/src/views/elements/SearchsourcesElement.php
@@ -34,7 +34,7 @@ use seekquarry\yioop as B;
 use seekquarry\yioop\configs as C;

 /**
- * Contains the forms for managing search sources for news, etc.
+ * This element renders the forms for managing search sources for news, etc.
  * Also, contains form for managing subsearches which appear in SearchView
  *
  * @author Chris Pollett
diff --git a/src/views/elements/SideadvertisementElement.php b/src/views/elements/SideadvertisementElement.php
index bbf0150c7..1696393d0 100644
--- a/src/views/elements/SideadvertisementElement.php
+++ b/src/views/elements/SideadvertisementElement.php
@@ -32,13 +32,18 @@ namespace seekquarry\yioop\views\elements;

 use seekquarry\yioop\configs as C;
 /**
+ * Element used to draw an external server advertisement (if there is one) as
+ * a column on the opposite side of a search results page
  *
  * @author Chris Pollett
  */
 class SideadvertisementElement extends Element
 {
     /**
-     *
+     * Draws an external server advertisement (if there is one) as a column
+     * on the opposite side of a search results page
+     * @param array $data with a field SIDE_ADSCRIPT that should contain
+     *  the advertisement text
      */
     public function render($data)
     {
diff --git a/src/views/elements/TopadvertisementElement.php b/src/views/elements/TopadvertisementElement.php
index 47ce1d2a0..06b240469 100644
--- a/src/views/elements/TopadvertisementElement.php
+++ b/src/views/elements/TopadvertisementElement.php
@@ -31,14 +31,20 @@
 namespace seekquarry\yioop\views\elements;

 use seekquarry\yioop\configs as C;
+
 /**
+ * This element is used to draw the keyword advertisement above search
+ * results (if present)
  *
  * @author Chris Pollett
  */
 class TopadvertisementElement extends Element
 {
     /**
-     *
+     * Draws a keyword advertisement (if there is one) at the top
+     * of a search results page
+     * @param array $data with a field TOP_ADSCRIPT that should contain
+     *  the advertisement text
      */
     public function render($data)
     {
diff --git a/src/views/helpers/FeedsHelper.php b/src/views/helpers/FeedsHelper.php
index 701393723..5d5d2342e 100644
--- a/src/views/helpers/FeedsHelper.php
+++ b/src/views/helpers/FeedsHelper.php
@@ -42,7 +42,6 @@ use seekquarry\yioop\library\UrlParser;
  */
 class FeedsHelper extends Helper implements CrawlConstants
 {
-
     /**
      * Takes page summaries for RSS pages and the current query
      * and draws list of news links and a link to the news link subsearch
@@ -51,6 +50,8 @@ class FeedsHelper extends Helper implements CrawlConstants
      * @param array $feed_pages page data from news feeds
      * @param string  $csrf_token token to prevent cross site request forgeries
      * @param string $query the current search query
+     * @param string $subsearch the name of the subsearch of this feed
+     *  For example, one could have sports feed, a news feed, etc
      * @param boolean $open_in_tabs whether new links should be opened in
      *    tabs
      */
diff --git a/tests/PdfProcessorTest.php b/tests/PdfProcessorTest.php
index fe25bd561..16e255cda 100644
--- a/tests/PdfProcessorTest.php
+++ b/tests/PdfProcessorTest.php
@@ -80,7 +80,7 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants
             "Word Extraction 3");
     }
     /**
-     *
+     * Tests Tesseract text extraction from Images
      */
     public function textFromImageTestCase()
     {
ViewGit