diff --git a/src/configs/CreditConfig.php b/src/configs/CreditConfig.php index ab7a2f623..d5ac15ec1 100644 --- a/src/configs/CreditConfig.php +++ b/src/configs/CreditConfig.php @@ -59,11 +59,11 @@ class CreditConfig * @param float $amount dollar amount to charge the card * @param string $token token issued for transaction from the card * processing agency - * @param string& $message message to use as for reason for charge + * @param string &$message message to use as for reason for charge * @return bool whether or not the charge was successful */ public static function charge($amount, $token, &$message) { return true; } -} \ No newline at end of file +} diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php index a9266b22b..2bf749e7d 100644 --- a/src/configs/TokenTool.php +++ b/src/configs/TokenTool.php @@ -27,7 +27,6 @@ * * A description of its usage is given in the $usage global variable * - * * @author Ravi Dhillon ravi.dhillon@yahoo.com, Chris Pollett (modified for n * ngrams) * @license https://www.gnu.org/licenses/ GPL3 @@ -148,8 +147,8 @@ where from Wikipedia source to extract: A knowledge wiki entry is a search wiki page which is displayed on a given query usually in a callout box. TokenTool.php can be used to create such entries based on the first paragraph of a Wikipedia page which matches the -query. At the same time TokenTool.php is doing htis it can also use -thhe infoboxes on wiki pages to generate a initial list of potential seed +query. At the same time TokenTool.php is doing this it can also use +the infoboxes on wiki pages to generate a initial list of potential seed sites for a web crawl. The syntax to create knowledge wiki seed sites is: php TokenTool.php kwiki-seeds locale page_count_file wiki_locale_dump \ @@ -270,6 +269,27 @@ switch ($argv[1]) { makeKwikiEntriesGetSeedSites($argv[2], $argv[3], $argv[4], $argv[5], $argv[6]); break; + case "pos-tagger": + $file_path = PREP_DIR . 
"/"; + if (!isset($argv[3])) { + echo $usage; + } + $texts = []; + for($i = 4; $i < count($argv); $i++) { + $files = glob($file_path . $argv[$i]); + if (count($files) == 0) { + echo "error: {$file_path}{$argv[i]}: File not found\n"; + exit(); + } + $texts = array_merge($texts, $files); + } + $pos_tagger = new ContextWeightedPosTagger($argv[2]); + if ($pos_tagger->train($texts, "/")) { + echo "Success\n"; + } else { + echo "Failed\n"; + } + break; case "segment-filter": $file_path = PREP_DIR . "/"; if (!file_exists($file_path . $argv[3])) { @@ -283,14 +303,14 @@ switch ($argv[1]) { if (!isset($argv[3])) { echo $usage; } - $texts=[]; + $texts = []; for($i = 4; $i < count($argv); $i++) { $files = glob($file_path . $argv[$i]); if (count($files) == 0) { - echo "error: {$file_path}{$argv[i]}: File not found\n"; + echo "error: {$file_path}{$argv[$i]}: File not found\n"; exit(); } - $texts = array_merge ($texts, $files); + $texts = array_merge($texts, $files); } $segmenter = new StochasticTermSegmenter($argv[2]); if ($segmenter->train($texts, $argv[3])) { @@ -315,7 +335,22 @@ if (!PROFILE) { exit(); } /** + * Generates knowledge wiki callouts for search results pages based + * on the first paragraph of a Wikipedia Page that matches a give qeury. + * Also generates an initial list of potential seed sites for a crawl + * based off urls scraped from the wiki pages. * + * @param string $locale_tag the IANA language tag of the locale to + * create knowledge wiki entries and seed sites for + * @param string $page_count_file the file name of a a wiki page count dump + * file (or folder of such files). Such a file contains the names of wiki + * pages and how many times they were accessed + * @param string $wiki_dump_file a dump of wikipedia pages and meta pages + * @param int $max_entries maximum number of kwiki entries to create. 
+ * Will pick the one with the highest counts in $page_count_file + * @param int $max_seed_sites maximum number of seed sites to add + * to Yioop's set of seed sites. Again chooses those with highest + * page count score */ function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file, $wiki_dump_file, $max_entries, $max_seed_sites) @@ -423,10 +458,10 @@ function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file, continue; } $text = str_replace($infobox_offset[0], "\n", $text); - $text = removeTags($text, "{", "}", ""); - $text = removeTags($text, "<!--", "-->", ""); + $text = removeTags($text, "{", "}"); + $text = removeTags($text, "<!--", "-->"); $text = preg_replace('/\<\;ref[^\>]+\/\>/u', " ", $text); - $text = removeTags($text, "<ref", "/ref>", ""); + $text = removeTags($text, "<ref", "/ref>"); $text = preg_replace('/\[\[[^\[\]]+\|([^\[\|]+)\]\]/u', "$1", $text); $text = preg_replace('/\[\[(File|Image)\:(.+)\]\]/u', "", $text); @@ -512,7 +547,12 @@ function makeKwikiEntriesGetSeedSites($locale_tag, $page_count_file, $crawl_model->setSeedInfo($seed_info); } /** - * + * Gets the next wiki page from a file handle pointing to the wiki dump file + * @param resource $fr file handle (might be a compressed file handle, + * for example, corresponding to gzopen or bzopen) + * @param callable $read a function for reading from the given file handle + * @param int $block_size size of blocks to use when reading + * @param string &$input_buffer used to buffer data from the wiki dump file */ function getNextPage($fr, $read, $block_size, &$input_buffer) { @@ -532,9 +572,13 @@ function getNextPage($fr, $read, $block_size, &$input_buffer) return $page; } /** - * + * Remove all occurrences of an open close tag pair from $text + * @param string $text to remove tag pair from + * @param string $open string pattern for open tag + * @param string $close string pattern for close tag + * @return string text after tag removed */ -function removeTags($text, $open, $close, 
$tag) +function removeTags($text, $open, $close) { $old_text = ""; while ($text != $old_text) { @@ -550,7 +594,20 @@ function removeTags($text, $open, $close, $tag) return $text; } /** + * Get a substring offset pair matching the input open close brace tag pattern * + * @param string $page source text to search for the tag in + * For example, lala {{infobox {{blah yoyoy}} }} dada. + * @param string $brace_open character sequence starting the tag region. For + * example {{ + * @param string $brace_close character sequence ending the tag region. For + * example }} + * @param string $tag tag that might be associated with the opening of + * the sequence. For example infobox. + * @param int $offset offset to start searching from + * @return array ordered pair [substring containing the brace tag, offset after + * the tag]. If had "lala {{infobox {{blah yoyoy}} }} dada" as input and + * searched on {{, }}, infobox, 0 would get ["{{infobox {{blah yoyoy}}", 31] */ function getBraceTag($page, $brace_open, $brace_close, $tag, $offset = 0) { @@ -586,7 +643,12 @@ function getBraceTag($page, $brace_open, $brace_close, $tag, $offset = 0) return [$outer_contents, $end_pos]; } /** - * + * Get the outer contents of an xml open/close tag pair from + * a text source together with a new offset location after + * @param string $page text source to search the tag pair in + * @param string $tag the xml tag to look for + * @param int $offset offset to start searching after for the open/close pair + * @return array ordered pair [outer contents, new offset] */ function getTagOffsetPage($page, $tag, $offset = 0) { @@ -604,7 +666,16 @@ function getTagOffsetPage($page, $tag, $offset = 0) return [$outer_contents, $end_pos]; } /** + * Returns title and page counts of the top $max_pages many entries + * in a $page_count_file for a locale $locale_tag * + * @param string $page_count_file page count file to use to search for title + * counts with respect to a locale + * @param string $locale_tag 
locale to get top pages for + * @param int $max_pages number of pages + * @param array $title_counts title counts that might have come from analyzing + * a previous file. These will be in the output and contribute to $max_pages + * @return array $title_counts wiki page titles => num_views associative array */ function getTopPages($page_count_file, $locale_tag, $max_pages, $title_counts = []) diff --git a/src/controllers/AdminController.php b/src/controllers/AdminController.php index 45c348f8c..003dcfb64 100755 --- a/src/controllers/AdminController.php +++ b/src/controllers/AdminController.php @@ -351,8 +351,14 @@ class AdminController extends Controller implements CrawlConstants return $data; } /** - * @param array $user_activities - * @return array + * For a given user's access and the list of components and activities + * return a list of translated names of components associated to a + * list of user accessible activities for that component + * + * @param array $user_activities a list of activities that a + * user is allowed to access + * @return array of translated name of component => [list of user accessible + * activities] */ public static function computeComponentActivities($user_activities) { @@ -488,8 +494,8 @@ class AdminController extends Controller implements CrawlConstants /** * Used to update the yioop installation profile based on $_REQUEST data * - * @param array& $data field data to be sent to the view - * @param array& $profile used to contain the current and updated profile + * @param array &$data field data to be sent to the view + * @param array &$profile used to contain the current and updated profile * field values * @param array $check_box_fields fields whose data comes from a html * checkbox @@ -551,7 +557,7 @@ class AdminController extends Controller implements CrawlConstants * are used by manageUsers, manageRoles, manageGroups, to do advanced * search of the entity they are responsible for. 
* - * @param array& $data modified to contain the field data needed for + * @param array &$data modified to contain the field data needed for * the view to draw the search form * @param string activity in which this search is being conducted * @param array $comparison_fields those fields of the entity diff --git a/src/controllers/Controller.php b/src/controllers/Controller.php index 57a0bde0b..826d0b7bb 100755 --- a/src/controllers/Controller.php +++ b/src/controllers/Controller.php @@ -84,11 +84,12 @@ abstract class Controller */ public $web_site; /** - * + * Array of instances of components used by this controller + * @var array */ public $component_instances; /** - * Array of instances of views used by this controller + * Array of instances of views used by this controller * @var array */ public $view_instances = []; @@ -399,7 +400,7 @@ abstract class Controller * data sources, rather than directly make a call to the model to get the * data it might be passed directly to this method. * - * @param array& $data used to send data to the view will be updated by + * @param array &$data used to send data to the view will be updated by * this method with row and paging data * @param mixed $field_or_model if an object, this is assumed to be a model * and so the getRows method of this model is called to get row data, @@ -905,7 +906,7 @@ abstract class Controller * @param string $line_type does additional cleaning depending on the type * of the lines. For instance, if is "url" then a line not beginning * with a url scheme will have http:// prepended. 
- * @return $lines an array of clean lines + * @return array $lines an array of clean lines */ public function convertStringCleanArray($str, $line_type="url") { @@ -1018,7 +1019,7 @@ abstract class Controller * controller this function can be used to initialize the field variables * used to write the appropriate Javascripts * - * @param array& $data data to be used in drawing the view + * @param array &$data data to be used in drawing the view * @param bool $ads_off whether or not ads are turned off so that this * method should do nothing */ diff --git a/src/controllers/CrawlController.php b/src/controllers/CrawlController.php index 83bd41888..006e655e2 100644 --- a/src/controllers/CrawlController.php +++ b/src/controllers/CrawlController.php @@ -37,8 +37,8 @@ use seekquarry\yioop\library\MediaConstants; use seekquarry\yioop\library\UrlParser; /** - * Controller used to manage networked installations of Yioop where - * there might be mutliple queue_servers and a name_server. Command + * Controller used to manage networked installations of Yioop + * where there might be multiple QueueServers and a NameServer. Command * sent to the nameserver web page are mapped out to queue_servers * using this controller. 
Each method of the controller essentially * mimics one method of CrawlModel, PhraseModel, or in general anything @@ -325,7 +325,9 @@ class CrawlController extends Controller implements CrawlConstants null, $num_fetchers); } /** - * + * Wrapper call to the source model method that deletes the news feed + * and trending data stored in this Yioop instance + * @see SourceModel::clearFeedData */ public function clearFeedData() { diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php index 4dc928310..ea9cc910f 100755 --- a/src/controllers/FetchController.php +++ b/src/controllers/FetchController.php @@ -547,7 +547,7 @@ class FetchController extends Controller implements CrawlConstants * $time to a subfolder $day of a folder $dir * * @param string $schedule_name the name of the kind of schedule being saved - * @param string& $data_string encoded, compressed, serialized data the + * @param string &$data_string encoded, compressed, serialized data the * schedule is to contain */ public function addScheduleToScheduleDirectory($schedule_name, diff --git a/src/controllers/GroupController.php b/src/controllers/GroupController.php index 2a1036e55..fdd5fe288 100644 --- a/src/controllers/GroupController.php +++ b/src/controllers/GroupController.php @@ -159,9 +159,9 @@ class GroupController extends Controller implements CrawlConstants * * @param string $format can be one of rss, json, or serialize, * if different, default HTML GroupView used. 
- * @param string& $view variable used to set the view in calling + * @param string &$view variable used to set the view in calling * method - * @param array& $data used to send data to the view for drawing + * @param array &$data used to send data to the view for drawing */ public function setupViewFormatOutput($format, &$view, &$data) { diff --git a/src/controllers/RegisterController.php b/src/controllers/RegisterController.php index 94ba65783..c1e5abab5 100755 --- a/src/controllers/RegisterController.php +++ b/src/controllers/RegisterController.php @@ -458,9 +458,9 @@ class RegisterController extends Controller implements CrawlConstants $user['FIRST_NAME'], $user['LAST_NAME'])."\n"; $message .= tl('register_controller_recover_body')."\n"; $time = time(); - $message .= C\BASE_URL. + $message .= C\BASE_URL . "?c=register&a=recoverComplete&user=" . - $user['USER_NAME']. + $user['USER_NAME'] . "&hash=".urlencode(L\crawlCrypt( $user['HASH'] . $time . $user['USER_NAME'].C\AUTH_KEY)) . "&time=" . $time ; @@ -826,7 +826,7 @@ class RegisterController extends Controller implements CrawlConstants * Sets up the graphical captcha view * Draws the string for graphical captcha * - * @param array& $data used by view to draw any dynamic content + * @param array &$data used by view to draw any dynamic content * in this case we append a field "CAPTCHA_IMAGE" with a data * url of the captcha to draw. */ @@ -837,8 +837,7 @@ class RegisterController extends Controller implements CrawlConstants } unset($_SESSION["captcha_text"]); // defines captcha text - $characters_for_captcha = '123456789'. - 'abcdefghijklmnpqrstuvwxyz'. + $characters_for_captcha = '123456789abcdefghijklmnpqrstuvwxyz'. 'ABCDEFGHIJKLMNPQRSTUVWXYZ'; $len = strlen($characters_for_captcha); // selecting letters for captcha @@ -966,12 +965,12 @@ class RegisterController extends Controller implements CrawlConstants * $activity_success. 
If $activity was not initially equal to * $activity_success then this method does nothing. * - * @param string& $activity current tentative activity + * @param string &$activity current tentative activity * @param string $activity_success activity to test for and to test prereqs * for. * @param string $activity_fail if prereqs not met which acitivity to switch * to - * @param array& $data data to help render the view this controller draws + * @param array &$data data to help render the view this controller draws */ public function preactivityPrerequisiteCheck(&$activity, $activity_success, $activity_fail, &$data) @@ -1018,7 +1017,7 @@ class RegisterController extends Controller implements CrawlConstants * missing fields on a create account or recover account form. * also adds error info if try to create an existing using. * - * @param array& $data contains info for the view on which the above + * @param array &$data contains info for the view on which the above * forms are to be drawn. */ public function dataIntegrityCheck(&$data) @@ -1075,7 +1074,7 @@ class RegisterController extends Controller implements CrawlConstants * in blank values for missing fields into a "MISSING" * array * - * @param array& $data an array of data to be sent to the view + * @param array &$data an array of data to be sent to the view * After this method is done it will have cleaned versions * of the $_REQUEST variables from create or recover account * forms as well as a "MISSING" field which is an array of diff --git a/src/controllers/ResourceController.php b/src/controllers/ResourceController.php index b79966e77..604d4319e 100644 --- a/src/controllers/ResourceController.php +++ b/src/controllers/ResourceController.php @@ -180,9 +180,13 @@ class ResourceController extends Controller implements CrawlConstants } } /** - * - * @param bool $is_src_folder - * @param array + * Returns the file system folder where resources are stored + * making use of the n field for the name of the resource, 
its type, + * the sf field describing the desired subfolder + * and whether this is a request for a thumbnail or a object + * @param bool $is_src_folder should we look in the base directory + * (src folder) or work_directory to try to find the resource + * @return array ordered pair [path beneath base folder to file, basefolder] */ public function getNameAndBaseFolder($is_src_folder = false) { diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index 364ade925..97b3f23cb 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -382,7 +382,7 @@ class SearchController extends Controller implements CrawlConstants * against to prevent CSRF attacks, just after someone logged out, or * a bot session (googlebot, etc) so remove the query request * - * @param array& $data that will eventually be sent to the view. We might + * @param array &$data that will eventually be sent to the view. We might * update with error messages * @return array consisting of (query based on user info, whether * if a cache request highlighting should be userd, what activity @@ -732,7 +732,7 @@ class SearchController extends Controller implements CrawlConstants * @param int $raw should validate against list of known crawls or an * internal (say network) query that doesn't require validation * (faster without). - * @param array& $data that will eventually be sent to the view. We set + * @param array &$data that will eventually be sent to the view. 
We set * the 'its' (index_time_stamp) field here * @return array consisting of index timestamp of crawl or mix in use, * $index_info an array of info about that index, and $save_timestamp @@ -861,7 +861,7 @@ class SearchController extends Controller implements CrawlConstants * @param string $view name of view class search results are for * @param array $subsearches an array of data about each subsearch to draw * to the view - * @param array& $data that will eventually be sent to the view for + * @param array &$data that will eventually be sent to the view for * rendering. This method adds fields to the array */ public function addSearchViewData($index_info, $no_query, $raw, $view, @@ -1084,7 +1084,7 @@ EOD; * Searches the database for the most relevant pages for the supplied search * terms. Renders the results to the HTML page. * - * @param array& $data an array of view data that will be updated to include + * @param array &$data an array of view data that will be updated to include * at most results_per_page many search results * @param string $query a string containing the words to search on * @param string $activity besides a straight search for words query, @@ -2287,7 +2287,7 @@ EOD; * indexes * @param array $queue_servers is an array containing URLs for queue * servers - * @return [$all_crawl_times, $all_crawl_items] is an array containing + * @return array [$all_crawl_times, $all_crawl_items] is an array containing * an array of crawl times and an array of their respective crawl items */ public function getCrawlItems($url, $crawl_times, $queue_servers) @@ -2577,7 +2577,7 @@ EOD; * libraries used to display cache pages * * @param DOMDocument $dom used to create new nodes - * @param DomElement& $node what to add script node to + * @param DomElement &$node what to add script node to */ public function addCacheJavascriptTags($dom, &$node) { diff --git a/src/controllers/components/AccountaccessComponent.php b/src/controllers/components/AccountaccessComponent.php 
index 46f4e3a08..082c5c755 100644 --- a/src/controllers/components/AccountaccessComponent.php +++ b/src/controllers/components/AccountaccessComponent.php @@ -775,7 +775,7 @@ class AccountaccessComponent extends Component * and $_REQUEST['role_sorts']. Information about these roles is added as * fields to $data[NUM_USER_ROLES'] and $data['USER_ROLES'] * - * @param array& $data data for the manageUsers view. + * @param array &$data data for the manageUsers view. * @param int $user_id user to look up roles for */ public function getUserRolesData(&$data, $user_id) @@ -830,7 +830,7 @@ class AccountaccessComponent extends Component * about these roles is added as * fields to $data[NUM_USER_GROUPS'] and $data['USER_GROUPS'] * - * @param array& $data data for the manageUsers view. + * @param array &$data data for the manageUsers view. * @param int $user_id user to look up roles for */ public function getUserGroupsData(&$data, $user_id) diff --git a/src/controllers/components/Component.php b/src/controllers/components/Component.php index 15ea4bc57..a692a73d1 100644 --- a/src/controllers/components/Component.php +++ b/src/controllers/components/Component.php @@ -87,7 +87,7 @@ class Component * and to send any localizations needed from PHP to Javascript-land * It is used by both Crawl and SocialComponent * - * @param array& $data an asscoiative array of data to be used by the + * @param array &$data an asscoiative array of data to be used by the * view and layout that the wiki editor will be drawn on * This method tacks on to INCLUDE_SCRIPTS to make the layout load * wiki.js. 
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index 09491daec..a1d32ae78 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -618,7 +618,7 @@ class CrawlComponent extends Component implements CrawlConstants * Called from @see manageCrawls to start a new crawl on the machines * $machine_urls. Updates $data array with crawl start message * - * @param array& $data an array of info to supply to AdminView + * @param array &$data an array of info to supply to AdminView * @param array $request_fields if start crawl fails this is a list of * request fields to preserve in the redirect message */ @@ -690,7 +690,7 @@ class CrawlComponent extends Component implements CrawlConstants * Reads the parameters for a crawl from an array gotten from a crawl.ini * file * - * @param array& $crawl_params parameters to write to queue_server + * @param array &$crawl_params parameters to write to queue_server * @param array $seed_info data from crawl.ini file */ public function getCrawlParametersFromSeedInfo(&$crawl_params, $seed_info) @@ -773,7 +773,7 @@ class CrawlComponent extends Component implements CrawlConstants * crawl (or current crawl) to be carried out by the machines * $machine_urls. Updates $data array to be supplied to AdminView * - * @param array& $data an array of info to supply to AdminView + * @param array &$data an array of info to supply to AdminView * @param array $machine_urls string urls of machines managed by this * Yioop name server on which to perform the crawl */ @@ -1148,7 +1148,7 @@ class CrawlComponent extends Component implements CrawlConstants * Called from @see manageCrawls to read in the file with statistics * information about a crawl. 
This file is computed by @see AnalyticsJob * - * @param array& $data an array of info to supply to AdminView + * @param array &$data an array of info to supply to AdminView * @param array $machine_urls machines that are being used in crawl * Yioop name server on which to perform the crawl */ diff --git a/src/controllers/components/SocialComponent.php b/src/controllers/components/SocialComponent.php index d414b747f..8c4601a2a 100644 --- a/src/controllers/components/SocialComponent.php +++ b/src/controllers/components/SocialComponent.php @@ -1062,7 +1062,7 @@ class SocialComponent extends Component implements CrawlConstants * membership in a group if the group is By Request or Public * Request * - * @param array& $data field variables to be drawn to view, + * @param array &$data field variables to be drawn to view, * we modify the SCRIPT component of this with a message * regarding success of not of add attempt. * @param int $add_id group id to be added @@ -1138,7 +1138,7 @@ class SocialComponent extends Component implements CrawlConstants * $_REQUEST['user_filter']. Information about these roles is added as * fields to $data[NUM_USERS_GROUP'] and $data['GROUP_USERS'] * - * @param array& $data data for the manageGroups view. + * @param array &$data data for the manageGroups view. * @param int $group_id group to look up users for */ public function getGroupUsersData(&$data, $group_id) @@ -1187,9 +1187,9 @@ class SocialComponent extends Component implements CrawlConstants * if the current group is to be modfied, and if so, to call model to * handle the update * - * @param array& $data used to add any information messages for the view + * @param array &$data used to add any information messages for the view * about changes or non-changes to the model - * @param array& $group current group which might be altered + * @param array &$group current group which might be altered * @param array $update_fields which fields in the current group might be * changed. 
Elements of this array are triples, the name of the * group field, name of the request field to use for data, and an @@ -1376,7 +1376,7 @@ class SocialComponent extends Component implements CrawlConstants if (in_array($group['REGISTER_TYPE'], [C\PUBLIC_BROWSE_REQUEST_JOIN, C\PUBLIC_JOIN])) { $post_url = B\feedsUrl("thread", $parent_item["ID"], - true, "group") . "preserve=true\n"; + true, "group", false) . "preserve=true\n"; } $subject = tl('social_component_thread_notification', $parent_item['TITLE']); @@ -1539,7 +1539,7 @@ class SocialComponent extends Component implements CrawlConstants $subject = tl('social_component_new_thread_mail', $group['GROUP_NAME']); $post_url = B\feedsUrl("thread", $thread_id, true, - "group")."preserve=true\n"; + "group", false)."preserve=true\n"; $body = tl('social_component_new_thread_body', $group['GROUP_NAME'])."\n". "\"".$title."\"\n". @@ -2301,7 +2301,7 @@ class SocialComponent extends Component implements CrawlConstants /** * Handles requests to reading, editing, viewing history, reverting, etc * wiki pages - * @return $data an associative array of form variables used to draw + * @return array $data an associative array of form variables used to draw * the appropriate wiki page */ public function wiki() @@ -3047,7 +3047,7 @@ EOD; * page and to update the recent page impressions so that this can be * calculated * - * @param array& $data $data data to be sent to the view, will be modified + * @param array &$data $data data to be sent to the view, will be modified * according to impression info. * @param int $user_id id of the user requesting to change the given wiki * page @@ -3115,7 +3115,7 @@ EOD; * of reading a media list or to help find resources in the case of a * user using edit mode * - * @param array& $data data to be sent to the view. The + * @param array &$data data to be sent to the view. 
The * $data["RESOURCES_INFO"]['resources'] array of resources will be * sorted according to the wiki page's settings as given in * $data["HEAD"]['sort'] @@ -3153,7 +3153,7 @@ EOD; * refactoring still needs some work. Hence, the awkward parameter list * below. * - * @param array& $data $data data to be sent to the view, will be modified + * @param array &$data $data data to be sent to the view, will be modified * according to the edit action. * @param int $user_id id of the user requesting to change the given wiki * page @@ -3578,7 +3578,7 @@ EOD; * needed to display a single media item on a media list. The name of * the media item to be display is expected to come from $_REQUEST['n']. * - * @param array& $data array of field variables for view will be modified + * @param array &$data array of field variables for view will be modified * by this function * @param int $group_id id of group wiki page belongs to * @param int $page_id id of wiki page @@ -4009,7 +4009,7 @@ EOD; /** * Used to create Javascript used to toggle a wiki page's settings control * - * @param array& $data will contain in SCRIPT field neccessary Javascript + * @param array &$data will contain in SCRIPT field neccessary Javascript * to pass to view. 
*/ private function initializeWikiPageToggle(&$data) diff --git a/src/controllers/components/StoreComponent.php b/src/controllers/components/StoreComponent.php index 0b54be2fa..02bb162c6 100644 --- a/src/controllers/components/StoreComponent.php +++ b/src/controllers/components/StoreComponent.php @@ -534,7 +534,7 @@ class StoreComponent extends Component /** * Trim white spaces callback for array_walk * - * @param string& $value string to remove initial and trailing whitespace + * @param string &$value string to remove initial and trailing whitespace * from */ public function trim_value(&$value) diff --git a/src/examples/StockBot.php b/src/examples/StockBot.php index 8a32aafcb..49adc6f6a 100644 --- a/src/examples/StockBot.php +++ b/src/examples/StockBot.php @@ -24,8 +24,6 @@ * * @author Harika Nukala harikanukala9@gmail.co * (updated after yahoo stock quotes went dark, by Chris Pollett) - * @package seek_quarry - * @subpackage examples * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ * @copyright 2009 - 2020 diff --git a/src/examples/WeatherBot.php b/src/examples/WeatherBot.php index b930ea989..c58ad966b 100644 --- a/src/examples/WeatherBot.php +++ b/src/examples/WeatherBot.php @@ -23,8 +23,6 @@ * END LICENSE * * @author Harika Nukala harika.nukala@sjsu.edu - * @package seek_quarry - * @subpackage examples * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ * @copyright 2009 - 2020 diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 22485c591..2bb94b995 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1418,7 +1418,7 @@ class Fetcher implements CrawlConstants * Sets parameters for fetching based on provided info struct * ($info typically would come from the queue server) * - * @param array& $info struct with info about the kind of crawl, timestamp + * @param array &$info struct with info about the kind of crawl, timestamp * of index, crawl order, 
etc. */ public function setCrawlParamsFromArray(&$info) @@ -1706,7 +1706,7 @@ class Fetcher implements CrawlConstants * for which no content was downloaded so that they can be scheduled * to be crawled again. * - * @param array& $site_pages pages to sort + * @param array &$site_pages pages to sort * @return an array conisting of two array downloaded pages and * not downloaded pages. */ @@ -2126,8 +2126,8 @@ class Fetcher implements CrawlConstants * Adds thumbs for websites with a self::THUMB_URL field by downloading the * linked to images and making a thumb from it. * - * @param array& $sites associative array of web sites information to add - * thumbs for. At least one site in the array should have a + * @param array &$sites associative array of web sites information to add + * thumbs for. At least one site in the array should have a * self::THUMB_URL field that we want have the thumb of */ public function getPageThumbs(&$sites) @@ -2246,7 +2246,7 @@ class Fetcher implements CrawlConstants * Then a crude estimate of the information contained in the links test: * strlen(gzip(text)) is used to extract the best remaining links. * - * @param array& $doc_info a string with a CrawlConstants::LINKS subarray + * @param array &$doc_info a string with a CrawlConstants::LINKS subarray * This subarray in turn contains url => text pairs. 
* @param string $field field for links default is CrawlConstants::LINKS * @param int $member_cache_time says how long allowed and disallowed url @@ -2289,8 +2289,8 @@ class Fetcher implements CrawlConstants * * @param int $i index to copy to * @param array $site web page info to copy - * @param array& $summarized_site_pages array of summaries of web pages - * @param array& $stored_site_pages array of cache info of web pages + * @param array &$summarized_site_pages array of summaries of web pages + * @param array &$stored_site_pages array of cache info of web pages */ public function copySiteFields($i, $site, &$summarized_site_pages, &$stored_site_pages) @@ -2334,11 +2334,11 @@ class Fetcher implements CrawlConstants * documents to the summaried_size_pages and stored_site_pages * arrays constructed during the execution of processFetchPages() * - * @param int& $i index to begin adding subdocs at + * @param int &$i index to begin adding subdocs at * @param array $site web page that subdocs were from and from * which some subdoc summary info is copied - * @param array& $summarized_site_pages array of summaries of web pages - * @param array& $stored_site_pages array of cache info of web pages + * @param array &$summarized_site_pages array of summaries of web pages + * @param array &$stored_site_pages array of cache info of web pages */ public function processSubdocs(&$i, $site, &$summarized_site_pages, &$stored_site_pages) diff --git a/src/executables/MediaUpdater.php b/src/executables/MediaUpdater.php index 2172a7b98..bfea35d8c 100644 --- a/src/executables/MediaUpdater.php +++ b/src/executables/MediaUpdater.php @@ -41,7 +41,8 @@ use seekquarry\yioop\library\WikiParser; if (php_sapi_name() != 'cli' || defined("seekquarry\\yioop\\configs\\IS_OWN_WEB_SERVER")) { - echo "BAD REQUEST"; exit(); + echo "BAD REQUEST"; + exit(); } /** We do want logging, but crawl model and others will try to turn off * if we don't set this @@ -178,7 +179,11 @@ class MediaUpdater implements 
CrawlConstants L\crawlLog("Done checking Name Server for Media Updater properties"); } /** - * @param array $jobs_list + * Given a list of MediaUpdater jobs, updates $this->jobs to contain + * instantiated objects of the corresponding jobs, requiring + * classes that have not been loaded yet as needed. + * + * @param array $jobs_list list of MediaUpdater jobs */ public function loadJobs($jobs_list) { diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 910bbea39..c4dac30c3 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -116,14 +116,21 @@ class QueueServer implements CrawlConstants, Join */ public $channel; /** + * Controls whether a repeating crawl (negative means no) is being done + * and if so its frequency in seconds * @var int */ public $repeat_type; /** + * If a crawl quiescent period is being used with the crawl, then + * this stores the time of day at which that period starts * @var string */ public $sleep_start; /** + * If a crawl quiescent period is being used with the crawl, then + * this property will be positive and indicate the number of seconds + * duration for the quiescent period. * @var string */ public $sleep_duration; @@ -2369,7 +2376,7 @@ class QueueServer implements CrawlConstants, Join * pastes more than MAX_FETCH_SIZE many urls into the initial seed sites * of a crawl in the UI. * - * @param array& $sites array containing to crawl data + * @param array &$sites array containing to crawl data */ public function dumpBigScheduleToSmall(&$sites) { @@ -2917,7 +2924,7 @@ class QueueServer implements CrawlConstants, Join * This function is used to schedule slots for crawl-delayed host. 
* * @param int $index location to begin searching for an empty slot - * @param array& $arr list of slots to look in + * @param array &$arr list of slots to look in * @return int index of first available slot */ public function getEarliestSlot($index, &$arr) diff --git a/src/index.php b/src/index.php index 3a797e438..d18f111b1 100644 --- a/src/index.php +++ b/src/index.php @@ -89,7 +89,7 @@ function bootstrap($web_site = null, $start_new_session = true) to images containing HTML. Also, might help against PRSSI attacks. */ if ($start_new_session) { - if (checkCookieConsent($web_site)) { + if (checkCookieConsent()) { $options = ['name' => C\SESSION_NAME, 'cookie_lifetime' => C\COOKIE_LIFETIME]; if (C\nsdefined("SECURE_COOKIE") && C\SECURE_COOKIE) { @@ -197,9 +197,15 @@ function bootstrap($web_site = null, $start_new_session = true) $controller->processRequest(); } /** + * Checks if a cookie consent form was obtained. + * This function returns true if a session cookie + * was received from the browser, or a form variable + * saying cookies are okay was received, or the + * Yioop profile says the cookie consent mechanism is disabled * + * @return bool cookie consent (true) else false */ -function checkCookieConsent($web_site) +function checkCookieConsent() { if (C\PROFILE && intval(C\COOKIE_LIFETIME) > 0 && empty($_COOKIE[C\SESSION_NAME]) @@ -215,7 +221,8 @@ function checkCookieConsent($web_site) * Developers can add new routes by creating a Routes class in * the app_dir with a static method getRoutes which should return * an associating array of incoming_path => handler function - * @param object $web_site + * @param object $web_site used to send error pages if configuration + * fails */ function configureRewrites($web_site) { @@ -317,8 +324,11 @@ function configureRewrites($web_site) } } /** - * @param array $route_args - * @return bool + * Used to handle routes that will eventually just serve + * files from either the APP_DIR + * These include files like css, 
scripts, suggest tries, images, and videos. + * @param array $route_args of url parts (split on slash) + * @return bool whether was able to compute a route or not */ function routeAppFile($route_args) { @@ -394,7 +404,11 @@ function routeAppFile($route_args) return false; } /** - * + * Used to handle routes that will eventually just serve + * files from either the BASE_DIR + * These include files like css, scripts, images, and robots.txt. + * @param array $route_args of url parts (split on slash). + * @return bool whether was able to compute a route or not */ function routeBaseFile($route_args) { @@ -532,12 +546,16 @@ function routeFeeds($route_args) * @param bool $with_delim whether it should be terminated with nothing or * ? or & * @param string $controller which controller is being used to access the - * feed: usuall admin or group + * feed: usually admin or group + * @param bool $use_short_base_url whether to create the url as a relative + * url using C\SHORT_BASE_URL or as a full url using C\BASE_URL + * (the latter is useful for mail notifications) * @return string url for the page in question */ -function feedsUrl($type, $id, $with_delim = false, $controller = "group") +function feedsUrl($type, $id, $with_delim = false, $controller = "group", + $use_short_base_url = true) { - $base_url = C\SHORT_BASE_URL; + $base_url = ($use_short_base_url) ? C\SHORT_BASE_URL : C\BASE_URL; if (C\REDIRECTS_ON && $controller == 'group') { $delim = ($with_delim) ? "?" : ""; $path = ($type == "") ? 
"group" : "$type/$id"; diff --git a/src/library/BTree.php b/src/library/BTree.php index 576ed61b9..34bd02579 100644 --- a/src/library/BTree.php +++ b/src/library/BTree.php @@ -431,7 +431,7 @@ class BTree } /** * Deletes key-value pair from a leaf node in a B-Tree - * @param object& $node is the leaf node containing the key-value pair + * @param BTNode &$node is the leaf node containing the key-value pair * @param int $pos in node to delete */ public function deleteFromLeaf(&$node, $pos) @@ -455,7 +455,7 @@ class BTree } /** * Deletes key-value pair from a non-leaf node in a B-Tree - * @param object& $node is the non-leaf node containing the key-value pair + * @param BTNode &$node is the non-leaf node containing the key-value pair * @param int $pos link position in node to delete */ public function deleteFromNonLeaf(&$node, $pos) @@ -630,9 +630,9 @@ class BTree * Gives a child node an extra key by moving a key from the parent to the * child node, and by moving a key from the child's right sibling to the * parent node - * @param object& $parent is the parent node - * @param object& $child is the child node - * @param object& $next is the $child's right sibling node + * @param BTNode &$parent is the parent node + * @param BTNode &$child is the child node + * @param BTNode &$next is the $child's right sibling node * @param int $pos is the link from $parent to $child */ public function adjustChildUsingRightSiblingAndParent(&$parent, &$child, diff --git a/src/library/BloomFilterBundle.php b/src/library/BloomFilterBundle.php index c112be516..8fd9d4150 100644 --- a/src/library/BloomFilterBundle.php +++ b/src/library/BloomFilterBundle.php @@ -133,7 +133,7 @@ class BloomFilterBundle * Removes from the passed array those elements $elt who either are in * the filter bundle or whose $elt[$field_name] is in the bundle. 
* - * @param array& $arr the array to remove elements from + * @param array &$arr the array to remove elements from * @param array $field_names if not null an array of field names of $arr * to use to do filtering */ diff --git a/src/library/Bzip2BlockIterator.php b/src/library/Bzip2BlockIterator.php index b9ca7e22f..1608deb5d 100644 --- a/src/library/Bzip2BlockIterator.php +++ b/src/library/Bzip2BlockIterator.php @@ -289,8 +289,8 @@ class BZip2BlockIterator * Computes a new bzip2 block portions and bits left over after adding * $bytes to the passed $block. * - * @param string& $block the block to add to - * @param int& $bits used to hold bits left over + * @param string &$block the block to add to + * @param int &$bits used to hold bits left over * @param string $bytes what to add to the bzip block * @param int $num_extra_bits how many extra bits there are */ diff --git a/src/library/ComputerVision.php b/src/library/ComputerVision.php index 91802bd8c..1539190e3 100644 --- a/src/library/ComputerVision.php +++ b/src/library/ComputerVision.php @@ -42,16 +42,29 @@ require_once __DIR__ . "/Utility.php"; */ require_once __DIR__ . "/LocaleFunctions.php"; /** - * + * Class used to encapsulate various methods related to computer + * vision that might be useful for indexing documents. These + * include recognizing text in images */ class ComputerVision { + /** + * Returns whether or not this Yioop system can recognize text in images + * Currently, this is done using the tesseract external program, so this + * method checks if a path to that program has been defined. + * @return bool whether a path to tesseract has been defined. 
+ */ public static function ocrEnabled() { return C\nsdefined("TESSERACT"); } /** + * Given a file path to a image file and set of target languages, returns + * the text in those languages that the image contained * + * @param string $image_path a filepath to an image + * @param array $langs locale_tags of languages we want to extract text for + * @return string text extracted from image */ public static function recognizeText($image_path, $langs = [C\DEFAULT_LOCALE]) diff --git a/src/library/ContextTagger.php b/src/library/ContextTagger.php new file mode 100644 index 000000000..ed4482074 --- /dev/null +++ b/src/library/ContextTagger.php @@ -0,0 +1,413 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2020 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + * + * @author Chris Pollett chris@pollett.org + * @license https://www.gnu.org/licenses/ GPL3 + * @link https://www.seekquarry.com/ + * @copyright 2009 - 2019 + * @filesource + */ + +namespace seekquarry\yioop\library; + +use seekquarry\yioop\configs as C; +/** + * Abstract, base context tagger class. + * A context tagger is used to apply a sequence of labels to a sequence terms + * or characters of text based on a surrounding context. 
Context Taggers + * typically make use of n-gram context of a term such as the n/2 - terms + * before and after the term and maybe the earlier tags from a same phrase or + * sentence to make prediction + * + * @author Chris Pollett + */ +abstract class ContextTagger +{ + /** + * Locale tag of language this recognizer is for + * @var string + */ + public $lang; + /** + * The name of the file where the tagging model should be stored and read + * from + * @var string + */ + public $tagger_file = "tagger.txt.gz"; + /** + * Complete file system path to the file where the tagging model should be + * stored and read from + * @var string + */ + public $tagger_path = ""; + /** + * 2D weights for features involving the prior two words to the + * current word and the next two words after the current word + * For a given word position, one has vector, that gives te + * value for each term in the complete training term set, unknown term set, + * and rule based tag term set, what its weight is + * Determined during training + * @var array + */ + public $word_feature; + /** + * The bias vector for features we are training + * + * Determined during training + * @var array + */ + public $bias; + /** + * The weights for features involving the prior two tags to the + * current word whose tag we are trying to determine + * Determined during training + * @var array + */ + public $tag_feature; + /** + * Array of strings for each possible tag for a term + * associated as [tag => tag index] + * @var array + */ + public $tag_set; + /** + * Minimum allowed value for a weight component + * @var float + */ + public $min_w; + /** + * Maximum allowed value for a weight component + * @var float + */ + public $max_w; + /** + * Tokenizer for the language this tagger tags for + * @var Tokenizer + */ + public $tokenizer; + /** + * Constructor for the ContextTagger. 
+ * Sets the language this tagger tags for and sets up the path for + * where it should be stored + * @param string $lang locale tag of the language this tagger tags is for + */ + public function __construct($lang) + { + $lang = str_replace("-", "_", $lang); + $this->lang = $lang; + $this->tagger_path = C\LOCALE_DIR . "/$lang/resources/" . + $this->tagger_file; + $this->tokenizer = PhraseParser::getTokenizer($lang); + } + /** + * Converts training data from the format tagged sentence with terms of the + * form term_tag into a pair of arrays [[terms_in_sentence], + * [tags_in_sentence]] + * @param mixed $text_files can be a file or an array of file names + * @param string $term_tag_separator separator used to separate term and tag + * for terms in input sentence + * @param function $term_callback callback function applied to a term + * before adding term to sentence term array + * @param function $tag_callback callback function applied to a part of + * speech tag before adding tag to sentence tag array + * @return array of separated sentences, each sentence having the format of + * [[terms...], [tags...]] + * Currently, the training data needs to fit Chinese Treebank format: + * term followed by a underscore and followed by the tag + * e.g. 
"新_VA 的_DEC 南斯拉夫_NR 会国_NN" + * To adapt to other languages, some modifications are needed + */ + public static function processTexts($text_files, $term_tag_separator = "_", + $term_callback = null, $tag_callback = null) + { + $ret = []; + foreach($text_files as $text_file) { + if (file_exists($text_file)) { + $fh = fopen($text_file, "r"); + while (!feof($fh)) { + $line = fgets($fh); + if(strpos($line, '<') !== false) { + continue; + } + $word_tag_pairs = preg_split("/[\s ]+/u", $line); + if (!count($word_tag_pairs)) { + continue; + } + $ret[] = []; + $ret[count($ret) - 1][0] = []; + $ret[count($ret) - 1][1] = []; + foreach ($word_tag_pairs as $word_tag_pair) { + $t = explode($term_tag_separator, $word_tag_pair); + if (count($t) == 2) { + $ret[count($ret) - 1][0][] = + $term_callback ? $term_callback($t[0]) : $t[0]; + $ret[count($ret)-1][1][] = + $tag_callback ? $tag_callback($t[1]) : $t[1]; + } + } + } + fclose($fh); + } + } + return $ret; + } + /** + * Maps a term to a corresponding key if the term matches some simple + * pattern such as being a number + * @param string $term is the term to be checked + * @return mixed either the int key for those matrices or just the term + * itself if the tokenizer does not have the method getPosKey for the + * current language + */ + public function getKey($term) + { + if (!empty($this->tokenizer) && method_exists($this->tokenizer, + "getPosKey")) { + return $this->tokenizer::getPosKey($term); + } + return $term; + } + /** + * Given a sentence (array $terms), find the key for the term at position + * $index + * @param int $index position of term to get key for + * @param array $terms an array of terms typically from and in the order of + * a sentence + * @return mixed key position in word_feature weights and bias arrays + * could be either an int, or the term itself, or the simple rule + * based part of speech it belongs to + */ + public function getIndex($index, $terms) + { + if ($index < 0) { + $k = $index - 2; + } else if 
($index >= count($terms)) { + $k = $index - count($terms) - 2; + } else { + $k = $this->getKey($terms[$index]); + } + return $k; + } + /** + * Save the trained weight to disk + */ + public function saveWeights() + { + $out = []; + $out["min_w"] = $this->min_w; + $out["max_w"] = $this->max_w; + $out["w"] = []; + foreach(array_keys($this->word_feature) as $key) { + $out["w"][$key] = $this->packW($key); + } + foreach(array_keys($this->tag_feature) as $key) { + $out["t"][$key] = $this->packT($key); + } + $out["b"] = $this->packB(); + $out["tag_set"] = $this->tag_set; + echo "Saving..."; + file_put_contents($this->tagger_path, + gzencode(serialize($out), 9)); + echo " ok\n"; + } + /** + * Load the trained data from disk + * @param bool $for_training whether we are continuing to train (true) or + * whether we are using the loaded data for prediction + */ + public function loadWeights($for_training = false) + { + if (!file_exists($this->tagger_path)) { + echo "$this->tagger_path does not exist!"; + exit(); + } + $f = unserialize(gzdecode(file_get_contents($this->tagger_path)), + ['allowed_classes' => false]); + $this->word_feature = $f["w"]; + $this->tag_feature = $f["t"] ?? []; + $this->bias = $f["b"]; + $this->min_w = $f["min_w"]; + $this->max_w = $f["max_w"]; + $this->tag_set = $f["tag_set"]; + if ($for_training) { + foreach(array_keys($this->word_feature) as $key) { + $this->word_feature[$key] = $this->unpackW($key); + } + foreach(array_keys($this->tag_feature) as $key) { + $this->tag_feature[$key] = $this->unpackT($key); + } + $this->bias = $this->unpackB(); + } + } + /** + * Pack the bias vector represented as an array into a string + * @return string the bias vector packed as a string + */ + public function packB() + { + return pack("f*", ...$this->bias); + } + /** + * Unpack the bias represented as a string into an array + * @return array the bias vector unpacked from a string + */ + public function unpackB() + { + return array_merge(unpack("f" . 
strval(count($this->tag_set)), + $this->bias)); + } + /** + * Pack the tag_feature represented as an array into a string + * @param int $key in tag_feature set corresponding to a part of speech + * @return string packed tag_feature vector + */ + public function packT($key) + { + return pack("f*", ...$this->tag_feature[$key]); + } + /** + * Unpack the tag_feature represented as a string into an array + * @param int $key in tag_feature set corresponding to a part of speech + * @return array unpacked tag_feature vector + */ + public function unpackT($key) + { + return array_merge(unpack("f" . strval(count($this->tag_set)), + $this->tag_feature[$key])); + } + /** + * Pack the weights matrix to a string for a particular part of speech key + * @param int $key index corresponding to a part of speech according to + * $this->tag_set + * @return string the packed weights matrix + */ + public function packW($key) + { + $bin_str = ""; + foreach ($this->word_feature[$key] as $i => $t) { + foreach ($t as $u) { + $v = 65535 * ($u - $this->min_w) / + ($this->max_w - $this->min_w); + $bin_str .= pack("S", intval($v)); + } + } + return $bin_str; + } + /** + * Unpack the weight matrix for a given part of speech key. This + * is a 5 x term_set_size matrix the 5 rows corresponds to + * -2, -1, 0, 1, 2, locations in a 5-gram. + * An (i, j) entry roughly gives the probability of the j term in location i + * having the part of speech given by $key + * @param int $key in word_feature set corresponding to a part of speech + * @return array of weights corresponding to that key + */ + public function unpackW($key) + { + $weights = []; + $size = count($this->tag_set); + for ($i = 0; $i < 5; $i++) { + $weights[$i - 2] = array_merge(unpack("S" . 
strval($size), + $this->word_feature[$key], 2 * $i * count($this->tag_set))); + for($j = 0; $j < $size; $j++) { + $weights[$i - 2][$j] = ($weights[$i - 2][$j] / 65535) * + ($this->max_w - $this->min_w) + $this->min_w; + } + } + return $weights; + } + /** + * Get the bias value for a tag + * @param int $tag_index the index of tag's value within the bias string + * @return float bias value for tag + */ + public function getB($tag_index) + { + return unpack("f", $this->bias, $tag_index * 4)[1]; + } + /** + * Set the bias value for tag + * @param int $tag_index the index of tag's value within the bias string + * @param float $value bias value to associate to tag + */ + public function setB($tag_index, $value) + { + $this->bias = substr_replace($this->bias, pack("f", $value), + $tag_index * 4, 4); + } + /** + * Get the tag feature value for tag + * @param int $key in tag_feature set corresponding to a part of speech + * @param int $tag_index the index of tag's value within the tag feature + * string + * @return float tag feature value for tag + */ + public function getT($key, $tag_index) + { + return unpack("f", $this->tag_feature[$key], $tag_index * 4)[1]; + } + /** + * Get the weight value for term at position for tag + * @param string $term to get weight of + * @param int $position of term within the current 5-gram + * @param int $tag_index index of the particular tag we are trying to see + * the term's weight for + * @return float + */ + public function getW($term, $position, $tag_index) + { + $t = unpack("S", $this->word_feature[$term], 2 * ($position + 2) * + count($this->tag_set) + $tag_index * 2)[1] / 65535 * + ($this->max_w - $this->min_w) + $this->min_w;; + return $t; + } + /** + * Uses text files to train a tagger for terms or chars in a document + * @param mixed $text_files with training data. These can be a file or + * an array of file names. 
+ * @param string $term_tag_separator separator used to separate term and tag + * for terms in input sentence + * @param float $learning_rate learning rate when cycling over data trying + * to minimize the cross-entropy loss in the prediction of the tag of the + * middle term. + * @param int $num_epoch number of times to cycle through the + * complete data set. Default value of 1200 seems to avoid overfitting + * @param function $term_callback callback function applied to a term + * before adding term to sentence term array as part of processing and + * training with a sentence. + * @param function $tag_callback callback function applied to a part of + * speech tag before adding tag to sentence tag array as part of + * processing and training with a sentence. + */ + public abstract function train($text_files, $term_tag_separator = "-", + $learning_rate = 0.1, $num_epoch = 1200, $term_callback = null, + $tag_callback = null, $resume = false); + /** + * Predicts a tagging for all elements of $sentence + * + * @param mixed $sentence is an array of segmented terms/chars + * or a string that will be split on white space + * @return array predicted tags. 
The ith entry in the returned results + * is the tag of ith element of $sentence + */ + public abstract function predict($sentence); +} diff --git a/src/library/ContextWeightedNamedEntityRecognizer.php b/src/library/ContextWeightedNamedEntityRecognizer.php deleted file mode 100644 index e6b11d88e..000000000 --- a/src/library/ContextWeightedNamedEntityRecognizer.php +++ /dev/null @@ -1,606 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. - * - * @author Xianghong Sun sxh19911230@gmail.com - * @license https://www.gnu.org/licenses/ GPL3 - * @link https://www.seekquarry.com/ - * @copyright 2009 - 2019 - * @filesource - */ -namespace seekquarry\yioop\library; - -use seekquarry\yioop\configs as C; -use seekquarry\yioop\locale\zh_CN\resources as ZH; - -/** - * Machine learning based NER tagger. Typically, ContextWeightedNERTagger.php - * can train the language with some dataset and predict - * the tag given a list of word. 
- * - * @author Xianghong Sun - */ -class ContextWeightedNamedEntityRecognizer -{ - /** - * Current Language, only tested on Simplified Chinese - * Might be extensable for other languages in the furture - * @var string - */ - public $lang; - /** - * The word weight feature - * y = wx + b - * Generized by training method - * @var array - */ - public $word_feature; - /** - * The tag weight feature - * y = wx + b - * Generized by training method - * @var array - */ - public $tag_feature; - /** - * The bias - * y = wx + b - * Generized by training method - * @var array - */ - public $bias; - /** - * All Possiable tag set - * Generized by training method - * @var associative array [tag => tag index] - */ - private $tag_set; - /** - * The constructer of the pos tagger - * To extend to other languages, some work are needed: - * Define $this->getKeyImpl, $this->rule_defined_key - * See Chinese example. - * @param @string $lang describes current langauge - * @param @book $packed describes how weight and bias would look like - */ - public function __construct($lang) - { - switch($lang) { - case("zh_CN"): - case("zh-CH"): - $this->lang = "zh_CN"; - break; - default: - $this->lang = $lang; - } - } - - /** - * A function that process the trainning data - * @param @mixed $text_files can be a file or an array of file names - * @return @array of seperated sentences, each sentenfce have the format of - * [[words...],[tags...]] - * Data format MSRA: - * 我们/o 是/o 受到/o 郑振铎/nr 先生/o 、/o 阿英/nr 先生/o 著作/o 的/o - * 启示/o ,/o 从/o 个人/o 条件/o 出发/o ,/o 瞄准/o 现代/o 出版/o 史/o - * 研究/o 的/o 空白/o ,/o 重点/o 集/o 藏/o 解放区/o 、/o 国民党/nt 毁/o - * 禁/o 出版物/o 。/o - * To adapt to other language, some modifications are needed - */ - public static function processTexts($text_files, $term_tag_splier="/", - $term_process = null, $tag_process = null) - { - $ret=[]; - foreach($text_files as $text_file) { - if (file_exists($text_file)) { - $fn = fopen($text_file,"r"); - while(! 
feof($fn)) { - $line = fgets($fn); - if(strpos($line, '<') !== false) { - continue; - } - $word_tag_pairs = preg_split("/[\s ]+/u", $line); - if (!count($word_tag_pairs)) { - continue; - } - $ret[] = []; - $ret[count($ret)-1][0] = []; - $ret[count($ret)-1][1] = []; - foreach ($word_tag_pairs as $word_tag_pair) { - $t = explode("/", $word_tag_pair); - if (count($t) == 2) { - $tag = $tag_process ? $tag_process($t[1]) : $t[1]; - foreach(preg_split('//u', $t[0], null, - PREG_SPLIT_NO_EMPTY) as $ch) { - $ret[count($ret)-1][0][] = - $term_process ? $term_process($ch) : $ch; - $ret[count($ret)-1][1][] = $tag; - } - } - } - } - fclose($fn); - } - } - return $ret; - } - - /** - * Function to train a data - * Notice: This function might run very long time, depending on training set - * @param @mixed $text_files are training data - * can be a file or an array of file names - * @param @float $learning_rate - * @param @int $max_epoch 1200 might be a good one, - * the weight will overfit if it's greater than this number - * @param @function $term_process is a preporcess on term before training - * @param @function $tag_process is a preporcess on tag before training - */ - public function train($text_files, $learning_rate=0.1, $max_epoch = 1200, - $term_process = null, $tag_process = null) - { - if (is_string($text_files)) { - $text_files = [$text_files]; - } - echo "Reading files\n"; - // term_tag_sentences[sentence#]=[[words...],[tags...]] - $term_tag_sentences = self::processTexts($text_files, - $term_process, $tag_process); - $this->word_feature=[]; - $this->tag_set=[]; - $tag_index = 0; - for ($i = -4; $i <= -1; $i++) { - $this->word_feature[$i] = []; - } - foreach ($term_tag_sentences as $term_tag_pairs) { - $terms=$term_tag_pairs[0]; - $tags=$term_tag_pairs[1]; - $this->tag_feature["start"]=[]; - $this->tag_feature["start-start"]=[]; - for ($i = 0; $i < count($terms); $i++) { - if (!isset($this->tag_set[$tags[$i]])) { - $this->tag_set[$tags[$i]] = $tag_index++; - } - if 
($i == 0) {} - else if ($i == 1) { - if (!isset($this->tag_feature["start-".$tags[$i-1]])) { - $this->tag_feature["start-".$tags[$i-1]]=[]; - } - if (!isset($this->tag_feature[$tags[$i-1]])) { - $this->tag_feature[$tags[$i-1]]=[]; - } - } else { - if (!isset($this->tag_feature[$tags[$i-2] . "-" . - $tags[$i-1]])) { - $this->tag_feature[$tags[$i-2]."-".$tags[$i-1]] = []; - } - if (!isset($this->tag_feature[$tags[$i-1]])) { - $this->tag_feature[$tags[$i-1]]=[]; - } - } - if (!isset($this->word_feature[$terms[$i]])) { - $this->word_feature[$terms[$i]] = []; - } - } - } - foreach (array_keys($this->word_feature) as $key) { - for ($i=-2; $i<=2;$i++) { - if (!isset($this->word_feature[$key][$i])) { - $this->word_feature[$key][$i] = []; - } - foreach($this->tag_set as $possiable_tag => $tag_index) { - if (!isset($this->word_feature[$key][$i][$tag_index])) { - $this->word_feature[$key][$i][$tag_index] = 0; - } - } - } - } - foreach (array_keys($this->tag_feature) as $key) { - foreach($this->tag_set as $possiable_tag => $tag_index) { - if (!isset($this->tag_feature[$key][$tag_index])) { - $this->tag_feature[$key][$tag_index] = 0; - } - } - } - foreach($this->tag_set as $possiable_tag => $tag_index) { - if (!isset($this->bias[$tag_index])) { - $this->bias[$tag_index] = 0; - } - } - echo "Training...\n"; - //train the weight - $cross_entropy_loss = 1; - $pre_cross_entropy_loss = 2; - for ($epoch = 0; ($epoch < $max_epoch) && - $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001; - $epoch++) { - $this->min_w=0; - $this->max_w=0; - $time = time(); - $dy_dw = []; - $dy_dw_n = []; - $pre_cross_entropy_loss = $cross_entropy_loss; - $cross_entropy_loss = 0; - $cross_entropy_loss_n = 0; - - $dy_db=[]; - $dy_db_n=[]; - - $dy_dt=[]; - $dy_dt_n=[]; - for($i = 0; $i < count($this->tag_set); $i++) { - $dy_db[$i] = 0; - $dy_db_n[$i] = 0; - } - //for each sentence - foreach ($term_tag_sentences as $term_tag_pairs) { - $terms=$term_tag_pairs[0]; - $tags=$term_tag_pairs[1]; - for ($i = 
0; $i < count($terms); $i++) { - $k=[]; - for ($j=-2; $j<=2;$j++) { - $k[$j]= $this->getIndex($i+$j,$terms); - } - foreach ($this->tag_set as $possiable_tag => $tag_index) { - $equality = $possiable_tag == $tags[$i] ? 1 : 0; - $sum=0; - //5 words including itself - for ($j=-2; $j<=2;$j++) { - $sum += $this->word_feature[$k[$j]][$j][$tag_index]; - } - //previous 2 tags - if ($i == 0) { - $tf1="start"; - $tf2="start-start"; - } else if ($i == 1) { - $tf1=$tags[$i-1]; - $tf2="start-".$tags[$i-1]; - } else { - $tf1=$tags[$i-1]; - $tf2=$tags[$i-2]."-".$tags[$i-1]; - } - $sum += $this->tag_feature[$tf1][$tag_index]; - $sum += $this->tag_feature[$tf2][$tag_index]; - //bias - $sum += $this->bias[$tag_index]; - $sigmoid = 1 / (1 + exp(-1 * $sum)); - for ($j=-2; $j<=2;$j++) { - if (!isset($dy_dw[$k[$j]])) { - $dy_dw[$k[$j]] = []; - $dy_dw_n[$k[$j]] = []; - } - if (!isset($dy_dw[$k[$j]][$j])) { - $dy_dw[$k[$j]][$j] = []; - $dy_dw_n[$k[$j]][$j] = []; - } - if (!isset($dy_dw[$k[$j]][$j][$tag_index])) { - $dy_dw[$k[$j]][$j][$tag_index] = 0; - $dy_dw_n[$k[$j]][$j][$tag_index] = 0; - } - - $dy_dw[$k[$j]][$j][$tag_index] += - ($sigmoid - $equality); - $dy_dw_n[$k[$j]][$j][$tag_index] += 1; - - } - //dy_dt - if (!isset($dy_dt[$tf1])) { - $dy_dt[$tf1] = []; - $dy_dt_n[$tf1] = []; - } - if (!isset($dy_dt[$tf1][$tag_index])) { - $dy_dt[$tf1][$tag_index] = 0; - $dy_dt_n[$tf1][$tag_index] = 0; - } - if (!isset($dy_dt[$tf2])) { - $dy_dt[$tf2] = []; - $dy_dt_n[$tf2] = []; - } - if (!isset($dy_dt[$tf2][$tag_index])) { - $dy_dt[$tf2][$tag_index] = 0; - $dy_dt_n[$tf2][$tag_index] = 0; - } - $dy_dt[$tf1][$tag_index] += ($sigmoid - $equality); - $dy_dt_n[$tf1][$tag_index] += 1; - $dy_dt[$tf2][$tag_index] += ($sigmoid - $equality); - $dy_dt_n[$tf2][$tag_index] += 1; - //dy_db - $dy_db[$tag_index] += ($sigmoid - $equality); - $dy_db_n[$tag_index] += 1; - $cross_entropy_loss+= - - $equality*log($sigmoid) - - (1-$equality)*log(1-$sigmoid); - $cross_entropy_loss_n++; - } - } - } - 
$cross_entropy_loss /= $cross_entropy_loss_n; - $duration = time() - $time; - echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}". - " Takes {$duration} seconds\n"; - foreach ($dy_dw as $i =>$v1) { - foreach ($v1 as $j =>$v2) { - foreach ($v2 as $k =>$v3) { - $this->word_feature[$i][$j][$k] -= - $dy_dw[$i][$j][$k] / - $dy_dw_n[$i][$j][$k] * - $learning_rate; - if ($this->word_feature[$i][$j][$k] < $this->min_w) { - $this->min_w = $this->word_feature[$i][$j][$k]; - } - if ($this->word_feature[$i][$j][$k] > $this->max_w) { - $this->max_w = $this->word_feature[$i][$j][$k]; - } - } - } - } - foreach ($dy_dt as $i => $v1) { - foreach ($v1 as $j => $v2) { - $this->tag_feature[$i][$j] -= - $dy_dt[$i][$j] / - $dy_dt_n[$i][$j] * - $learning_rate; - } - } - foreach ($dy_db as $k => $v) { - $this->bias[$k]-= - $dy_db[$k] / - $dy_db_n[$k] * - $learning_rate; - } - if ($epoch % 10 == 9 ) { - $this->save_weight(); - } - } - $this->save_weight(); - return true; - } - /** - * The primary function to predit the tag - * @param mixed $sentence is an array of segmented words/terms - * or a string needs to be splited by $splitter - * @param function $splitter to process $sentence if $sentence - * is a string - * @return @array all predicted named entities with its tag - * ex. 
[["郑振铎","nr"],["国民党","nt"]] - */ - public function predict($sentence, $delimiter="",$splitter=null) - { - if (!is_array($sentence)) { - if ($sentence == "") { - $terms=[]; - } else { - $terms=preg_split("/[\s]+/",$sentence); - } - } else { - $terms=$sentence; - } - if (!count($terms)) { - return []; - } - if (!$this->word_feature) { - $this->load_weight(); - } - $result = []; - for($i = 0; $i < count($terms); $i++) { - $term = $terms[$i]; - $score =[]; - foreach($this->tag_set as $possiable_tag => $tag_index) { - $score[$possiable_tag]=0; - for ($j=-2; $j <=2; $j++) { - $k=$this->getIndex($i+$j, $terms); - if (isset($this->word_feature[$k])) { - $score[$possiable_tag] += - $this->getW($k,$j,$tag_index); - } - } - if ($i == 0) { - $tf1="start"; - $tf2="start-start"; - } else if ($i == 1) { - $tf1=$result[$i-1]; - $tf2="start-".$result[$i-1]; - } else { - $tf1=$result[$i-1]; - $tf2=$result[$i-2]."-".$result[$i-1]; - } - $score[$possiable_tag] += $this->getT($tf1,$tag_index); - $score[$possiable_tag] += $this->getT($tf2,$tag_index); - $score[$possiable_tag] += $this->getB($tag_index); - } - $result[]=array_keys($score, max($score))[0]; - } - $pre_tag='o'; - $current_entity=null; - $ret=[]; - for ($i = 0; $i < count($terms); $i++) { - if ($pre_tag != $result[$i] && $pre_tag != "o") { - if (mb_strlen($current_entity) < 10) { - $ret[]=[$current_entity,$pre_tag]; - } - $current_entity=null; - } - if ($result[$i] != "o") { - if ($current_entity) { - $current_entity.=$delimiter.$terms[$i]; - } else { - $current_entity=$terms[$i]; - } - } - $pre_tag=$result[$i]; - } - return $ret; - } - /** - * A list of private helper functions - * Given a setence ($term), find the key at position $index - */ - private function getIndex($index, $terms) - { - if ($index < 0) $k = $index - 2; - else if ($index >= count($terms)) { - $k = $index - count($terms) - 2; - } - else { - $k = $terms[$index]; - } - return $k; - } - - /** - * save the trained weight to disk - */ - private function 
save_weight() - { - $out_file = C\LOCALE_DIR . "/{$this->lang}/resources/ner_weight.txt.gz"; - $out = []; - $out["min_w"] = $this->min_w; - $out["max_w"] = $this->max_w; - $out["w"]=[]; - foreach(array_keys($this->word_feature) as $key) { - $out["w"][$key] = $this->pack_w($key); - } - foreach(array_keys($this->tag_feature) as $key) { - $out["t"][$key] = $this->pack_t($key); - } - $out["b"] = $this->pack_b(); - $out["tag_set"] = $this->tag_set; - echo "Saving..."; - file_put_contents($out_file, - gzencode(serialize($out),9)); - echo " ok\n"; - } - /** - * load the trained weight from disk - */ - private function load_weight($trainning_load=false) - { - $dic_file - = C\LOCALE_DIR . "/{$this->lang}/resources/ner_weight.txt.gz"; - if (!file_exists($dic_file)) { - echo "$dic_file does not exist!"; - exit(); - } - $f = unserialize(gzdecode(file_get_contents($dic_file)) - ,['allowed_classes' => false]); - $this->word_feature=$f["w"]; - $this->tag_feature=$f["t"]; - $this->bias=$f["b"]; - $this->min_w=$f["min_w"]; - $this->max_w=$f["max_w"]; - $this->tag_set=$f["tag_set"]; - if ($trainning_load) { - foreach(array_keys($this->word_feature) as $key) { - $this->word_feature[$key] = $this->unpack_w($key); - } - foreach(array_keys($this->tag_feature) as $key) { - $this->tag_feature[$key] = $this->unpack_t($key); - } - $this->bias = $this->unpack_b(); - } - } - /** - * Pack the bias - */ - private function pack_b() - { - return pack("f*", ...$this->bias); - } - /** - * Unpack the bias - */ - private function unpack_b() - { - return array_merge(unpack("f" . 
strval(count($this->tag_set)), - $this->bias)); - } - /** - * Pack the tag_feature - */ - private function pack_t($key) - { - return pack("f*", ...$this->tag_feature[$key]); - } - /** - * Unpack the tag_feature - */ - private function unpack_t($key) - { - return array_merge(unpack("f".strval(count($this->tag_set)), - $this->tag_feature[$key])); - } - /** - * Pack the word_feature - */ - private function pack_w($key) - { - $bin_str = ""; - foreach($this->word_feature[$key] as $i => $t) { - foreach($t as $u) { - $v = 65535 * ($u-$this->min_w) / ($this->max_w-$this->min_w); - $bin_str .= pack("S", intval($v)); - } - } - return $bin_str; - } - /** - * Unpack the word_feature - */ - private function unpack_w($key) - { - $tmp = []; - $size = count($this->tag_set); - for ($i = 0; $i < 5; $i++) { - $tmp[$i-2] = array_merge(unpack("S".strval($size), - $this->word_feature[$key], 2*$i*count($this->tag_set))); - for($j = 0; $j < $size; $j++) { - $tmp[$i-2][$j] = $tmp[$i-2][$j] / 65535 - * ($this->max_w-$this->min_w) + $this->min_w; - } - } - return $tmp; - } - /** - * Get the bias value for tag - */ - private function getB($tag_index) - { - return unpack("f",$this->bias,$tag_index*4)[1]; - } - /** - * Get the bias value for tag - */ - private function getT($key, $tag_index) - { - return unpack("f",$this->tag_feature[$key],$tag_index*4)[1]; - } - /** - * Get the weight value for term at postion for tag - */ - private function getW($term, $position, $tag_index) - { - $t = unpack("S",$this->word_feature[$term], - 2*($position+2)*count($this->tag_set)+$tag_index*2)[1] - / 65535 - * ($this->max_w-$this->min_w) + $this->min_w;; - return $t; - } -} diff --git a/src/library/ContextWeightedPosTagger.php b/src/library/ContextWeightedPosTagger.php deleted file mode 100644 index d354d3fde..000000000 --- a/src/library/ContextWeightedPosTagger.php +++ /dev/null @@ -1,601 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * 
Copyright (C) 2009 - 2019 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. - * - * @author Xianghong Sun sxh19911230@gmail.com - * @license https://www.gnu.org/licenses/ GPL3 - * @link https://www.seekquarry.com/ - * @copyright 2009 - 2019 - * @filesource - */ -namespace seekquarry\yioop\library; - -use seekquarry\yioop\configs as C; -use seekquarry\yioop\locale\zh_CN\resources as ZH; - -/** - * Machine learning based POS tagger. Typically, ContextWeightedPosTagger.php - * can train the language with some dataset and predict - * the tag given a list of word. - * - * Instruction to add a new language: - * Add a switch case in the constructor. 
- * Define the following functions: - * getKeyImpl - * See the class function 'getKey' for more information - * - * @author Xianghong Sun - */ -class ContextWeightedPosTagger -{ - /** - * Current Language, only tested on Simplified Chinese - * Might be extensable for other languages in the furture - * @var string - */ - public $lang; - /** - * The weight for predicting the pos tag - * y = wx + b - * Generized by training method - * @var array - */ - public $w; - /** - * The bias for predicting the pos tag - * y = wx + b - * Generized by training method - * @var array - */ - public $b; - /** - * range of w - */ - private $min_w; - private $max_w; - /** - * All Possiable tag set - * Generized by training method - * @var associative array [tag => tag index] - */ - private $tag_set; - /** - * The unknown words should be picked from these tags - */ - private $unknown_word_possiable_tags=[]; - /** - * Check if all the chars in the term is not current language - * @param $term is a string that to be checked - * @return true if all the chars in $term is not current language - * false otherwise - */ - public function notCurrentLang($term) - { - return preg_match("/^[^\p{Han}]+$/u", $term); - } - /** - * The constructer of the pos tagger - * To extend to other languages, some work are needed: - * Define $this->getKeyImpl, $this->rule_defined_key - * See Chinese example. - * @param @string $lang describes current langauge - * @param @book $packed describes how weight and bias would look like - */ - public function __construct($lang, $packed = true) - { - //$this->packed = $packed; - switch($lang) { - case("zh_CN"): - case("zh-CH"): - $this->lang = "zh_CN"; - /* - * Some Exception of Tags. Some tags are detemined by ruls. - * e.x. There are infinity amount of Arabic numerals. - */ - $this->getKeyImpl = function($term) { - $key = ZH\Tokenizer::POSGetKey($term); - return $key ? 
$this->tag_set[$key] : $term; - }; - //Tags from above - $this->rule_defined_key = ['PU','CD','OD','NT','FW']; - //Unknown word possiable tag - $this->unknown_word_possiable_tags=["NN","NR","VV","VA"]; - break; - default: - $this->lang = $lang; - } - } - /** - * __call for calling dynamic methods - * @param string $method method of this class to call - * @param array $args arguments to pass to method - * @return mixed result of method calculation - */ - public function __call($method, $args) - { - return call_user_func_array($this->$method, $args); - } - /** - * __get for getting dynamic variables - * @param string $var_name variable to retrieve - * @return mixed result of retrieval - */ - public function __get($var_name) - { - return $this->$var_name; - } - /** - * __set for assigning dynamic variables - * @param string $var_name variable to assign - * @param mixed $value value to assign to it - */ - public function __set($var_name, $value) - { - $this->$var_name = $value; - } - /** - * check if the term can be determined by algorithm, - * usually by regualr expression, because there are infinity - * amount of them. - * ex. 13th is an ordinal number, 123 is a cardinal number - * then use the determined tag to be the weight key - * @param @string $term is the term to be checked - * @return right key in feature matrix - */ - public function getKey($term) - { - if (isset($this->getKeyImpl)) { - return $this->getKeyImpl($term); - } - return $term; - } - - /** - * A function that process the trainning data - * @param @mixed $text_files can be a file or an array of file names - * @return @array of seperated sentences, each sentenfce have the format of - * [[words...],[tags...]] - * Currently, the trainning data needs to fit CTB format: - * term followed by a underscore and followed by the tag - * e.g. 
"新_VA 的_DEC 南斯拉夫_NR 会国_NN" - * To adapt to other language, some modifications are needed - */ - public static function processTexts($text_files, $term_tag_splier="_", - $term_process = null, $tag_process = null) - { - $ret=[]; - foreach($text_files as $text_file) { - if (file_exists($text_file)) { - $fn = fopen($text_file,"r"); - while(! feof($fn)) { - $line = fgets($fn); - if(strpos($line, '<') !== false) { - continue; - } - $word_tag_pairs = preg_split("/[\s ]+/u", $line); - if (!count($word_tag_pairs)) { - continue; - } - $ret[]=[]; - $ret[count($ret)-1][0]=[]; - $ret[count($ret)-1][1]=[]; - foreach ($word_tag_pairs as $word_tag_pair) { - $t = explode($term_tag_splier, $word_tag_pair); - - if (count($t) == 2) { - $ret[count($ret)-1][0][] = - $term_process ? $term_process($t[0]) : $t[0]; - $ret[count($ret)-1][1][] = - $tag_process ? $tag_process($t[1]) : $t[1]; - } - } - } - fclose($fn); - } - } - return $ret; - } - /** - * Function to train a data - * Notice: This function might run very long time, depending on training set - * @param @mixed $text_files are training data - * can be a file or an array of file names - * @param @float $learning_rate - * @param @int $max_epoch 1200 might be a good one, - * the weight will overfit if it's greater than this number - * @param @bool $resume if true, read the weight file and continue training - * if false, start from beginning - */ - public function train($text_files, $term_tag_splier="_", $learning_rate=0.1, - $max_epoch = 1200, $term_process = null, $tag_process = null, - $resume = false) - { - if (is_string($text_files)) { - $text_files = [$text_files]; - } - echo "Reading files\n"; - // term_tag_sentences[sentence#]=[[words...],[tags...]] - $term_tag_sentences = self::processTexts($text_files, $term_tag_splier, - $term_process, $tag_process); - if ($resume) { - echo "Loading weights... 
"; - $this->load_weight(true); - $tag_index = count($this->tag_set); - echo "ok\n"; - } else { - $this->w=[]; - $this->tag_set=[]; - $tag_index = 0; - if (isset($this->rule_defined_key)) { - foreach($this->rule_defined_key as $k) { - $this->tag_set[$k] = $tag_index++; - } - } - for ($i = -4; $i <= -1; $i++) { - $this->w[$i] = []; - } - } - foreach ($term_tag_sentences as $term_tag_pairs) { - $terms=$term_tag_pairs[0]; - $tags=$term_tag_pairs[1]; - for ($i = 0; $i < count($terms); $i++) { - if (!isset($this->tag_set[$tags[$i]])) { - $this->tag_set[$tags[$i]] = $tag_index++; - } - $k = $this->getIndex($i,$terms); - if (!isset($this->w[$k])) { - $this->w[$k] = []; - } - } - } - foreach (array_keys($this->w) as $key) { - for ($i=-2; $i<=2;$i++) { - if (!isset($this->w[$key][$i])) { - $this->w[$key][$i] = []; - } - foreach($this->tag_set as $possiable_tag => $tag_index) { - if (!isset($this->w[$key][$i][$tag_index])) { - $this->w[$key][$i][$tag_index] = 0; - } - } - } - } - foreach($this->tag_set as $possiable_tag => $tag_index) { - if (!isset($this->b[$tag_index])) { - $this->b[$tag_index] = 0; - } - } - echo "Training\n"; - //train the weight - $cross_entropy_loss = 1; - $pre_cross_entropy_loss = 2; - for ($epoch = 0; $epoch < $max_epoch && $pre_cross_entropy_loss - - $cross_entropy_loss > 0.000001; $epoch++) { - $this->min_w=0; - $this->max_w=0; - $time = time(); - $dy_dw = []; - $dy_dw_n = []; - $pre_cross_entropy_loss = $cross_entropy_loss; - $cross_entropy_loss = 0; - $cross_entropy_loss_n = 0; - - $dy_db=[]; - $dy_db_n=[]; - for($i = 0; $i < count($this->tag_set); $i++) { - $dy_db[$i] = 0; - $dy_db_n[$i] = 0; - } - //for each sentence - foreach ($term_tag_sentences as $term_tag_pairs) { - $terms=$term_tag_pairs[0]; - $tags=$term_tag_pairs[1]; - for ($i = 0; $i < count($terms); $i++) { - $k=[]; - for ($j=-2; $j<=2;$j++) { - $k[$j]= $this->getIndex($i+$j,$terms); - } - foreach ($this->tag_set as $possiable_tag => $tag_index) { - $equality = $possiable_tag == 
$tags[$i] ? 1 : 0; - $sum=0; - for ($j=-2; $j<=2;$j++) { - $sum += $this->w[$k[$j]][$j][$tag_index]; - } - $sum += $this->b[$tag_index]; - $sigmoid = 1 / (1 + exp(-1 * $sum)); - for ($j=-2; $j<=2;$j++) { - if (!isset($dy_dw[$k[$j]])) { - $dy_dw[$k[$j]] = []; - $dy_dw_n[$k[$j]] = []; - } - if (!isset($dy_dw[$k[$j]][$j])) { - $dy_dw[$k[$j]][$j] = []; - $dy_dw_n[$k[$j]][$j] = []; - } - if (!isset($dy_dw[$k[$j]][$j][$tag_index])) { - $dy_dw[$k[$j]][$j][$tag_index] = 0; - $dy_dw_n[$k[$j]][$j][$tag_index] = 0; - } - - $dy_dw[$k[$j]][$j][$tag_index] += - ($sigmoid - $equality); - $dy_dw_n[$k[$j]][$j][$tag_index] += 1; - - } - //dy_db - $dy_db[$tag_index] += ($sigmoid - $equality); - $dy_db_n[$tag_index] += 1; - $cross_entropy_loss+= - - $equality*log($sigmoid) - - (1-$equality)*log(1-$sigmoid); - $cross_entropy_loss_n++; - } - } - } - $cross_entropy_loss /= $cross_entropy_loss_n; - $duration = time() - $time; - echo "epoch {$epoch} cross_entropy {$cross_entropy_loss}" . - " Takes {$duration} seconds\n"; - foreach ($dy_dw as $i =>$v1) { - foreach ($v1 as $j =>$v2) { - foreach ($v2 as $k =>$v3) { - $this->w[$i][$j][$k] -= - $dy_dw[$i][$j][$k] / - $dy_dw_n[$i][$j][$k] * - $learning_rate; - if ($this->w[$i][$j][$k] < $this->min_w) { - $this->min_w = $this->w[$i][$j][$k]; - } - if ($this->w[$i][$j][$k] > $this->max_w) { - $this->max_w = $this->w[$i][$j][$k]; - } - } - } - } - foreach ($dy_db as $k =>$v) { - $this->b[$k]-= - $dy_db[$k] / - $dy_db_n[$k] * - $learning_rate; - } - if ($epoch % 10 == 9 ) { - $this->save_weight(); - } - } - $this->save_weight(); - return true; - } - /** - * The primary function to predit the tag - * @param mixed $sentence is an array of segmented words/terms - * or a string with words/terms seperated by space - * @return @array of tags - */ - public function predict($sentence) - { - if (!is_array($sentence)) { - if ($sentence == "") { - $terms=[]; - } else { - $terms=preg_split("/[\s]+/",$sentence); - } - } else { - $terms=$sentence; - } - if 
(!count($terms)) { - return []; - } - if (!$this->w) { - $this->load_weight(); - } - $ret = []; - for($i = 0; $i < count($terms); $i++) { - $term = $terms[$i]; - $score =[]; - $key=$this->getKey($term); - foreach($this->tag_set as $possiable_tag => $tag_index) { - $score[$possiable_tag]=0; - for ($j=-2; $j <=2; $j++) { - $k=$this->getIndex($i+$j, $terms); - if (isset($this->w[$k])) { - $score[$possiable_tag] += - $this->getW($k,$j,$tag_index); - } else if ($j==0&&!in_array($possiable_tag, - $this->unknown_word_possiable_tags)) { - $score[$possiable_tag] += $this->min_w; - } - } - - $score[$possiable_tag] += $this->getB($tag_index); - - //$score[$possiable_tag] - // += 1 / (1 + exp(-1 * $score[$possiable_tag])); - } - $ret[]=array_keys($score, max($score))[0]; - } - return $ret; - } - /** - * Wrap function for predict - * @param $texts to be a @string of texts - * @param $return_string is a boolean to determing if the user - * want it to out put to stdout or a return value - * @return @string if $return_string is true; - * @boolean true otherwise - * e.g. 
中国_NR 人民_NN 将_AD 满怀信心_VV - * 地_DEV 开创_VV 新_VA 的_DEC 业绩_NN 。_PU - */ - public function tag($texts, $return_string=false) - { - if ($return_string) { - $ret = ""; - } - $sentences = preg_split('/\r\n|\r|\n/', $texts); - foreach($sentences as $sentence) { - $sentence=explode(" ",trim($sentence)); - $term_pos = $this->predict($sentence); - for($i = 0; $i < count($term_pos); $i++) { - $term_pos[$i]=$sentence[$i]."_".$term_pos[$i]; - } - $t = join(" ", $term_pos); - if ($return_string) { - $ret .= $t; - } else { - echo $t, "\n"; - } - } - if ($return_string) { - return $ret; - } else { - return true; - } - } - /** - * A list of private helper functions - * Given a setence ($term), find the key at position $index - */ - private function getIndex($index, $terms) - { - if ($index < 0) $k = $index - 2; - else if ($index >= count($terms)) { - $k = $index - count($terms) - 2; - } - else { - $k = $this->getKey($terms[$index]); - } - return $k; - } - /** - * Get the bias value for tag - */ - private function getB($tag_index) - { - return unpack("f",$this->b,$tag_index*4)[1]; - } - /** - * Set the bias value for tag - */ - private function setB($tag_index, $value) - { - $this->b = substr_replace($this->b,pack("f",$value),$tag_index*4,4); - } - /** - * Get the weight value for term at postion for tag - */ - private function getW($term, $position, $tag_index) - { - $t = unpack("S",$this->w[$term], - 2*($position+2)*count($this->tag_set)+$tag_index*2)[1] - / 65535 - * ($this->max_w-$this->min_w) + $this->min_w;; - return $t; - } - /** - * save the trained weight to disk - */ - private function save_weight() - { - $out_file = C\LOCALE_DIR . 
"/{$this->lang}/resources/pos_weight.txt.gz"; - $out = []; - $out["min_w"] = $this->min_w; - $out["max_w"] = $this->max_w; - $out["w"]=[]; - foreach(array_keys($this->w) as $key) { - $out["w"][$key] = $this->pack_w($key); - } - $out["b"] = $this->pack_b(); - $out["tag_set"] = $this->tag_set; - echo "Saving..."; - file_put_contents($out_file, - gzencode(serialize($out),9)); - echo " ok\n"; - } - /** - * load the trained weight from disk - */ - private function load_weight($trainning_load=false) - { - $dic_file = C\LOCALE_DIR . - "/{$this->lang}/resources/pos_weight.txt.gz"; - if (!file_exists($dic_file)) { - echo "$dic_file does not exist!"; - exit(); - } - $f = unserialize(gzdecode(file_get_contents($dic_file)) - ,['allowed_classes' => false]); - $this->w=$f["w"]; - $this->b=$f["b"]; - $this->min_w=$f["min_w"]; - $this->max_w=$f["max_w"]; - $this->tag_set=$f["tag_set"]; - if ($trainning_load) { - foreach(array_keys($this->w) as $key) { - $this->w[$key] = $this->unpack_w($key); - } - $this->b = $this->unpack_b($this->b); - } - } - /** - * Pack the bias - */ - private function pack_b() - { - return pack("f*", ...$this->b); - } - /** - * Unpack the bias - */ - private function unpack_b() - { - return array_merge(unpack("f".strval(count($this->tag_set)),$this->b)); - } - /** - * Pack the weight - */ - private function pack_w($key) - { - $bin_str = ""; - foreach($this->w[$key] as $i => $t) { - foreach($t as $u) { - $v = 65535 * ($u-$this->min_w) / ($this->max_w-$this->min_w); - $bin_str .= pack("S", intval($v)); - } - } - return $bin_str; - } - /** - * Unpack the weight - */ - private function unpack_w($key) - { - $tmp = []; - $size = count($this->tag_set); - for ($i = 0; $i < 5; $i++) { - $tmp[$i-2] = array_merge(unpack("S".strval($size), - $this->w[$key], 2*$i*count($this->tag_set))); - for($j = 0; $j < $size; $j++) { - $tmp[$i-2][$j] = $tmp[$i-2][$j] / 65535 - * ($this->max_w-$this->min_w) + $this->min_w; - } - } - return $tmp; - } -} diff --git 
a/src/library/CrawlDaemon.php b/src/library/CrawlDaemon.php index efac71dfe..c7265ab7f 100644 --- a/src/library/CrawlDaemon.php +++ b/src/library/CrawlDaemon.php @@ -239,7 +239,7 @@ class CrawlDaemon implements CrawlConstants * Sends the message to stardard out if crawlLog not set up; otherwise, * sends to crawlLog() * - * @param string $masg string to log to either standard out or + * @param string $msg string to log to either standard out or * to Yioop's crawlLog * @param int $exit_type the exit_type used by init() and start() * values of absolute value >2 are only used if crawlLog has diff --git a/src/library/DoubleIndexBundle.php b/src/library/DoubleIndexBundle.php index f4cf5c69b..e71165ebd 100644 --- a/src/library/DoubleIndexBundle.php +++ b/src/library/DoubleIndexBundle.php @@ -210,7 +210,7 @@ class DoubleIndexBundle implements CrawlConstants * * @param int $generation field used to select partition * @param string $offset_field field used to record offsets after storing - * @param array& $pages data to store + * @param array &$pages data to store * @param int $visited_urls_count number to add to the count of visited urls * (visited urls is a smaller number than the total count of objects * stored in the index). diff --git a/src/library/FeedArchiveBundle.php b/src/library/FeedArchiveBundle.php index ac6df08fc..ef41f1674 100644 --- a/src/library/FeedArchiveBundle.php +++ b/src/library/FeedArchiveBundle.php @@ -37,18 +37,26 @@ use seekquarry\yioop\configs as C; */ require_once __DIR__ . '/Utility.php'; /** + * Subclass of IndexArchiveBundle with bloom filters to make it easy to check + * if a news feed item has been added to the bundle already before adding it * * @author Chris Pollett */ class FeedArchiveBundle extends IndexArchiveBundle { /** - * + * Used to store unique identifiers of feed itemms that have been stored + * in this FeedArchiveBundle. 
This filter_a is used for checking if items + * are already in the archive, when it has URL_FILTER_SIZE/2 items + * filter_b is added to as well as filter_a. When filter_a is of size + * URL_FILTER_SIZE filter_a is deleted, filter_b is renamed to filter_a + * and the process is repeated. * @var BloomFilterFile */ public $filter_a; /** - * + * Auxiliary BloomFilterFile used in checking if feed items are in this + * archive or not. @see $filter_a * @var BloomFilterFile */ public $filter_b; @@ -96,7 +104,7 @@ class FeedArchiveBundle extends IndexArchiveBundle * @param string $offset_field field used to record offsets after storing * @param string $key_field field used to store unique identifier for a * each page item. - * @param array& $pages data to store + * @param array &$pages data to store * @param int $visited_urls_count number to add to the count of visited urls * (visited urls is a smaller number than the total count of objects * stored in the index). @@ -112,7 +120,12 @@ class FeedArchiveBundle extends IndexArchiveBundle $visited_urls_count); } /** - * + * Adds the key (often GUID) of a feed item to the bloom filter pair + * associated with this archive. This always adds to filter a, if + * filter a is more than half full it adds to filter b. If filter a is full + * it is deletedand filter b is renamed filter a and te process continues + * where a new filter b is created when this becomee half full. 
+ * @param string $key unique identifier of a feed item */ public function addFilters($key) { @@ -138,7 +151,10 @@ class FeedArchiveBundle extends IndexArchiveBundle } } /** - * + * Whether the active filter for this feed contain thee feed item + * of thee supplied key + * @param string $key the feed item id to check if in arcive + * @return bool true if it is in the archive, false otherwise */ public function contains($key) { diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php index 449eebc5a..42074ec88 100755 --- a/src/library/FetchUrl.php +++ b/src/library/FetchUrl.php @@ -582,7 +582,7 @@ class FetchUrl implements CrawlConstants * Computes a hash of a string containing page data for use in * deduplication of pages with similar content * - * @param string& $page reference to web page data + * @param string &$page reference to web page data * @return string 8 byte hash to identify page contents */ public static function computePageHash(&$page) diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index cef420567..693bbd1d9 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -176,7 +176,7 @@ class IndexArchiveBundle implements CrawlConstants * * @param int $generation field used to select partition * @param string $offset_field field used to record offsets after storing - * @param array& $pages data to store + * @param array &$pages data to store * @param int $visited_urls_count number to add to the count of visited urls * (visited urls is a smaller number than the total count of objects * stored in the index). 
diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php index 45dcbb435..3c7da6a92 100644 --- a/src/library/IndexDictionary.php +++ b/src/library/IndexDictionary.php @@ -1110,15 +1110,15 @@ class IndexDictionary implements CrawlConstants * @param int $file_num which prefix file to read from (always reads * a file at the max_tier level) * @param int $num_aux_records - * @param int& $total_count + * @param int &$total_count * @param int $threshold - * @param array& $info - * @param int& $previous_generation - * @param int& $num_generations + * @param array &$info + * @param int &$previous_generation + * @param int &$num_generations * @param int $offset * @param int $num_distinct_generations - * @param int& $max_retained_generation - * @param array& $id_info + * @param int &$max_retained_generation + * @param array &$id_info */ public function addAuxInfoRecords($id, $file_num, $num_aux_records, &$total_count, $threshold, &$info, &$previous_generation, @@ -1173,7 +1173,7 @@ class IndexDictionary implements CrawlConstants * $max_retained_generation, $info) and filters blank entries from * $info and returns the resulting triple * - * @param int& $total_count + * @param int &$total_count * @param int $max_retained_generation * @param array $info * @return array resulting triple @@ -1197,14 +1197,14 @@ class IndexDictionary implements CrawlConstants * the quadruple array for * @param array $record current record from dictionary that we may or may * not add to info - * @param array& $info quadruple array we are adding to - * @param int& $total_count count of items in $info - * @param int& $previous_generation last generation added to $info - * @param int& $previous_id last exact if added to $info - * @param int& $num_generations + * @param array &$info quadruple array we are adding to + * @param int &$total_count count of items in $info + * @param int &$previous_generation last generation added to $info + * @param int &$previous_id last exact if added 
to $info + * @param int &$num_generations * @param int $num_distinct_generations - * @param int& $max_retained_generation - * @param array& $id_info + * @param int &$max_retained_generation + * @param array &$id_info */ public function addLookedUpEntry($id, $word_id, $record, &$info, &$total_count, &$previous_generation, &$previous_id, @@ -1285,7 +1285,7 @@ class IndexDictionary implements CrawlConstants * @param int $file_num which dictionary file (given by first letter prefix) * to read from * @param int $bytes byte offset to start reading from - * @return &string data fromIndexShard file + * @return string &data fromIndexShard file */ public function &readBlockDictAtOffset($file_num, $bytes) { diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index c841c1871..23e33d2a2 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -245,18 +245,12 @@ class IndexManager implements CrawlConstants } /** * Returns the number of document that a given term or phrase appears in - * in the given index + * in the given index where we discount later generation -- those with + * lower document rank more * - * @param string $term_or_phrase what to look up in the indexes dictionary + * @param string $term what to look up in the indexes dictionary * no mask is used for this look up * @param string $index_name index to look up term or phrase in - * @param int $threshold if set and positive then once threshold many - * documents are found the search for more documents to add to the - * total is stopped - * @param int $start_generation what generation in the index to start - * finding occurrence of phrase from - * @param int $num_distinct_generations from $start_generation how - * many generation to search forward to * @return int number of documents */ public static function discountedNumDocsTerm($term, $index_name) diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 6c41e8d29..26f1f957e 100644 --- 
a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -31,12 +31,10 @@ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; - /** * Load charCopy */ require_once __DIR__ . "/Utility.php"; - /** * Data structure used to store one generation worth of the word document * index (inverted index). This data structure consists of three main @@ -634,7 +632,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants * * @param int $start_offset of the current posting list for query term * used in calculating BM25F. - * @param int& $next_offset where to start in word docs + * @param int &$next_offset where to start in word docs * @param int $last_offset offset at which to stop by * @param int $len number of documents desired * @param int $direction which direction to iterate through elements @@ -997,7 +995,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants * Computes BM25F relevance and a score for the supplied item based * on the supplied parameters. * - * @param array& $item doc summary to compute a relevance and score for. + * @param array &$item doc summary to compute a relevance and score for. 
* Pass-by-ref so self::RELEVANCE and self::SCORE fields can be changed * @param int $occurrences - number of occurences of the term in the item * @param int $doc_len number of words in doc item represents @@ -1036,9 +1034,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants * * @param int $current an index into the word_docs strings * corresponds to a start search loc of $current * self::POSTING_LEN - * @param int& $posting_start after function call will be + * @param int &$posting_start after function call will be * index of start of nearest posting to current - * @param int& $posting_end after function call will be + * @param int &$posting_end after function call will be * index of end of nearest posting to current * @return string the substring of word_docs corresponding to the posting */ @@ -1194,7 +1192,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants * position $current forward until either $end is reached or a * posting with document index bigger than $doc_index is found * - * @param int& $current current posting offset into posting list + * @param int &$current current posting offset into posting list * @param int $doc_index document index want bigger than or equal to * @param int $end last index of posting list * @return int document index bigger than or equal to $doc_index. 
Since @@ -2027,7 +2025,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants * @param int $bytes byte offset to start reading from * @param bool $cache whether to cache disk blocks that have been read to * RAM - * @return &string data fromIndexShard file + * @return mixed data fromIndexShard file if found, false otherwise */ public function readBlockShardAtOffset($bytes, $cache = true) { @@ -2141,9 +2139,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants * Load an IndexShard from a file or string * * @param string $fname the name of the file to the IndexShard from/to - * @param string& $data stringified shard data to load shard from. If null + * @param string &$data stringified shard data to load shard from. If null * then the data is loaded from the $fname if possible - * @return object the IndexShard loaded + * @return IndexShard the IndexShard loaded */ public static function load($fname, &$data = null) { @@ -2216,7 +2214,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants * Callback function for load method. splits a word_key . word_info string * into an entry in the passed shard $shard->words[word_key] = $word_info. * - * @param string& $value the word_key . word_info string + * @param string &$value the word_key . word_info string * @param int $key index in array - we don't use * @param object $shard IndexShard to add the entry to word table for */ diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php index b514c7c71..5450dda77 100755 --- a/src/library/LocaleFunctions.php +++ b/src/library/LocaleFunctions.php @@ -22,9 +22,6 @@ * * END LICENSE * - * This file contains global functions connected to localization that - * are used throughout the web site part of Yioop! 
- * * @author Chris Pollett chris@pollett.org * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ @@ -33,10 +30,15 @@ */ namespace seekquarry\yioop\library; +/** + * This file contains global functions connected to localization that + * are used throughout the web site part of Yioop! + */ use seekquarry\yioop\configs as C; use seekquarry\yioop\models\LocaleModel; - -/** For Yioop global defines */ +/** + * For Yioop global defines + */ require_once __DIR__."/../configs/Config.php"; /** * Returns an array of locales that have a stop words list and a stop words @@ -50,7 +52,9 @@ function localesWithStopwordsList() 'vi-VN', 'zh-CN']; } /** - * + * Converts a $locale_tag (major-minor) to an Iso 632-2 language name + * @param string $locale_tag want to convert + * @return string corresponding Iso 632-2 language tag */ function localeTagToIso639_2Tag($locale_tag) { @@ -277,7 +281,7 @@ function guessEncodingHtmlXml($html, $return_loc_info = false) * Converts page data in a site associative array to UTF-8 if it is not * already in UTF-8 * - * @param array& $site an associative of info about a web site + * @param array &$site an associative of info about a web site * @param string $page_field the field in the associative array that * contains the $site's web page as a string. 
/**
 * Machine learning based named entity recognizer.
 * NamedEntityContextTagger is used by @see StochasticTermSegmenter
 * to help in segmenting sentences in which no term separators such as spaces
 * are used.
 *
 * @author Xianghong Sun (Principal),
 *  Chris Pollett (mainly simplifications, and documentation)
 */
class NamedEntityContextTagger extends ContextTagger
{
    /**
     * Constructor for the NamedEntityContextTagger.
     * Sets the language this tagger tags for and sets up the path for
     * where its weight file should be stored
     *
     * @param string $lang locale tag of the language this tagger tags for
     */
    public function __construct($lang)
    {
        $this->tagger_file = "nect_weights.txt.gz";
        parent::__construct($lang);
    }
    /**
     * Uses text files containing sentences to create a matrix
     * so that from a two chars before a term, two chars after a char context,
     * together with a two tags before a term context and a term,
     * the odds that a named entity has been found can be calculated
     *
     * @param mixed $text_files with training data. These can be a file or
     *  an array of file names.
     * @param string $term_tag_separator separator used to separate term and
     *  tag for terms in an input sentence
     * @param float $learning_rate learning rate when cycling over data trying
     *  to minimize the cross-entropy loss in the prediction of the tag of the
     *  middle term.
     * @param int $num_epoch maximum number of times to cycle through the
     *  complete data set. Training also stops early once the epoch-over-epoch
     *  improvement in cross-entropy drops below 0.000001
     * @param callable $term_callback callback function applied to a term
     *  before adding term to sentence term array as part of processing and
     *  training with a sentence.
     * @param callable $tag_callback callback function applied to a tag
     *  before adding tag to sentence tag array as part of processing and
     *  training with a sentence.
     * @param bool $resume accepted for signature compatibility with
     *  PartOfSpeechContextTagger::train but currently ignored by this
     *  tagger -- training always starts from scratch
     */
    public function train($text_files, $term_tag_separator = "-",
        $learning_rate = 0.1, $num_epoch = 1200, $term_callback = null,
        $tag_callback = null, $resume = false)
    {
        if (is_string($text_files)) {
            $text_files = [$text_files];
        }
        echo "Reading files\n";
        // term_tag_sentences[sentence#] = [[words...], [tags...]]
        $term_tag_sentences = self::processTexts($text_files,
            $term_tag_separator, $term_callback, $tag_callback);
        $this->word_feature = [];
        $this->tag_set = [];
        $tag_index = 0;
        // keys -4..-1 are out-of-sentence sentinel positions used by getIndex
        for ($i = -4; $i <= -1; $i++) {
            $this->word_feature[$i] = [];
        }
        /* sentence-start contexts are the same for every sentence, so they
           are initialized once here rather than once per sentence */
        $this->tag_feature["start"] = [];
        $this->tag_feature["start-start"] = [];
        /* First pass: discover the tag set and create empty feature rows
           for every term and every previous-tag / previous-tag-pair context
           seen in the training data */
        foreach ($term_tag_sentences as $term_tag_pairs) {
            $terms = $term_tag_pairs[0];
            $tags = $term_tag_pairs[1];
            for ($i = 0; $i < count($terms); $i++) {
                if (!isset($this->tag_set[$tags[$i]])) {
                    $this->tag_set[$tags[$i]] = $tag_index++;
                }
                if ($i > 0) {
                    // context key built from the one or two preceding tags
                    $pair_key = ($i == 1) ? "start-" . $tags[$i - 1] :
                        $tags[$i - 2] . "-" . $tags[$i - 1];
                    if (!isset($this->tag_feature[$pair_key])) {
                        $this->tag_feature[$pair_key] = [];
                    }
                    if (!isset($this->tag_feature[$tags[$i - 1]])) {
                        $this->tag_feature[$tags[$i - 1]] = [];
                    }
                }
                if (!isset($this->word_feature[$terms[$i]])) {
                    $this->word_feature[$terms[$i]] = [];
                }
            }
        }
        // Zero-fill every (term, relative-position, tag) weight cell
        foreach (array_keys($this->word_feature) as $key) {
            for ($i = -2; $i <= 2; $i++) {
                if (!isset($this->word_feature[$key][$i])) {
                    $this->word_feature[$key][$i] = [];
                }
                foreach ($this->tag_set as $possible_tag => $tag_index) {
                    if (!isset($this->word_feature[$key][$i][$tag_index])) {
                        $this->word_feature[$key][$i][$tag_index] = 0;
                    }
                }
            }
        }
        // Zero-fill every (tag-context, tag) weight cell
        foreach (array_keys($this->tag_feature) as $key) {
            foreach ($this->tag_set as $possible_tag => $tag_index) {
                if (!isset($this->tag_feature[$key][$tag_index])) {
                    $this->tag_feature[$key][$tag_index] = 0;
                }
            }
        }
        // Zero-fill per-tag bias terms
        foreach ($this->tag_set as $possible_tag => $tag_index) {
            if (!isset($this->bias[$tag_index])) {
                $this->bias[$tag_index] = 0;
            }
        }
        echo "Training...\n";
        /* Gradient descent on averaged per-feature gradients. Training stops
           after $num_epoch epochs or when loss improvement is below 1e-6 */
        $cross_entropy_loss = 1;
        $pre_cross_entropy_loss = 2;
        for ($epoch = 0; ($epoch < $num_epoch) &&
            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001;
            $epoch++) {
            $this->min_w = 0;
            $this->max_w = 0;
            $time = time();
            $dy_dw = [];
            $dy_dw_n = [];
            $pre_cross_entropy_loss = $cross_entropy_loss;
            $cross_entropy_loss = 0;
            $cross_entropy_loss_n = 0;
            $dy_db = [];
            $dy_db_n = [];
            $dy_dt = [];
            $dy_dt_n = [];
            for ($i = 0; $i < count($this->tag_set); $i++) {
                $dy_db[$i] = 0;
                $dy_db_n[$i] = 0;
            }
            //for each sentence accumulate gradients for every term position
            foreach ($term_tag_sentences as $term_tag_pairs) {
                $terms = $term_tag_pairs[0];
                $tags = $term_tag_pairs[1];
                for ($i = 0; $i < count($terms); $i++) {
                    $k = [];
                    for ($j = -2; $j <= 2; $j++) {
                        $k[$j] = $this->getIndex($i + $j, $terms);
                    }
                    /* previous-tag contexts depend only on position, not on
                       the candidate tag, so compute them once per position */
                    if ($i == 0) {
                        $tf1 = "start";
                        $tf2 = "start-start";
                    } else if ($i == 1) {
                        $tf1 = $tags[$i - 1];
                        $tf2 = "start-" . $tags[$i - 1];
                    } else {
                        $tf1 = $tags[$i - 1];
                        $tf2 = $tags[$i - 2] . "-" . $tags[$i - 1];
                    }
                    foreach ($this->tag_set as $possible_tag => $tag_index) {
                        $equality = $possible_tag == $tags[$i] ? 1 : 0;
                        $sum = 0;
                        //5 words including itself
                        for ($j = -2; $j <= 2; $j++) {
                            $sum +=
                                $this->word_feature[$k[$j]][$j][$tag_index];
                        }
                        //previous 2 tags
                        $sum += $this->tag_feature[$tf1][$tag_index];
                        $sum += $this->tag_feature[$tf2][$tag_index];
                        //bias
                        $sum += $this->bias[$tag_index];
                        $sigmoid = 1 / (1 + exp(-1 * $sum));
                        // accumulate word-feature gradient and sample counts
                        for ($j = -2; $j <= 2; $j++) {
                            if (!isset($dy_dw[$k[$j]])) {
                                $dy_dw[$k[$j]] = [];
                                $dy_dw_n[$k[$j]] = [];
                            }
                            if (!isset($dy_dw[$k[$j]][$j])) {
                                $dy_dw[$k[$j]][$j] = [];
                                $dy_dw_n[$k[$j]][$j] = [];
                            }
                            if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
                                $dy_dw[$k[$j]][$j][$tag_index] = 0;
                                $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
                            }
                            $dy_dw[$k[$j]][$j][$tag_index] +=
                                ($sigmoid - $equality);
                            $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
                        }
                        // accumulate tag-feature gradient (dy_dt)
                        if (!isset($dy_dt[$tf1])) {
                            $dy_dt[$tf1] = [];
                            $dy_dt_n[$tf1] = [];
                        }
                        if (!isset($dy_dt[$tf1][$tag_index])) {
                            $dy_dt[$tf1][$tag_index] = 0;
                            $dy_dt_n[$tf1][$tag_index] = 0;
                        }
                        if (!isset($dy_dt[$tf2])) {
                            $dy_dt[$tf2] = [];
                            $dy_dt_n[$tf2] = [];
                        }
                        if (!isset($dy_dt[$tf2][$tag_index])) {
                            $dy_dt[$tf2][$tag_index] = 0;
                            $dy_dt_n[$tf2][$tag_index] = 0;
                        }
                        $dy_dt[$tf1][$tag_index] += ($sigmoid - $equality);
                        $dy_dt_n[$tf1][$tag_index] += 1;
                        $dy_dt[$tf2][$tag_index] += ($sigmoid - $equality);
                        $dy_dt_n[$tf2][$tag_index] += 1;
                        // accumulate bias gradient (dy_db)
                        $dy_db[$tag_index] += ($sigmoid - $equality);
                        $dy_db_n[$tag_index] += 1;
                        $cross_entropy_loss -= ($equality * log($sigmoid) +
                            (1 - $equality) * log(1 - $sigmoid));
                        $cross_entropy_loss_n++;
                    }
                }
            }
            $cross_entropy_loss /= $cross_entropy_loss_n;
            $duration = time() - $time;
            echo "Epoch {$epoch} cross_entropy {$cross_entropy_loss}" .
                " took {$duration} seconds\n";
            /* apply averaged gradient updates; also track the extreme
               weights seen (min_w/max_w) for use elsewhere in the tagger */
            foreach ($dy_dw as $i => $v1) {
                foreach ($v1 as $j => $v2) {
                    foreach ($v2 as $k => $v3) {
                        $this->word_feature[$i][$j][$k] -= $dy_dw[$i][$j][$k]
                            / $dy_dw_n[$i][$j][$k] * $learning_rate;
                        if ($this->word_feature[$i][$j][$k] < $this->min_w) {
                            $this->min_w = $this->word_feature[$i][$j][$k];
                        }
                        if ($this->word_feature[$i][$j][$k] > $this->max_w) {
                            $this->max_w = $this->word_feature[$i][$j][$k];
                        }
                    }
                }
            }
            foreach ($dy_dt as $i => $v1) {
                foreach ($v1 as $j => $v2) {
                    $this->tag_feature[$i][$j] -= $dy_dt[$i][$j] /
                        $dy_dt_n[$i][$j] * $learning_rate;
                }
            }
            foreach ($dy_db as $k => $v) {
                $this->bias[$k] -= $dy_db[$k] / $dy_db_n[$k] * $learning_rate;
            }
            // checkpoint every tenth epoch so long runs can be recovered
            if ($epoch % 10 == 9) {
                $this->saveWeights();
            }
        }
        $this->saveWeights();
    }
    /**
     * Predicts named entities that exist in a sentence.
     *
     * @param mixed $sentence is an array of segmented words/terms
     *  or a string that will be split on white space
     * @return array all predicted named entities together with a tag
     *  indicating kind of named entity
     *  ex. [["郑振铎","nr"],["国民党","nt"]]
     */
    public function predict($sentence)
    {
        if (!is_array($sentence)) {
            if ($sentence == "") {
                $terms = [];
            } else {
                $terms = preg_split("/[\s]+/u", $sentence);
            }
        } else {
            $terms = $sentence;
        }
        if (!count($terms)) {
            return [];
        }
        if (!$this->word_feature) {
            $this->loadWeights();
        }
        /* First pass: greedily pick the highest scoring tag for each term,
           feeding each chosen tag back in as context for later terms */
        $result = [];
        for ($i = 0; $i < count($terms); $i++) {
            $score = [];
            // previous-tag contexts come from the tags already predicted
            if ($i == 0) {
                $tf1 = "start";
                $tf2 = "start-start";
            } else if ($i == 1) {
                $tf1 = $result[$i - 1];
                $tf2 = "start-" . $result[$i - 1];
            } else {
                $tf1 = $result[$i - 1];
                $tf2 = $result[$i - 2] . "-" . $result[$i - 1];
            }
            foreach ($this->tag_set as $possible_tag => $tag_index) {
                $score[$possible_tag] = 0;
                for ($j = -2; $j <= 2; $j++) {
                    $k = $this->getIndex($i + $j, $terms);
                    if (isset($this->word_feature[$k])) {
                        $score[$possible_tag] +=
                            $this->getW($k, $j, $tag_index);
                    }
                }
                $score[$possible_tag] += $this->getT($tf1, $tag_index);
                $score[$possible_tag] += $this->getT($tf2, $tag_index);
                $score[$possible_tag] += $this->getB($tag_index);
            }
            $result[] = array_keys($score, max($score))[0];
        }
        /* Second pass: merge runs of identically-tagged, non-"o" terms into
           entities. Entities of 10 or more characters are discarded as
           unlikely to be real named entities */
        $pre_tag = 'o';
        $current_entity = "";
        $ret = [];
        for ($i = 0; $i < count($terms); $i++) {
            if ($pre_tag != $result[$i] && $pre_tag != "o") {
                if (mb_strlen($current_entity) < 10) {
                    $ret[] = [$current_entity, $pre_tag];
                }
                $current_entity = "";
            }
            if ($result[$i] != "o") {
                if ($current_entity) {
                    $current_entity .= $terms[$i];
                } else {
                    $current_entity = $terms[$i];
                }
            }
            $pre_tag = $result[$i];
        }
        /* Bug fix: if the sentence ends while inside an entity, the original
           loop never emitted it; flush the trailing entity here */
        if ($current_entity != "" && $pre_tag != "o" &&
            mb_strlen($current_entity) < 10) {
            $ret[] = [$current_entity, $pre_tag];
        }
        return $ret;
    }
}
execute a single assignment rule on $page_data * * @param array $tree annotated syntax tree of an assignment rule - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record (will be changed by this operation) */ public function executeAssignmentRule($tree, &$page_data) @@ -304,7 +304,7 @@ class PageRuleParser implements CrawlConstants * of meta words for this page * * @param $field the key in $page_data to use - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function addMetaWord($field, &$page_data) @@ -329,7 +329,7 @@ class PageRuleParser implements CrawlConstants * which when clicked would perform a Yioop search on madonna. * * @param $field the key in $page_data to use - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function addKeywordLink($field, &$page_data) @@ -348,7 +348,7 @@ class PageRuleParser implements CrawlConstants * Set field variable to be used as a stack * * @param $field what field variable to use for current stack - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function setStack($field, &$page_data) @@ -367,7 +367,7 @@ class PageRuleParser implements CrawlConstants * stack * * @param $field what field to get data to push onto fcurrent stack - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function pushStack($field, &$page_data) @@ -390,7 +390,7 @@ class PageRuleParser implements CrawlConstants * stack * * @param 
$field what field to get data to push onto fcurrent stack - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function popStack($field, &$page_data) @@ -406,7 +406,7 @@ class PageRuleParser implements CrawlConstants * * @param $dir output directory in which to write data.txt files containing * the contents of some fields after writeOutput commands - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function setOutputFolder($dir, &$page_data) @@ -417,7 +417,7 @@ class PageRuleParser implements CrawlConstants * Set output format * * @param $format can be either csv or sql - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function setOutputFormat($format, &$page_data) @@ -430,7 +430,7 @@ class PageRuleParser implements CrawlConstants * Set output table * * @param $table table to use if output format is sql - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function setOutputTable($table, &$page_data) @@ -444,7 +444,7 @@ class PageRuleParser implements CrawlConstants * * * @param $field the key in $page_data to use - * @param array& $page_data an associative array of containing summary + * @param array &$page_data an associative array of containing summary * info of a web page/record */ public function toArray($field, &$page_data) @@ -463,7 +463,7 @@ class PageRuleParser implements CrawlConstants * and stores the result back into $page_data[$field] * * @param $field the key in $page_data to use - * @param array& $page_data an associative array of 
/**
 * Machine learning based Part of Speech tagger.
 * A PartOfSpeechContextTagger can be used to train a tagger for a language
 * according to some dataset. Once training is complete it can be used to
 * predict the tags for terms in a string or array of terms.
 *
 * @author Xianghong Sun (Principal),
 *  Chris Pollett (mainly simplifications, and documentation)
 */
class PartOfSpeechContextTagger extends ContextTagger
{
    /**
     * Constructor for the part of speech tagger.
     * Sets the language this tagger tags for and sets up the path for
     * where its weight file should be stored
     *
     * @param string $lang locale tag of the language this tagger tags for
     */
    public function __construct($lang)
    {
        $this->tagger_file = "pos_weights.txt.gz";
        parent::__construct($lang);
    }
    /**
     * Uses text files containing sentences to create a matrix
     * so that from a two term before a term, two term after a term context
     * and a term, the odds of each of its possible parts of speech can be
     * calculated
     *
     * @param mixed $text_files with training data. These can be a file or
     *  an array of file names. For now these files are assumed to be in
     *  Chinese Treebank format.
     * @param string $term_tag_separator separator used to separate term and
     *  tag for terms in an input sentence
     * @param float $learning_rate learning rate when cycling over data trying
     *  to minimize the cross-entropy loss in the prediction of the tag of the
     *  middle term.
     * @param int $num_epoch maximum number of times to cycle through the
     *  complete data set. Training also stops early once the epoch-over-epoch
     *  improvement in cross-entropy drops below 0.000001
     * @param callable $term_callback callback function applied to a term
     *  before adding term to sentence term array as part of processing and
     *  training with a sentence.
     * @param callable $tag_callback callback function applied to a part of
     *  speech tag before adding tag to sentence tag array as part of
     *  processing and training with a sentence.
     * @param bool $resume if true, read the weight file and continue training
     *  if false, start from beginning
     */
    public function train($text_files, $term_tag_separator = "-",
        $learning_rate = 0.1, $num_epoch = 1200, $term_callback = null,
        $tag_callback = null, $resume = false)
    {
        if (is_string($text_files)) {
            $text_files = [$text_files];
        }
        echo "Reading files\n";
        // term_tag_sentences[sentence#] = [[words...], [tags...]]
        $term_tag_sentences = self::processTexts($text_files,
            $term_tag_separator, $term_callback, $tag_callback);
        if ($resume) {
            // continue from the previously saved weight file
            echo "Loading weights... ";
            $this->loadWeights(true);
            $tag_index = count($this->tag_set);
            echo "ok\n";
        } else {
            $this->word_feature = [];
            $this->tag_set = [];
            $tag_index = 0;
            /* seed the tag set from the locale's tokenizer when it supplies
               a predefined part-of-speech key list */
            if (!empty($this->tokenizer) && method_exists($this->tokenizer,
                "getPosKeyList")) {
                $pos_key_list = $this->tokenizer::getPosKeyList();
                foreach ($pos_key_list as $k) {
                    $this->tag_set[$k] = $tag_index++;
                }
            }
            // keys -4..-1 are out-of-sentence sentinels used by getIndex
            for ($i = -4; $i <= -1; $i++) {
                $this->word_feature[$i] = [];
            }
        }
        /* First pass: discover any tags not already in the tag set and
           create empty feature rows for each distinct term key */
        foreach ($term_tag_sentences as $term_tag_pairs) {
            $terms = $term_tag_pairs[0];
            $tags = $term_tag_pairs[1];
            for ($i = 0; $i < count($terms); $i++) {
                if (!isset($this->tag_set[$tags[$i]])) {
                    $this->tag_set[$tags[$i]] = $tag_index++;
                }
                $k = $this->getIndex($i, $terms);
                if (!isset($this->word_feature[$k])) {
                    $this->word_feature[$k] = [];
                }
            }
        }
        // Zero-fill every (term, relative-position, tag) weight cell
        foreach (array_keys($this->word_feature) as $key) {
            for ($i = -2; $i <= 2; $i++) {
                if (!isset($this->word_feature[$key][$i])) {
                    $this->word_feature[$key][$i] = [];
                }
                foreach ($this->tag_set as $possible_tag => $tag_index) {
                    if (!isset($this->word_feature[$key][$i][$tag_index])) {
                        $this->word_feature[$key][$i][$tag_index] = 0;
                    }
                }
            }
        }
        // Zero-fill per-tag bias terms
        foreach ($this->tag_set as $possible_tag => $tag_index) {
            if (!isset($this->bias[$tag_index])) {
                $this->bias[$tag_index] = 0;
            }
        }
        echo "Training\n";
        /* Gradient descent on averaged per-feature gradients. Training stops
           after $num_epoch epochs or when loss improvement is below 1e-6 */
        $cross_entropy_loss = 1;
        $pre_cross_entropy_loss = 2;
        for ($epoch = 0; $epoch < $num_epoch &&
            $pre_cross_entropy_loss - $cross_entropy_loss > 0.000001;
            $epoch++) {
            $this->min_w = 0;
            $this->max_w = 0;
            $time = time();
            $dy_dw = [];
            $dy_dw_n = [];
            $pre_cross_entropy_loss = $cross_entropy_loss;
            $cross_entropy_loss = 0;
            $cross_entropy_loss_n = 0;
            $dy_db = [];
            $dy_db_n = [];
            for ($i = 0; $i < count($this->tag_set); $i++) {
                $dy_db[$i] = 0;
                $dy_db_n[$i] = 0;
            }
            //for each sentence accumulate gradients for every term position
            foreach ($term_tag_sentences as $term_tag_pairs) {
                $terms = $term_tag_pairs[0];
                $tags = $term_tag_pairs[1];
                for ($i = 0; $i < count($terms); $i++) {
                    $k = [];
                    for ($j = -2; $j <= 2; $j++) {
                        $k[$j] = $this->getIndex($i + $j, $terms);
                    }
                    foreach ($this->tag_set as $possible_tag => $tag_index) {
                        $equality = ($possible_tag == $tags[$i]) ? 1 : 0;
                        $sum = 0;
                        // 5 word window centered at the current term
                        for ($j = -2; $j <= 2; $j++) {
                            $sum +=
                                $this->word_feature[$k[$j]][$j][$tag_index];
                        }
                        $sum += $this->bias[$tag_index];
                        $sigmoid = 1 / (1 + exp(-1 * $sum));
                        // accumulate word-feature gradient and sample counts
                        for ($j = -2; $j <= 2; $j++) {
                            if (!isset($dy_dw[$k[$j]])) {
                                $dy_dw[$k[$j]] = [];
                                $dy_dw_n[$k[$j]] = [];
                            }
                            if (!isset($dy_dw[$k[$j]][$j])) {
                                $dy_dw[$k[$j]][$j] = [];
                                $dy_dw_n[$k[$j]][$j] = [];
                            }
                            if (!isset($dy_dw[$k[$j]][$j][$tag_index])) {
                                $dy_dw[$k[$j]][$j][$tag_index] = 0;
                                $dy_dw_n[$k[$j]][$j][$tag_index] = 0;
                            }
                            $dy_dw[$k[$j]][$j][$tag_index] +=
                                ($sigmoid - $equality);
                            $dy_dw_n[$k[$j]][$j][$tag_index] += 1;
                        }
                        //dy_db
                        $dy_db[$tag_index] += ($sigmoid - $equality);
                        $dy_db_n[$tag_index] += 1;
                        $cross_entropy_loss -= ($equality * log($sigmoid) +
                            (1 - $equality) * log(1 - $sigmoid));
                        $cross_entropy_loss_n++;
                    }
                }
            }
            $cross_entropy_loss /= $cross_entropy_loss_n;
            $duration = time() - $time;
            echo "Epoch {$epoch} cross_entropy {$cross_entropy_loss}" .
                " took {$duration} seconds\n";
            /* apply averaged gradient updates; also track the extreme
               weights seen (min_w/max_w) -- min_w is used by predict() as
               the score for unseen terms */
            foreach ($dy_dw as $i => $v1) {
                foreach ($v1 as $j => $v2) {
                    foreach ($v2 as $k => $v3) {
                        $this->word_feature[$i][$j][$k] -=
                            $dy_dw[$i][$j][$k] /
                            $dy_dw_n[$i][$j][$k] *
                            $learning_rate;
                        if ($this->word_feature[$i][$j][$k] < $this->min_w) {
                            $this->min_w = $this->word_feature[$i][$j][$k];
                        }
                        if ($this->word_feature[$i][$j][$k] > $this->max_w) {
                            $this->max_w = $this->word_feature[$i][$j][$k];
                        }
                    }
                }
            }
            foreach ($dy_db as $k => $v) {
                $this->bias[$k] -= $dy_db[$k] / $dy_db_n[$k] * $learning_rate;
            }
            // checkpoint every tenth epoch so long runs can be recovered
            if ($epoch % 10 == 9) {
                $this->saveWeights();
            }
        }
        $this->saveWeights();
    }
    /**
     * Predicts the part of speech tag for each term in a sentence
     *
     * @param mixed $sentence is an array of segmented words/terms
     *  or a string with words/terms separated by space
     *  (NOTE(review): unlike NamedEntityContextTagger::predict, the split
     *  regex here has no /u modifier -- confirm whether that is intended)
     * @return array of tags for these terms
     */
    public function predict($sentence)
    {
        if (!is_array($sentence)) {
            if ($sentence == "") {
                $terms = [];
            } else {
                $terms = preg_split("/[\s]+/", $sentence);
            }
        } else {
            $terms = $sentence;
        }
        if (!count($terms)) {
            return [];
        }
        if (!$this->word_feature) {
            $this->loadWeights();
        }
        $ret = [];
        /* tags in this list never receive the unseen-term penalty below;
           ask the locale's tokenizer for it when available */
        $pos_unknown_tags_list = [];
        if (!empty($this->tokenizer) && method_exists($this->tokenizer,
            "getPosUnknownTagsList")) {
            $pos_unknown_tags_list =
                $this->tokenizer::getPosUnknownTagsList();
        }
        for ($i = 0; $i < count($terms); $i++) {
            /* Cleanup: the original computed $term = $terms[$i] and
               $key = $this->getKey($term) here but never used either --
               both dead locals have been removed */
            $score = [];
            foreach ($this->tag_set as $possible_tag => $tag_index) {
                $score[$possible_tag] = 0;
                for ($j = -2; $j <= 2; $j++) {
                    $k = $this->getIndex($i + $j, $terms);
                    if (isset($this->word_feature[$k])) {
                        $score[$possible_tag] +=
                            $this->getW($k, $j, $tag_index);
                    } else if ($j == 0 && !in_array($possible_tag,
                        $pos_unknown_tags_list)) {
                        /* term not seen in training: penalize tags not
                           designated as plausible for unknown terms */
                        $score[$possible_tag] += $this->min_w;
                    }
                }
                $score[$possible_tag] += $this->getB($tag_index);
            }
            // take the highest scoring tag for this term
            $ret[] = array_keys($score, max($score))[0];
        }
        return $ret;
    }
}
* @return array of meta words to be associate with this document */ @@ -1165,7 +1165,10 @@ class PhraseParser * @param string $link_text text of the anchor tag link came from * @param string $site_url url of the page link was on * @param array $url_info key value pairs which may have been generated - * as part of the page processor + * as part of the page processor + * @param array $link_word_lists list of words used in anchor text + * associated with this link and their positionns in the anchor text + * @return array meta words associated with the link */ public static function calculateLinkMetas($url, $link_host, $link_text, $site_url, $url_info = [], $link_word_lists = []) diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php index bab52ca31..b6f096d90 100644 --- a/src/library/StochasticTermSegmenter.php +++ b/src/library/StochasticTermSegmenter.php @@ -28,26 +28,12 @@ */ namespace seekquarry\yioop\library; -use seekquarry\yioop\locale\zh_CN\resources as ZH; use seekquarry\yioop\configs as C; + /** - * A Stochastic Finite-State Word-Segmenter. - * This class contains necessary tools to segment terms - * from sentences. - * - * Currently only supports Chinese. - * Instruction to add a new language: - * Add a switch case in the constructor. - * Define the following function: - * isExceptionImpl - * See the class function 'isException' for more information - * isPunctuationImpl - * See the class function 'isPunctuation' for more information - * isNotCurrentLangImpl - * See the class function 'notCurrentLang' for more information - * Chinese example is provided in the constructor + * Class for segmenting terms using Stochastic Finite State Word Segmentation * - * @author Xianghong Sun + * @author Xianghong Sun and Chris Pollett (tweaks to adding new language) */ class StochasticTermSegmenter { @@ -61,22 +47,22 @@ class StochasticTermSegmenter * In the test of Chinese Segmentation on pku dataset, * the speed is 43.803s vs. 
1.540s * Default value = 0.06 - * The time and Peak Memory are 5.094 s and 98.97MB + * The time and Peak Memory are 5.094s and 98.97MB * @var number from 0 - 1.0 */ private $cache_pct; /** - * Cache. Will have runtime data for the segmentation + * Cache of sub trie of dictionary trie used to speed up look up * @var array */ - private $cache=[]; + private $cache = []; /** - * The language currently being used e.g. zh_CN, ja + * The language currently being used e.g. zh-CN, ja * @var string */ public $lang; /** - * regular expression to determine if the non of the char in this + * Regular expression to determine if the non of the char in this * term is in current language * Recommanded expression for: * Chinese: \p{Han} @@ -91,123 +77,97 @@ class StochasticTermSegmenter */ public $unknown_term_score; /** - * A dictionary file that contains the statistic infomation of - * the terms + * A dictionary that contains statistical information on terms for a + * language. A non-empty dictionary should have two fields: + * N, the number of terms in the dictionary; dic, + * a trie implemented using nested php arrays that implements the + * dictionary. The leaves of the trie have frequency counts for terms + * stored in the trie. * @var array */ - public $dictionary_file; + public $dictionary; + /** + * Path on disk to where segmentor dictionary should be stored + * @var string + */ + public $dictionary_path; /** - * Construct an instance of this class used for segmenting string with + * Constructs an instance of this class used for segmenting string with * respect to words in a locale using a probabilistic approach to evaluate * segmentation possibilities. 
- * @param string $lang is a string to indicate the language + * @param string $lang locale this instance will do segmentation for + * @param float $cache_pct percentage of whole trie that can be + * cached for faster look-up */ function __construct($lang, $cache_pct = 0.06) { + $lang = str_replace("-", "_", $lang); + $this->lang = $lang; + $this->dictionary_path = C\LOCALE_DIR . + "/$lang/resources/term_weights.txt.gz"; $this->cache_pct = $cache_pct; - /* Add different attribute for different languages - * Currently only Chinese + $this->tokenizer = PhraseParser::getTokenizer($lang); + if (!is_object($this->tokenizer)) { + return; + } + /* + * To use a StocasticTermSegmenter, a locale's Tokenizer should + * implement isCardinalNumber, isOrdinalNumber, isDate, + * isPunctuation, isNotCurrentLang and optionally getNamedEntityTagger */ - switch($lang) - { - case "zh_CN": - case "zh-CN": - $this->lang = "zh_CN"; - /* - * Check if the term passed in is an exception term - */ - $this->isExceptionImpl = function($term) { - return ZH\Tokenizer::isCardinalNumber($term) - || ZH\Tokenizer::isOrdinalNumber($term) - || ZH\Tokenizer::isDate($term); - }; - /* - * Check if the term passed in is a punctuation - */ - $this->isPunctuationImpl = function($term) - { - return ZH\Tokenizer::isPunctuation($term); - }; - /* - * Check if all the chars in the term is NOT current language - */ - $this->isNotCurrentLangImpl = function($term) - { - return ZH\Tokenizer::isNotCurrentLang($term); - }; - /* - * named entity recognizer; - */ - $this->NER = ZH\Tokenizer::getNER(); - break; - default: - $this->lang = $lang; + if (method_exists($this->tokenizer, "getNamedEntityTagger")) { + /* + * Named entity recognizer; + */ + $this->named_entity_tagger = + $this->tokenizer::getNamedEntityTagger(); } } - /** - * __call for calling dynamic methods - * @param string $method method of this class to call - * @param array $args arguments to pass to method - * @return mixed result of method calculation - 
*/ - public function __call($method, $args) - { - return call_user_func_array($this->$method, $args); - } - /** - * __get for getting dynamic variables - * @param string $var_name variable to retrieve - * @return mixed result of retrieval - */ - public function __get($var_name) - { - return $this->$var_name; - } - /** - * __set for assigning dynamic variables - * @param string $var_name variable to assign - * @param mixed $value value to assign to it - */ - public function __set($var_name, $value) - { - $this->$var_name = $value; - } /** * Check if the term passed in is an exception term * Not all valid terms should be indexed. * e.g. there are infinite combinations of numbers in the world. * isExceptionImpl should be defined in constructor if needed - * @param $term is a string that to be checked + * @param string $term is a string that to be checked * @return true if $term is an exception term, false otherwise */ public function isException($term) { - if (isset($this->isExceptionImpl)) - return $this->isExceptionImpl($term); + if (method_exists($this->tokenizer, "isCardinalNumber") && + method_exists($this->tokenizer, "isOrdinalNumber") && + method_exists($this->tokenizer, "isDate")) { + return $this->tokenizer::isCardinalNumber($term) + || $this->tokenizer::isOrdinalNumber($term) + || $this->tokenizer::isDate($term); + } return false; } /** - * Check if the term passed in is a punctuation + * Check if the term passed in is a punctuation character * isPunctuationImpl should be defined in constructor if needed - * @param $term is a string that to be checked - * @return true if $term is a punctuation, false otherwise + * @param string $term is a string that to be checked + * @return true if $term is some kind of punctuation, false otherwise */ public function isPunctuation($term) { - if (isset($this->isPunctuationImpl)) - return $this->isPunctuationImpl($term); + if (!empty($this->tokenizer) && + method_exists($this->tokenizer, "isPunctuation")) { + return 
$this->tokenizer::isPunctuation($term); + } return false; } /** - * Check if all the chars in the term is NOT current language - * @param $term is a string that to be checked - * @return bool true if all the chars in $term is NOT current language - * false otherwise + * Check if all the chars in the term are NOT from the current language + * @param string $term is a string that to be checked + * @return bool true if all the chars in $term are NOT from the current + * language false otherwise */ public function notCurrentLang($term) { - if (isset($this->isNotCurrentLangImpl)) - return $this->isNotCurrentLangImpl($term); + if (!empty($this->tokenizer) && + method_exists($this->tokenizer, "isNotCurrentLang")) { + return $this->tokenizer::isNotCurrentLang($term); + } return false; } /** @@ -219,20 +179,18 @@ class StochasticTermSegmenter */ public function train($text_files, $format = "default") { - $ctb_fmt=false; + $ctb_fmt = false; switch ($format) { case("default"): break; case("CTB"): - $ctb_fmt=true; + $ctb_fmt = true; break; default: echo "Unrecognized format"; exit(); } - $out_file = C\LOCALE_DIR . 
- "/{$this->lang}/resources/term_weight.txt.gz"; - echo "Saving file to: $out_file\n"; + echo "Saving file to: {$this->dictionary_path}\n"; $dictionary = []; $N = 0; if (is_string($text_files)) { @@ -261,31 +219,31 @@ class StochasticTermSegmenter fclose($fh); } } - $this->dictionary_file = []; - $this->dictionary_file["N"] = 0; - $this->dictionary_file["dic"] = []; - ksort ($dictionary); + $this->dictionary = []; + $this->dictionary["N"] = 0; + $this->dictionary["dic"] = []; + ksort($dictionary); $start_char = null; - $tmp_array=[]; + $tmp_array = []; foreach ($dictionary as $key => $value) { - if (mb_substr($key,0,1)!=$start_char) { - $this->dictionary_file["dic"][$start_char] - = json_encode($tmp_array[$start_char]); - $tmp_array=[]; - $start_char=mb_substr($key,0,1); + if (mb_substr($key, 0, 1) != $start_char) { + $this->dictionary["dic"][$start_char] = + json_encode($tmp_array[$start_char]); + $tmp_array = []; + $start_char = mb_substr($key, 0, 1); } $this->add($key, $value, $tmp_array); - $this->dictionary_file["N"]++; + $this->dictionary["N"]++; } $this->unknown_term_score = $this->getScore(1); - file_put_contents($out_file, - gzencode(json_encode($this->dictionary_file), 9)); + file_put_contents($this->dictionary_path, + gzencode(json_encode($this->dictionary), 9)); return true; } /** - * This function is used to segment a list of files - * @param $text_files can be a file name or a list of file names + * Segments the text in a list of files + * @param mixed $text_files can be a file name or a list of file names * to be segmented * @param bool $return_string return segmented string if true, * print to stdout otherwise @@ -325,11 +283,11 @@ class StochasticTermSegmenter return true; } /** - * Segment texts. 
Words are seperated by space - * @param string $text to be segmented + * Segments text into words separated by space + * @param string $text to be segmented * @param bool $return_string return segmented string if true, * print otherwise - * @return string segmented words with space or true/false; + * @return mixed segmented words with space or true/false; */ public function segmentText($text, $return_string = false) { @@ -340,10 +298,12 @@ class StochasticTermSegmenter foreach ($sentences as $line) { if (mb_strlen($line)) { $t = $this->segmentSentence($line); - if ($return_string) { - $result .= join( " ", $t) . "\n"; - } else { - echo join( " ", $t) . "\n"; + if (!empty($t)) { + if ($return_string) { + $result .= join( " ", $t) . "\n"; + } else { + echo join( " ", $t) . "\n"; + } } } } @@ -353,35 +313,37 @@ class StochasticTermSegmenter return true; } /** - * Segment a sentence into arrays of words. - * Need NOT contain any new line characters. + * Segments a single sentence into an array of words. + * Must NOT contain any new line characters. * @param string $sentence is a string without newline to be segmented * @return array of segmented words */ public function segmentSentence($sentence) { - $t=preg_split("/[\s ]+/u", trim($sentence)); + $t = preg_split("/[\s ]+/u", trim($sentence)); if(count($t) > 1) { $ret = []; foreach($t as $s) { - $ret=array_merge($ret,$this->segmentSentence($s)); + $segments = $this->segmentSentence($s); + if (is_array($segments)) { + $ret = array_merge($ret, $segments); + } } return $ret; } - if (!$this->dictionary_file) { - $dic_file = C\LOCALE_DIR . 
- "/{$this->lang}/resources/term_weight.txt.gz"; - if (!file_exists($dic_file)) { - crawlLog("$dic_file does not exist!"); + if (!$this->dictionary) { + if (!file_exists($this->dictionary_path)) { + crawlLog("{$this->dictionary_path} does not exist!"); return null; } - $this->dictionary_file = - json_decode(gzdecode(file_get_contents($dic_file)), true); + $this->dictionary = + json_decode(gzdecode(file_get_contents( + $this->dictionary_path)), true); gc_collect_cycles(); $this->unknown_term_score = $this->getScore(1); } $cache_size = - floor(count($this->dictionary_file['dic']) * $this->cache_pct); + floor(count($this->dictionary['dic']) * $this->cache_pct); if ($cache_size == 0) { $cache_size = 1; } @@ -390,11 +352,12 @@ class StochasticTermSegmenter if (!count($characters)) { return []; } - $ner_dict=[]; - if (isset($this->NER)) { - $named_entities=$this->NER->predict($characters); + $net_dict = []; + if (isset($this->named_entity_tagger)) { + $named_entities = $this->named_entity_tagger->predict( + $characters); foreach($named_entities as $e) { - $this->add($e[0],1,$ner_dict); + $this->add($e[0], 1, $net_dict); } } $score = []; @@ -407,7 +370,7 @@ class StochasticTermSegmenter && !$this->isPunctuation($characters[$index])) { $current_char = $characters[$index]; for($j = $index + 1; $j < count($characters); $j++) { - if ($this->notCurrentLang($current_char.$characters[$j]) + if ($this->notCurrentLang($current_char . $characters[$j]) && !$this->isPunctuation($characters[$j])) { $current_char .= $characters[$j]; } else { @@ -424,7 +387,7 @@ class StochasticTermSegmenter //If date or number if ($this->isException($characters[$index]) ) { $current_char = $characters[$index]; - for($j = $index+1; $j<count($characters); $j++) { + for($j = $index+1; $j < count($characters); $j++) { if (!$this->isException( $current_char . 
$characters[$j])) { break; @@ -468,11 +431,11 @@ class StochasticTermSegmenter $path[$index] = $index - 1; } //if entry exists, look for the term - if (isset($this->dictionary_file["dic"][$characters[$index]])) { + if (isset($this->dictionary["dic"][$characters[$index]])) { if (!isset($this->cache[$characters[$index]])) { $this->cache = [$characters[$index] => json_decode( - $this->dictionary_file["dic"][$characters[$index]], + $this->dictionary["dic"][$characters[$index]], true)] + $this->cache; while (count($this->cache) > $cache_size) { array_pop($this->cache); @@ -493,9 +456,9 @@ class StochasticTermSegmenter } } } - //check NER dictionary - if (isset($ner_dict[$characters[$index]])) { - $subdic = $ner_dict; + //Check Named Entity Tagger dictionary + if (isset($net_dict[$characters[$index]])) { + $subdic = $net_dict; for ($j = $index; $j < count($characters); $j++) { if (!isset($subdic[$characters[$j]])) { break; @@ -520,9 +483,9 @@ class StochasticTermSegmenter } $result = []; $t = 0; - foreach(array_reverse($tmp) as $nextnode) { + foreach(array_reverse($tmp) as $next_node) { $result_word = ""; - while($t <= $nextnode) { + while($t <= $next_node) { $result_word .= $characters[$t]; $t++; } @@ -531,40 +494,42 @@ class StochasticTermSegmenter return $result; } /** - * This is the function to calculate scores for each word + * Calculates a score for a term based on its frequency versus that + * of the whole trie. * @param int $frequency is an integer tells the frequency of a word * @return float the score of the term. 
*/ - private function getScore($frequency) + public function getScore($frequency) { - if (!empty($this->dictionary_file["N"]) && - is_numeric($this->dictionary_file["N"])) { - return -log($frequency / $this->dictionary_file["N"]); + if (!empty($this->dictionary["N"]) && + is_numeric($this->dictionary["N"])) { + return -log($frequency / $this->dictionary["N"]); } else { return 0; } } /** - * Adds a term to the dictionary + * Adds a (term, frequency) pair to an array based trie * - * @param string $key the term to be inserted - * @param string $value the frequency to be inserted - * @param array $array for insertion + * @param string $term the term to be inserted + * @param string $frequency the frequency to be inserted + * @param array & $trie array based trie we want to insert the key value + * pair into */ - private function add($key, $value, & $array) + public function add($term, $frequency, & $trie) { - $trie_array = & $array; - for ($i = 0; $i < mb_strlen($key,"utf-8"); $i++) { - $character = mb_substr($key, $i, 1, "utf-8"); + $sub_trie = & $trie; + for ($i = 0; $i < mb_strlen($term, "utf-8"); $i++) { + $character = mb_substr($term, $i, 1, "utf-8"); $enc_char = $character; // If letter doesnt exist then create one by // assigning new array - if (!isset($trie_array[$enc_char])) { - $trie_array[$enc_char] = []; + if (!isset($sub_trie[$enc_char])) { + $sub_trie[$enc_char] = []; } - $trie_array = & $trie_array[$enc_char]; + $sub_trie = & $sub_trie[$enc_char]; } // Set end of term marker - $trie_array['$'] = $value; + $sub_trie['$'] = $frequency; } } diff --git a/src/library/SuffixTree.php b/src/library/SuffixTree.php index f0bcdb837..25eb36fd0 100644 --- a/src/library/SuffixTree.php +++ b/src/library/SuffixTree.php @@ -177,7 +177,7 @@ class SuffixTree * The number of elements out of $this->text that this node is currently * responsible for * - * @param array& $node the node to compute the length of + * @param array &$node the node to compute the length of */ public 
function edgeLength(&$node) { @@ -288,7 +288,7 @@ class SuffixTree * @param int $index a node in the suffix tree * @param string $path from root to current node * @param int $len number of nodes from root to current node in suffix tree - * @param array& $maximal assoc array of phrase => (cond_max => pos of + * @param array &$maximal assoc array of phrase => (cond_max => pos of * conditional maximal subphrase, [0] => pos_1st_occurrence of phrase, * [1]=>pos_2nd_occurrence of phrase, etc) */ diff --git a/src/library/UpgradeFunctions.php b/src/library/UpgradeFunctions.php index ea463e0d9..d4389ea89 100644 --- a/src/library/UpgradeFunctions.php +++ b/src/library/UpgradeFunctions.php @@ -93,7 +93,7 @@ function upgradeLocales() /** * Used to force push the default Public and Wiki pages into the current * database - * @param object& $db datasource to use to upgrade + * @param resource &$db datasource to use to upgrade */ function upgradePublicHelpWiki(&$db) { @@ -242,7 +242,7 @@ function getWikiHelpPages() * Inserting at an ID rather than at the end is useful since activities are * displayed in admin panel in order of increasing id. * - * @param resource& $db database handle where Yioop database stored + * @param resource &$db database handle where Yioop database stored * @param string $string_id message identifier to give translations for * for activity * @param string $method_name admin_controller method to be called to perform @@ -288,7 +288,7 @@ function addActivityAtId(&$db, $string_id, $method_name, $activity_id) * Adds or replaces a translation for a database message string for a given * IANA locale tag. 
* - * @param resource& $db database handle where Yioop database stored + * @param resource &$db database handle where Yioop database stored * @param string $string_id message identifier to give translation for * @param string $locale_tag the IANA language tag to update the strings of * @param string $translation the translation for $string_id in the language diff --git a/src/library/Utility.php b/src/library/Utility.php index 4e80d00cf..bc3796db9 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -180,7 +180,7 @@ function getIniAssignMatch($matches) * bytes to destination string * * @param string $source string to copy from - * @param string& $destination string to copy to + * @param string &$destination string to copy to * @param int $start starting offset * @param int $length number of bytes to copy * @param string $timeout_msg for long copys message to print if taking more @@ -228,7 +228,7 @@ function vByteEncode($pos_int) /** * Decodes from a string using variable byte coding an integer. * - * @param string& $str string to use for decoding + * @param string &$str string to use for decoding * @param int $offset byte offset into string when var int stored * @return int the decoded integer */ @@ -278,7 +278,7 @@ function packPosting($doc_index, $position_list, $delta = true) * * @param string $posting a string containing * a doc index position list pair coded encoded using modified9 - * @param int& $offset a offset into the string where the modified9 posting + * @param int &$offset a offset into the string where the modified9 posting * is encoded * @param bool $dedelta if true then assumes the list is a sequence of * differences (a delta list) and undoes the difference to get @@ -309,7 +309,7 @@ function unpackPosting($posting, &$offset, $dedelta = true) * Given a string of postings adds $add_offset add to each offset to the * document map in each posting. 
* - * @param string& $postings a string of index shard postings + * @param string &$postings a string of index shard postings * @param int $add_offset an fixed amount to add to each postings doc map offset * * @return string $new_postings where each doc offset has had $add_offset added @@ -525,8 +525,8 @@ function packListModified9($continue_bits, $cnt, $pack_list) * Returns the next complete posting string from $input_string being at offset. * Does not do any decoding. * - * @param string& $input_string a string of postings - * @param int& $offset an offset to this string which will be updated after call + * @param string &$input_string a string of postings + * @param int &$offset an offset to this string which will be updated after call * @return string undecoded posting */ function nextPostString(&$input_string, &$offset) @@ -562,7 +562,7 @@ function nextPostString(&$input_string, &$offset) * encoded using Modified 9 * * @param string $input_string string to decode from - * @param int& $offset where to string in the string, after decode + * @param int &$offset where to string in the string, after decode * points to where one was after decoding. 
* @return array sequence of positive integers that were decoded * @see encodeModified9 @@ -950,28 +950,6 @@ function intToMetric($num) } return $num; } -/** - * - */ -function binomial($n, $k, $p = 1) -{ - //modified from wikipedia - if ($k < 0 || $k > $n) { - return 0; - } - if ($k == 0 || $k == $n) { - return 1; - } - $k = min($k, $n - $k); // symmetry - $res = 1; - for ($i = 0; $i < $k; $i++) { - $res *= ($p*($n - $i))/($i + 1); - } - if ($p != 1) { - $res *= pow(1 - $p, $n - $k); - } - return $res; -} /** * Logs a message to a logfile or the screen * @@ -1604,11 +1582,15 @@ function microTimestamp() return vsprintf('%d.%06d', gettimeofday()); } /** + * Checks that a timestamp is within the time interval given by a + * start time (HH:mm) and a duration * - * @param string $start_time - * @param string $duration - * @param int $time - * @return int + * @param string $start_time string of the form (HH:mm) + * @param string $duration string containting an int in seconds + * @param int $time a Unix timestamp. + * @return int -1 if the time of day of $time is not within the given interval. + * Otherwise, the Unix timestamp at which the interval will be over for + * the same day as $time. */ function checkTimeInterval($start_time, $duration, $time = -1) { @@ -2213,7 +2195,7 @@ function computeLCS($lines1, $lines2, $offset = 0) * @param int $offset a number to add to each line number output into $lcs. 
* This is useful if we have trimmed off the initially common lines from * our two strings we are trying to compute the LCS of - * @param array& $lcs an array of triples + * @param array &$lcs an array of triples * (index_string1, index_string2, line) * the indexes indicate the line number in each string, line is the line * in common the two strings diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php index 692a3ea79..55909598b 100755 --- a/src/library/WebArchive.php +++ b/src/library/WebArchive.php @@ -158,7 +158,7 @@ class WebArchive * * @param resource $fh resource for the web archive file. If null * the web archive is open first and close when the data is written - * @param array& $data data to write into the info block of the archive + * @param array &$data data to write into the info block of the archive */ public function writeInfoBlock($fh = null, &$data = null) { @@ -215,7 +215,7 @@ class WebArchive * * @param string $offset_field field in objects to return the byte offset * at which they were stored - * @param array& $objects references to objects that will be stored + * @param array &$objects references to objects that will be stored * the offset field in these references will be adjusted if * @param array $data data to write in the WebArchive's info block * @param string $callback name of a callback diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php index 8d748a693..272c3f9c0 100755 --- a/src/library/WebArchiveBundle.php +++ b/src/library/WebArchiveBundle.php @@ -166,7 +166,7 @@ class WebArchiveBundle * the resulting offsets given by $offset_field. 
* * @param string $offset_field field used to record offsets after storing - * @param array& $pages data to store + * @param array &$pages data to store * @return int the write_partition the pages were stored in */ public function addPages($offset_field, &$pages) diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php index 8234b2b55..b06ef43ee 100755 --- a/src/library/WebQueueBundle.php +++ b/src/library/WebQueueBundle.php @@ -521,7 +521,7 @@ class WebQueueBundle implements Notifier } /** * Removes all url objects from $url_array which have been seen - * @param array& $url_array objects to check if have been seen + * @param array &$url_array objects to check if have been seen * @param array $field_names an array of components of a url_array element * which contain a url to check if seen */ diff --git a/src/library/WebSite.php b/src/library/WebSite.php index 8d59ab521..04e38c9d1 100644 --- a/src/library/WebSite.php +++ b/src/library/WebSite.php @@ -1,9 +1,9 @@ <?php /** - * seekquarry\yioop\Website -- a small web server and web routing engine + * seekquarry\yioop\Website -- + * a small web server and web routing engine * - * - * Copyright (C) 2018 Chris Pollett chris@pollett.org + * Copyright (C) 2018-2020 Chris Pollett chris@pollett.org * * LICENSE: * @@ -46,6 +46,8 @@ use seekquarry\yioop\configs as C; * PHP superglobals like $_GET, $_POST, $_REQUEST, $_COOKIE, $_SESSION, * $_FILES, etc and endeavors to make it easy to code apps in a rapid PHP * style. 
+ * + * @author Chris Pollett */ class WebSite { diff --git a/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php b/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php index d0442ec56..8c61cacdd 100644 --- a/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php +++ b/src/library/archive_bundle_iterators/OdpRdfArchiveBundleIterator.php @@ -193,7 +193,7 @@ class OdpRdfArchiveBundleIterator extends TextArchiveBundleIterator * document * * @param object $dom document object for one Topic tag tag - * @param array& $site a reference to an array of header and page info + * @param array &$site a reference to an array of header and page info * for an html page */ public function processTopic($dom, &$site) @@ -229,7 +229,7 @@ class OdpRdfArchiveBundleIterator extends TextArchiveBundleIterator * document * * @param object $dom document object for one Topic tag tag - * @param array& $site a reference to an array of header and page info + * @param array &$site a reference to an array of header and page info * for an html page */ public function processExternalPage($dom, &$site) diff --git a/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php b/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php index 2349a3e5c..bef2ad6e3 100644 --- a/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php +++ b/src/library/archive_bundle_iterators/TextArchiveBundleIterator.php @@ -316,7 +316,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator * Helper function for nextChunk to advance the parition if we are * at the end of the current archive file * - * @param array& $info a struct with data about current chunk. will up start + * @param array &$info a struct with data about current chunk. 
will up start * partition flag */ public function updatePartition(&$info) @@ -724,7 +724,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator { $info = $this->getNextTagsData([$tag]); if (!isset($info[1])) { - return $info; + return $info; } return $info[0]; } diff --git a/src/library/classifiers/Classifier.php b/src/library/classifiers/Classifier.php index 63cb9d23b..1d217de31 100644 --- a/src/library/classifiers/Classifier.php +++ b/src/library/classifiers/Classifier.php @@ -984,8 +984,8 @@ class Classifier implements CrawlConstants * @param array $summary page summary to classify, passed by reference * @param array $classifiers list of Classifier instances, each prepared * for classifying (via the prepareToClassify method) - * @param array& $active_classifiers - * @param array& $active_rankers + * @param array &$active_classifiers + * @param array &$active_rankers */ public static function labelPage(&$summary, $classifiers, &$active_classifiers, &$active_rankers) diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php index 75122a164..15399e50a 100755 --- a/src/library/index_bundle_iterators/DocIterator.php +++ b/src/library/index_bundle_iterators/DocIterator.php @@ -96,9 +96,17 @@ class DocIterator extends IndexBundleIterator * Creates a word iterator with the given parameters. * @param string $index_name time_stamp of the to use * @param SearchfiltersModel $filter Model responsible for keeping - * track of edited and deleted search results + * track of edited and deleted search results + * @param int $results_per_block number of results in a block of results + * return in one go from the iterator + * @param int $direction when results are access from $index_name in + * which order they should be presented. self::ASCENDING is from first + * added to last added, self::DESCENDING is from last added to first + * added. Note: this value is not saved permanently. 
So you + * could in theory open two read only versions of the same bundle but + * reading the results in different directions * @param int $results_per_block the maximum number of results that can - * be returned by a findDocsWithWord call + * be returned by a findDocsWithWord call */ public function __construct($index_name, $filter = null, $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK, @@ -225,7 +233,9 @@ class DocIterator extends IndexBundleIterator return $results; } /** - * + * Get the document offset prior to the current $doc_offset + * @param int $doc_offset an offset into the document map of an IndexShard + * @return int previous doc_offset */ public function getPreviousDocOffset($doc_offset) { diff --git a/src/library/index_bundle_iterators/GroupIterator.php b/src/library/index_bundle_iterators/GroupIterator.php index 6c9f18b8f..e303f37a7 100644 --- a/src/library/index_bundle_iterators/GroupIterator.php +++ b/src/library/index_bundle_iterators/GroupIterator.php @@ -227,7 +227,7 @@ class GroupIterator extends IndexBundleIterator * have been remembered in grouped_keys and will be ignored in the return * result of this function. * - * @param array& $pages pages to group + * @param array &$pages pages to group * @return array $pre_out_pages pages after grouping */ public function groupByHashUrl(&$pages) @@ -267,7 +267,7 @@ class GroupIterator extends IndexBundleIterator * that group as its representative. The function then modifies the * supplied argument array to make it an array of group representatives. * - * @param array& $pre_out_pages documents previously grouped by hash of url + * @param array &$pre_out_pages documents previously grouped by hash of url */ public function groupByHashAndAggregate(&$pre_out_pages) { @@ -319,7 +319,7 @@ class GroupIterator extends IndexBundleIterator * of single summarized documents for each group. These single summarized * documents have aggregated scores. 
* - * @param array& $pre_out_pages array of groups of pages for which out pages + * @param array &$pre_out_pages array of groups of pages for which out pages * are to be generated. * @return array $out_pages array of single summarized documents */ @@ -379,7 +379,7 @@ class GroupIterator extends IndexBundleIterator * @param string $hash_url the crawlHash of the url of the page we are * scoring which will be compared with that of the host to see if * the current page has the url of a hostname. - * @param array& $pre_hash_page pages to compute scores for + * @param array &$pre_hash_page pages to compute scores for */ public function aggregateScores($hash_url, &$pre_hash_page) { diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php index c2230b60e..3c7ee844b 100644 --- a/src/library/index_bundle_iterators/IndexBundleIterator.php +++ b/src/library/index_bundle_iterators/IndexBundleIterator.php @@ -149,6 +149,8 @@ abstract class IndexBundleIterator implements CrawlConstants * * @param array $gen_doc1 first ordered pair * @param array $gen_doc2 second ordered pair + * @param int $direction whether the comparison should be done for + * a self::ASCENDING or a self::DESCENDING search + * @return int -1,0,1 depending on which is bigger */ public function genDocOffsetCmp($gen_doc1, $gen_doc2, $direction = @@ -185,7 +187,13 @@ abstract class IndexBundleIterator implements CrawlConstants return 0; } /** - * + * Returns the direction of a IndexBundleIterator. Depending on the + * iterator could be either forward from the start of an index + * (self::ASCENDING) or backward from the end of the index + * (self::DESCENDING). For this base class, the function always returns + * self::ASCENDING, but subclasses might return different values. 
+ * @return int either CrawlConstants::ASCENDING or + * CrawlConstants::DESCENDING */ public function getDirection() { diff --git a/src/library/index_bundle_iterators/IntersectIterator.php b/src/library/index_bundle_iterators/IntersectIterator.php index 97c1c21d8..d6012b2e0 100644 --- a/src/library/index_bundle_iterators/IntersectIterator.php +++ b/src/library/index_bundle_iterators/IntersectIterator.php @@ -266,8 +266,8 @@ class IntersectIterator extends IndexBundleIterator * @param mixed $next_pos * or int if * next_pos must be >= $cur_pos * +len_search_term. $next_pos represents the position the next * quoted term should be at - * @param $qp $position_list_index => $len_of_list_term pairs - * @return -1 on failure, 0 on backtrack, 1 on success + * @param array $qp $position_list_index => $len_of_list_term pairs + * @return int -1 on failure, 0 on backtrack, 1 on success */ public function checkQuote(&$position_lists, $cur_pos, $next_pos, $qp) { @@ -308,10 +308,10 @@ class IntersectIterator extends IndexBundleIterator * Given the position_lists of a collection of terms computes * a score for how close those words were in the given document * - * @param array& $word_position_lists a 2D array item + * @param array &$word_position_lists a 2D array item * number => position_list (locations in doc where item occurred) for * that item. 
- * @param array& $word_len_lists length for each item of its position list + * @param array &$word_len_lists length for each item of its position list * @param bool $is_doc whether this is the position list of a document * or a link * @return sum of inverse of all covers computed by plane sweep algorithm diff --git a/src/library/index_bundle_iterators/NetworkIterator.php b/src/library/index_bundle_iterators/NetworkIterator.php index fbbd9d9fa..7839255f2 100644 --- a/src/library/index_bundle_iterators/NetworkIterator.php +++ b/src/library/index_bundle_iterators/NetworkIterator.php @@ -351,9 +351,18 @@ class NetworkIterator extends IndexBundleIterator return $pages; } /** + * If we want the top $num_results results (a block) and we have + * $num_machines, this computes how many results we shhould request + * of each machine. * Buttcher, Clark, Cormack give an exact formula to compute this, * but it is slow to compute - * We instead compute a (1/$num_machines^{3/4})* $num_results +5; + * We instead compute a (1/$num_machines^{3/4})* $num_results + 5; + * @param int $num_machines number of machines each having a portion + * of the results + * @param int $num_results, the k value that we want the top k best + * overall results. + * @return int number of best results we should ask from each machine + * to ensure get top k best results overall */ public static function serverAdjustedResultsPerBlock($num_machines, $num_results) diff --git a/src/library/indexing_plugins/IndexingPlugin.php b/src/library/indexing_plugins/IndexingPlugin.php index 2a018bdb1..d6a312680 100644 --- a/src/library/indexing_plugins/IndexingPlugin.php +++ b/src/library/indexing_plugins/IndexingPlugin.php @@ -173,11 +173,13 @@ abstract class IndexingPlugin * them in the getAdditionalMetaWords function for this plugin, or they * will not be recognized in queries. 
* - * @param array& $summary the summary data produced by the relevant page + * @param array &$summary the summary data produced by the relevant page * processor's handle method; modified in-place. * @param string $url the url where the summary contents came from */ - public function pageSummaryProcessing(&$summary, $url) {return null;} + public function pageSummaryProcessing(&$summary, $url) { + return null; + } /** * This method is called by the queue_server with the name of * a completed index. This allows the indexing plugin to diff --git a/src/library/indexing_plugins/WordfilterPlugin.php b/src/library/indexing_plugins/WordfilterPlugin.php index 28b6e66ed..8570c2c06 100644 --- a/src/library/indexing_plugins/WordfilterPlugin.php +++ b/src/library/indexing_plugins/WordfilterPlugin.php @@ -57,7 +57,7 @@ require_once C\BASE_DIR. "/library/LocaleFunctions.php"; * NOYDIR, NONE or can be the word NOPROCESS, JUSTFOLLOW, NOTCONTAIN. * The preconditions is checked in the function checkFilter. Details on * what constitutes are legal precondition are described in the - * @see $filter_rules and @see $rules_string documentation. + * See $filter_rules and $rules_string documentation. * Usually, if checkFilter returns true then pageSummaryProcessing adds the * meta tags to the document summary and returns. If one of the actions * was NOTCONTAIN, then only if checkFilter returned false are the meta tags @@ -215,7 +215,7 @@ EOD; * whether the summary title and description satisfy various rules * in $this->filter_rules * - * @param array& $summary the summary data produced by the relevant page + * @param array &$summary the summary data produced by the relevant page * processor's handle method; modified in-place. * @param string $url the url where the summary contents came from */ @@ -272,7 +272,7 @@ EOD; /** * Used to check if $precondition is met by a supplied string. * - * @see $filter_terms to see what constitutes a valid precondition. 
+ * See $filter_terms to see what constitutes a valid precondition. * * @param string $preconditions the terms and their * frequencies to search for @@ -350,7 +350,7 @@ EOD; * it. It then modifies $data so that if the plugin's configuration view * is drawn it makes use of the current plugin configuration info. * - * @param array& $data info to be used by the admin view to draw itself. + * @param array &$data info to be used by the admin view to draw itself. */ public function configureHandler(&$data) { @@ -469,7 +469,7 @@ EOD; /** * Used to draw the HTML configure screen for the word filter plugin. * - * @param array& $data contains configuration data to be used in drawing + * @param array &$data contains configuration data to be used in drawing * the view */ public function configureView(&$data) diff --git a/src/library/media_jobs/AnalyticsJob.php b/src/library/media_jobs/AnalyticsJob.php index 86f6d9287..a9eea2b1a 100644 --- a/src/library/media_jobs/AnalyticsJob.php +++ b/src/library/media_jobs/AnalyticsJob.php @@ -141,7 +141,7 @@ class AnalyticsJob extends MediaJob * for which statistics have been requested but not yet computed. * If these queries take too long it saves partial results and returns. * - * @param array& $data associative array which will have all the statistics + * @param array &$data associative array which will have all the statistics * data collected. 
*/ public function computeCrawlStatistics() diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php index 2171d3511..6e54fd832 100644 --- a/src/library/media_jobs/FeedsUpdateJob.php +++ b/src/library/media_jobs/FeedsUpdateJob.php @@ -61,14 +61,18 @@ class FeedsUpdateJob extends MediaJob */ public $db; /** - * @var IndexArchiveBundle + * The FeedArchiveBundle to put feed items into periodically + * @var FeedArchiveBundle */ public $index_archive; /** + * News Feed Items found from the current feed * @var array */ public $found_items; /** + * Used to keep track of image urls of thumbnails to download + * for feed items * @var array */ public $media_urls; @@ -189,7 +193,12 @@ class FeedsUpdateJob extends MediaJob $this->media_urls = []; } /** - * @param array $thumb_sites + * Download images and create thumbnails for a list of image urls. + * + * @param array $thumb_sites array of arrays. The sub-array should contain + * a field CrawlConstants::THUMB_URL with url to download. + * After download the thumb_nail is saved in the file + * CrawlConstants::FILE_NAME. */ private function getThumbs($thumb_sites) { @@ -828,7 +837,7 @@ class FeedsUpdateJob extends MediaJob * Updates trending term counts based on the string from the current * feed item. * - * @param array& $term_counts lang => [term => occurrences] + * @param array &$term_counts lang => [term => occurrences] * @param string $source_phrase original non-stemmed phrase from feed * item to adjust $term_counts with. Used to remember non-stemmed * terms. We assume we have already extracted position lists from @@ -1204,6 +1213,10 @@ class FeedsUpdateJob extends MediaJob } } } + /** + * Sets the value of $this->index_archive to point to + * the FeedArchiveBundle associated to feeds on this instance of Yioop + */ public function getFeedArchive() { $dir = C\CRAWL_DIR . '/cache/' . 
self::feed_index_data_base_name; diff --git a/src/library/processors/ImageProcessor.php b/src/library/processors/ImageProcessor.php index 3f54d9b1e..9bc716e08 100755 --- a/src/library/processors/ImageProcessor.php +++ b/src/library/processors/ImageProcessor.php @@ -58,7 +58,12 @@ class ImageProcessor extends PageProcessor return null; } /** + * Used to save a temporary file with the data downloaded for a url + * while carrying out image processing * + * @param string $page contains data about an image that one needs to save + * @param string $url where $page data came from + * @param string $file_extension to be associated with the $page data */ public function saveTempFile($page, $url, $file_extension) { diff --git a/src/library/processors/PageProcessor.php b/src/library/processors/PageProcessor.php index 0f9468f8d..a6826e29f 100644 --- a/src/library/processors/PageProcessor.php +++ b/src/library/processors/PageProcessor.php @@ -173,7 +173,7 @@ abstract class PageProcessor implements CrawlConstants /** * Should be implemented to compute a summary based on a * text string of a document. This method is called from - * @see handle($page, $url) + * @see PageProcessor::handle * * @param string $page string of a document * @param string $url location the document came from diff --git a/src/library/processors/TextProcessor.php b/src/library/processors/TextProcessor.php index c08f29093..fca5817fa 100755 --- a/src/library/processors/TextProcessor.php +++ b/src/library/processors/TextProcessor.php @@ -194,7 +194,7 @@ class TextProcessor extends PageProcessor * If an end of file is reached before closed tags are seen, this methods * closes these tags in the correct order. 
* - * @param string& $page a reference to an xml or html document + * @param string &$page a reference to an xml or html document */ public static function closeDanglingTags(&$page) { diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php index 3c2890eee..032c23b2b 100644 --- a/src/library/summarizers/Summarizer.php +++ b/src/library/summarizers/Summarizer.php @@ -41,8 +41,8 @@ use seekquarry\yioop\library\processors\PageProcessor; * document and produces a summary of that document up to * PageProcessor::$max_description_len many characters. Summarizers * also contain various methods to generate word cloud from such a summary - * @see wordCloudFromSummary and/or document centroids - * @see wordCloudFromTermVector. + * @see Summarizer::wordCloudFromSummary and/or document centroids + * wordCloudFromTermVector. * * @author Charles Bocage charles.bocage@sjsu.edu * Chris Pollett chris@pollett.org diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index 31fe5bc3f..2f32b9e65 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -277,7 +277,7 @@ class Tokenizer * This methods tries to handle punctuation in terms specific to the * English language such as abbreviations. * - * @param string& $string a string of words, etc which might involve such + * @param string &$string a string of words, etc which might involve such * terms */ public function canonicalizePunctuatedTerms(&$string) @@ -569,7 +569,7 @@ class Tokenizer * sentence, create a phrase string for each of the next nodes * which belong to part of speech group $type. 
* - * @param array& $cur_node node within parse tree + * @param array &$cur_node node within parse tree * @param array $tagged_phrase parse tree for phrase * @param string $type self::$noun_type, self::$verb_type, etc * @return string phrase string involving only terms of that $type @@ -1661,7 +1661,7 @@ class Tokenizer * @param array $tagged_tokens array pairs as might come from tagTokenize * @param bool $with_tokens whether to include the terms and the tags * in the output string or just the part of speech tags - * @return $tagged_phrase a phrase with terms in the format token~tag + * @return string $tagged_phrase a phrase with terms in the format token~tag * ($with_token == true) or space separated tags (!$with_token). */ private static function taggedPartOfSpeechTokensToString($tagged_tokens, diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php index faf51af23..095f7ab48 100755 --- a/src/locale/hi/resources/Tokenizer.php +++ b/src/locale/hi/resources/Tokenizer.php @@ -335,7 +335,7 @@ class Tokenizer * sentence, create a phrase string for each of the next nodes * which belong to part of speech group $type. 
* - * @param array& $cur_node node within parse tree + * @param array &$cur_node node within parse tree * @param array $tagged_phrase parse tree for phrase * @param string $type self::$noun_type, self::$verb_type, etc * @return string phrase string involving only terms of that $type diff --git a/src/locale/it/resources/Tokenizer.php b/src/locale/it/resources/Tokenizer.php index b70eea3c8..d0f839753 100755 --- a/src/locale/it/resources/Tokenizer.php +++ b/src/locale/it/resources/Tokenizer.php @@ -45,7 +45,7 @@ class Tokenizer /** * A list of frequently occurring terms for this locale which should * be excluded from certain kinds of queries - * @array + * @var array */ public static $stop_words = [ 'http', 'https', @@ -189,7 +189,7 @@ class Tokenizer * * @param $parent_string is the string in which we wish to find the suffix * @param $substring is the suffix we wish to check - * @return $pos as the starting position of the suffix $substring in + * @return int $pos as the starting position of the suffix $substring in * $parent_string if it exists, else false */ private static function checkForSuffix($parent_string,$substring) @@ -221,8 +221,8 @@ class Tokenizer /** * Computes the starting index for region R1 * - * @param $string is the string for which we wish to find the index - * @return $r1_start as the starting index for R1 for $string + * @param string $string for which we wish to find the index + * @return int $r1_start as the starting index for R1 for $string */ private static function r1($string) { @@ -249,8 +249,8 @@ class Tokenizer /** * Computes the starting index for region R2 * - * @param $string is the string for which we wish to find the index - * @return $r2_start as the starting index for R1 for $string + * @param string $string for which we wish to find the index + * @return int $r2_start as the starting index for R1 for $string */ private static function r2($string) { @@ -283,8 +283,8 @@ class Tokenizer /** * Computes the starting index for region 
RV * - * @param $string is the string for which we wish to find the index - * @return $rv_start as the starting index for RV for $string + * @param string $string for which we wish to find the index + * @return int $rv_start as the starting index for RV for $string */ private static function rv($string) { @@ -351,7 +351,7 @@ class Tokenizer /** * Checks if a character is a vowel or not * - * @param $char is the character to be checked + * @param string $char is the character to be checked * @return bool if $char is a vowel */ private static function isVowel($char) @@ -376,9 +376,9 @@ class Tokenizer * Computes the longest suffix for a given string from a given set of * suffixes * - * @param $string is the for which the maximum suffix is to be found - * @param $suffixes is an array of suffixes - * @return $max_suffix is the longest suffix for $string + * @param string $string for which the maximum suffix is to be found + * @param array $suffixes an array of suffixes + * @return int $max_suffix is the longest suffix for $string */ private static function maxSuffix($string, $suffixes) { @@ -404,9 +404,9 @@ class Tokenizer * Replaces all acute accents in a string by grave accents and also handles * accented characters * - * @param $string is the string from in which the acute accents are to be + * @param string $string in which the acute accents are to be * replaced - * @return $string with changes + * @return string with changes */ private static function acuteByGrave($string) { diff --git a/src/locale/zh_CN/resources/Tokenizer.php b/src/locale/zh_CN/resources/Tokenizer.php index 72665cf7e..3548d8eed 100755 --- a/src/locale/zh_CN/resources/Tokenizer.php +++ b/src/locale/zh_CN/resources/Tokenizer.php @@ -45,7 +45,7 @@ class Tokenizer * A list of frequently occurring terms for this locale which should * be excluded from certain kinds of queries. 
This is also used * for language detection - * @array + * @var array */ public static $stop_words = ['一', '人', '里', '会', '没', '她', '吗', '去', '也', '有', '这', '那', '不', '什', '个', '来', '要', '就', '我', '你', @@ -65,19 +65,19 @@ class Tokenizer public static $non_char_preg = "/^[^\p{Han}]+$/u"; /** * The dictionary of characters can be used as Chinese Numbers - * @string + * @var string */ public static $num_dict = "1234567890○〇零一二两三四五六七八九十百千万亿". "0123456789壹贰叁肆伍陆柒捌玖拾廿卅卌佰仟萬億"; /** * Dots used in Chinese Numbers - * @string + * @var string */ public static $dot = "\..点"; /** * A list of characters can be used at the end of numbers - * @string + * @var string */ public static $num_end = "%%"; /** @@ -86,56 +86,58 @@ class Tokenizer * ex. "十分" in most of time means "very", but it will * be determined to be "10 minutes" by the function so we * need to remove it - * @array of string + * @var array of string */ public static $exception_list= ["十分","一","一点","千万", "万一", "一一", "拾", "一时", "千千", "万万", "陆"]; /** * A list of characters can be used as Chinese punctuations - * @string + * @var string */ public static $punctuation_preg = "/^([\x{2000}-\x{206F}\x{3000}-\x{303F}\x{FF00}-\x{FF0F}" . "\x{FF1A}-\x{FF20}\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}" . "\x{FFE0}-\x{FFEE}\x{21}-\x{2F}\x{21}-\x{2F}" . 
"\x{3A}-\x{40}\x{5B}-\x{60}\x{25cf}])\\1*$/u"; + /** * Any unique identifier corresponding to the component of a triplet which * can be answered using a question answer list - * @string + * @var string */ public static $question_token = "qqq"; /** * Words array that determine if a sentence passed in is a question - * @array + * @var array */ - public static $question_words=[ - "any"=>["谁"=>"who", - "哪儿|哪里"=>"where", - "哪个"=>"which", - "哪些"=>"list", - "哪"=>["after"=>[ "1|一"=>"which", + public static $question_words = [ + "any" => ["谁" => "who", + "哪儿|哪里" => "where", + "哪个" => "which", + "哪些" => "list", + "哪" => ["after" => [ "1|一"=>"which", "[2-9]|[1-9][0-9]+"=>"list" ], "other"=>"where" ], - "什么|啥|咋"=>[ "after"=>[ "地方"=>"where", + "什么|啥|咋" => [ "after" => [ "地方"=>"where", "地点"=>"where", "时\w*"=>"when" ], - "other"=>"what"], - "怎么|怎样|怎么样|如何"=>"how", - "为什么"=>"why", - "多少"=>"how many", - "几\w*"=>["any"=>["吗|\?|?"=>"how many"], "other"=>false], - "多久"=>"how long", - "多大"=>"how big" + "other" => "what"], + "怎么|怎样|怎么样|如何" => "how", + "为什么" => "why", + "多少" => "how many", + "几\w*" => ["any" => ["吗|\?|?" => "how many"], + "other" => false], + "多久" => "how long", + "多大" => "how big" ], - "other"=>[ "any"=>[ "吗"=>"yesno", - "呢"=>"what about" + "other" => [ "any" => [ "吗"=>"yesno", + "呢" => "what about" ], - "other"=>[ "other"=>false, - "any"=>["\?|?"=>"yesno"] + "other" => [ "other" => false, + "any" => ["\?|?" 
=> "yesno"] ] ] ]; @@ -143,12 +145,12 @@ class Tokenizer * List of adjective-like parts of speech that might appear in lexicon file * Predicative adjective: VA * other noun-modifier: JJ - * @array + * @var array */ - public static $adjective_type = ["VA","JJ"]; + public static $adjective_type = ["VA", "JJ"]; /** * List of adverb-like parts of speech that might appear in lexicon file - * @array + * @var array */ public static $adverb_type = ["AD"]; /** @@ -156,9 +158,9 @@ class Tokenizer * file * Coordinating conjunction: CC * Subordinating conjunction: CS - * @array + * @var array */ - public static $conjunction_type = ["CC","CS"]; + public static $conjunction_type = ["CC", "CS"]; /** * List of determiner-like parts of speech that might appear in lexicon * file @@ -175,7 +177,7 @@ class Tokenizer * Temporal Noun: NT * Other Noun: NN * Pronoun: PN - * @array + * @var array */ public static $noun_type = ["NR", "NT", "NN", "PN"]; /** @@ -185,32 +187,32 @@ class Tokenizer * Other verb: VV * Short passive voice: SB * Long passive voice: LB - * @array + * @var array */ public static $verb_type = ["VC", "VE", "VV", "SB", "LB"]; /** * List of particle-like parts of speech that might appear in lexicon file * No meaning words that can appear anywhere - * @array + * @var array */ public static $particle_type = [ "AS", "ETC", "DEC", "DEG", "DEV", "MSP", "DER", "SP", "IJ", "FW"]; /** * Stochastic Term Segmenter instance - * @object + * @var object */ - private static $stochasticTermSegmenter; + private static $stochastic_term_segmenter; /** - * named Entity Recognizer instance - * @object + * Named Entity tagger instance + * @var object */ - private static $namedEntityRecognizer; + private static $named_entity_tagger; /** * PosTagger instance - * @object + * @var object */ - private static $posTagger; + private static $pos_tagger; /** * Removes the stop words from the page (used for Word Cloud generation * and language detection) @@ -237,16 +239,16 @@ class Tokenizer * @param 
string $method indicates which method to use * @return string with words separated by space */ - public static function segment($pre_segment, $method="STS") + public static function segment($pre_segment, $method = "STS") { switch($method) { - case("RMM"): + case "RMM": return PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN", ['/^\d+$/', '/^[a-zA-Z]+$/']); break; - case("STS"): + case "STS": return self::getStochasticTermSegmenter() - ->segmentText($pre_segment,true); + ->segmentText($pre_segment, true); break; } } @@ -283,7 +285,7 @@ class Tokenizer "]+(年|年代|月|日|时|小时|時|小時|" . "点|点钟|點|點鐘|分|分鐘|秒|秒鐘)$/u",$term); } - /* + /** * Check if the term is a punctuation */ public static function isPunctuation($term) @@ -300,31 +302,26 @@ class Tokenizer { return preg_match(self::$non_char_preg, $term); } - /* - * Create stochastic term segmenter + /** + * Get the segmenter instance + * @return StochasticTermSegmenter */ - public static function createStochasticTermSegmenter($cache_pct=0.06) + public static function getStochasticTermSegmenter() { - self::$stochasticTermSegmenter - = new L\StochasticTermSegmenter("zh_CN", $cache_pct); + if (!self::$stochastic_term_segmenter) { + self::$stochastic_term_segmenter + = new L\StochasticTermSegmenter("zh-CN"); + } + return self::$stochastic_term_segmenter; } - /* - * Destory stochastic term segmenter + /** + * Determines the part of speech tag of a term using simple rules if + * possible + * @param string $term to see if can get a part of speech for via a rule + * @return string part of speech tag or $term if can't be determine */ - public static function destoryStochasticTermSegmenter() + public static function getPosKey($term) { - self::$stochasticTermSegmenter = null; - } - /* - * Get the segmenter instance - */ - public static function getStochasticTermSegmenter() { - if (!self::$stochasticTermSegmenter) { - self::createStochasticTermSegmenter(); - } - return self::$stochasticTermSegmenter; - } - public static function 
POSGetKey($term) { if (self::isPunctuation($term)) { return 'PU'; } else if (self::isCardinalNumber($term)) { @@ -336,55 +333,47 @@ class Tokenizer } else if (self::isNotCurrentLang($term)) { return 'FW'; } - return null; + return $term; } - /* - * Create named entity recognizer instance + /** + * Possible tags a term can have that can be determined by a simple rule + * @return array */ - public static function createNER() + public static function getPosKeyList() { - self::$namedEntityRecognizer - = new L\ContextWeightedNamedEntityRecognizer("zh_CN"); + return ['PU','CD','OD','NT','FW']; } - /* - * Destory named entity recognizer instance + /** + * Return list of possible tags that an unknown term can have + * @return array */ - public static function destoryNER() + public static function getPosUnknownTagsList() { - self::$namedEntityRecognizer = null; + return ["NN","NR","VV","VA"]; } - /* - * Get the named entity recognizer instance - */ - public static function getNER() { - if (!self::$namedEntityRecognizer) { - self::createNER(); - } - return self::$namedEntityRecognizer; - } - /* - * Create POSTagger instance + /** + * Get the named entity tagger instance + * @return NamedEntityContextTagger for Chinese */ - public static function createPosTagger() + public static function getNamedEntityTagger() { - self::$posTagger - = new L\ContextWeightedPosTagger("zh_CN"); + if (!self::$named_entity_tagger) { + self::$named_entity_tagger + = new L\NamedEntityContextTagger("zh-CN"); + } + return self::$named_entity_tagger; } - /* - * Destory POSTagger instance + /** + * Get Part of Speech instance + * @return PartOfSpeechContextTagger for Chinese */ - public static function destoryPosTagger() + public static function getPosTagger() { - self::$posTagger = null; - } - /* - * Get PosTagger instance - */ - public static function getPosTagger() { - if (!self::$posTagger) { - self::createPosTagger(); + if (!self::$pos_tagger) { + self::$pos_tagger + = new 
L\PartOfSpeechContextTagger("zh-CN"); } - return self::$posTagger; + return self::$pos_tagger; } /** * Scans a word list for phrases. For phrases found generate @@ -405,7 +394,8 @@ class Tokenizer $triplet_types = ['CONCISE', 'RAW']; foreach ($word_and_phrase_list as $word_and_phrase => $position_list) { // strip parentheticals - $word_and_phrase = preg_replace("/[\{\[\(【(][^\}\])】\)]+[\}\]\))】]/u", + $word_and_phrase = preg_replace( + "/[\{\[\(【(][^\}\])】\)]+[\}\]\))】]/u", "", $word_and_phrase); $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase); $parse_tree = ['cur_node' => 0]; @@ -418,7 +408,8 @@ class Tokenizer if (isset($parse_tree['NP'])) { $pre_sub = $parse_tree['NP']; } - $extracted_triplets_set[] = self::rearrangeTripletsByType($triplets); + $extracted_triplets_set[] = self::rearrangeTripletsByType( + $triplets); // next partial sentence while($parse_tree['cur_node'] < count($tagged_phrase) && $tagged_phrase[$parse_tree['cur_node']]["tag"] != "PU") { @@ -434,7 +425,8 @@ class Tokenizer foreach ($questions as $question) { $question_list[$question] = $position_list; } - $question_answer_list = array_merge($question_answer_list, + $question_answer_list = array_merge( + $question_answer_list, $triplets['QUESTION_ANSWER_LIST']); } } @@ -458,10 +450,10 @@ class Tokenizer $segmented = self::getStochasticTermSegmenter()->segmentSentence($text); $tags = self::getPosTagger()->predict($segmented); $result=[]; - for($i=0; $i<count($segmented); $i++) { - $result[$i]=[]; - $result[$i]["token"]=$segmented[$i]; - $result[$i]["tag"]=$tags[$i]; + for($i = 0; $i < count($segmented); $i++) { + $result[$i] = []; + $result[$i]["token"] = $segmented[$i]; + $result[$i]["tag"] = $tags[$i]; } return $result; } @@ -470,7 +462,7 @@ class Tokenizer * sentence, create a phrase string for each of the next nodes * which belong to part of speech group $type. 
* - * @param array& $cur_node node within parse tree + * @param array &$cur_node node within parse tree * @param array $tagged_phrase parse tree for phrase * @param string $type self::$noun_type, self::$verb_type, etc * @return string phrase string involving only terms of that $type @@ -643,23 +635,25 @@ class Tokenizer $index = 1) { $cur_node = $tree['cur_node']; - // There are two forms of prepostion. - // First one has lc only - // 之前(lc) 他在看书 + /* There are two forms of preposition. + The first one has lc only + 之前(lc) 他在看书 */ if (isset($tagged_phrase[$cur_node]['tag']) && trim($tagged_phrase[$cur_node]['tag']) == "LC") { $tree["LC"] = $tagged_phrase[$cur_node]['token']; $tree['cur_node']+=1; return $tree; } - // Second form: - // format: prep [anything] [locolizer|punctuation] - // 在(p)今天早上,(pu) 他 在(p) 车 里(lc) 睡觉。 - // In the morning today, he was sleeping in the car. + /* Second form: + format: prep [anything] [locolizer|punctuation] + 在(p)今天早上,(pu) 他 在(p) 车 里(lc) 睡觉。 + In the morning today, he was sleeping in the car. + */ if (isset($tagged_phrase[$cur_node]['tag']) && trim($tagged_phrase[$cur_node]['tag']) == "P") { /* can have multiple prep's in a row, for example, - it is known in over 20 countries*/ + it is known in over 20 countries + */ $preposition_string = self::parseTypeList($cur_node, $tagged_phrase, ["P"]); if (!empty($preposition_string)) { @@ -667,7 +661,7 @@ class Tokenizer } while(isset($tagged_phrase[$cur_node]) && isset($tagged_phrase[$cur_node]['tag']) && - !in_array($tagged_phrase[$cur_node]['tag'],["PU","LC"])) { + !in_array($tagged_phrase[$cur_node]['tag'],["PU", "LC"])) { $tree["P"] .= $tagged_phrase[$cur_node]['token']; $cur_node++; } @@ -796,34 +790,34 @@ class Tokenizer * Given a part-of-speeech tagged phrase array generates a parse tree * for the phrase using a recursive descent parser. 
* - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) + * @param array $tagged_phrase an array of pairs of the form + * ("token" => token_for_term, "tag"=> part_of_speech_tag_for_term) * @param $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] + * current parse position in $tagged_phrase] * @param $tree_np_pre subject found from previous sub-sentence * @return array used to represent a tree. The array has up to three fields - * $tree["cur_node"] index of how far we parsed our$tagged_phrase - * $tree["NP"] contains a subtree for a noun phrase - * $tree["VP"] contains a subtree for a verb phrase + * $tree["cur_node"] index of how far we parsed our$tagged_phrase + * $tree["NP"] contains a subtree for a noun phrase + * $tree["VP"] contains a subtree for a verb phrase */ - public static function parseWholePhrase($tagged_phrase, $tree, $tree_np_pre=[]) + public static function parseWholePhrase($tagged_phrase, $tree, + $tree_np_pre = []) { //remove heading adverbs $cur_node = $tree['cur_node']; do { - $start_node=$cur_node; + $start_node = $cur_node; self::parseTypeList($cur_node, $tagged_phrase, self::$adverb_type); self::parseTypeList($cur_node, $tagged_phrase, self::$particle_type); - } while($start_node!=$cur_node); + } while ($start_node != $cur_node); $tree_np = self::parseNounPhrase($tagged_phrase, ["cur_node" => $cur_node]); if ($tree_np['cur_node'] == $cur_node) { if (!empty($tree_np_pre)) { - $tree_np['NP']=$tree_np_pre; - $tree_np["cur_node"]=$cur_node; + $tree_np['NP'] = $tree_np_pre; + $tree_np["cur_node"] = $cur_node; } else { return $tree; } @@ -1054,7 +1048,7 @@ class Tokenizer $keywords = self::isQuestion($question); if ($keywords) { $generated_questions=self::parseQuestion( - $tagged_question, 1, $keywords); + $tagged_question, 1, $keywords); } return $generated_questions; } @@ -1070,7 +1064,7 @@ class Tokenizer 
$terms=self::getStochasticTermSegmenter()->segmentSentence($phrase); $qt=self::questionType($terms, self::$question_words); if (in_array($qt["types"], ["who","what","which","where","when", - "whose","how", "how many", "how long", "how big"])) { + "whose", "how", "how many", "how long", "how big"])) { return $qt["ques_words"]; } return false; @@ -1084,24 +1078,27 @@ class Tokenizer * @param string $question_word is the question word need to be replaced * @return array parsed triplet */ - public static function parseQuestion($tagged_question, $index, $question_word) + public static function parseQuestion($tagged_question, $index, + $question_word) { $generated_questions = []; $tree = ["cur_node" => 0]; - $parse_tree = self::parseWholePhrase($tagged_question, - $tree); + $parse_tree = self::parseWholePhrase($tagged_question, $tree); $triplets = self::extractTripletsParseTree($parse_tree); $triplet_types = ['CONCISE', 'RAW']; foreach ($triplet_types as $type) { if (!empty($triplets['subject'][$type]) && !empty($triplets['predicate'][$type]) && !empty($triplets['object'][$type])) { - $sub=trim($triplets['subject'][$type]); - $sub=preg_replace("/^.*".$question_word.".*$/",self::$question_token, $sub); - $pre=trim($triplets['predicate'][$type]); - $pre=preg_replace("/^.*".$question_word.".*$/",self::$question_token, $pre); - $obj=trim($triplets['object'][$type]); - $obj=preg_replace("/^.*".$question_word.".*$/",self::$question_token, $obj); + $sub = trim($triplets['subject'][$type]); + $sub = preg_replace("/^.*".$question_word.".*$/u", + self::$question_token, $sub); + $pre = trim($triplets['predicate'][$type]); + $pre = preg_replace("/^.*".$question_word.".*$/u", + self::$question_token, $pre); + $obj = trim($triplets['object'][$type]); + $obj = preg_replace("/^.*".$question_word.".*$/u", + self::$question_token, $obj); $generated_questions[$type][] = $obj . " " . $pre . " " . $sub; $generated_questions[$type][] = $sub . " " . $pre . " " . 
$obj; } @@ -1119,52 +1116,52 @@ class Tokenizer if (!isset($type_list["any"])) { return ["ques_words"=>"","types"=>""]; } - $types=""; - $ques_words=""; - for($i=0; $i < count($term_array); $i++ ) { + $types = ""; + $ques_words = ""; + for($i = 0; $i < count($term_array); $i++ ) { foreach($type_list["any"] as $key => $value) { if (preg_match('/^('.$key.')$/u',$term_array[$i])) { if (is_array($value)) { if(isset($value["after"])) { - $found_after=false; + $found_after = false; if (array_key_exists($i+1,$term_array)) { foreach($value["after"] as $key2 => $value2) { - if (preg_match('/^('.$key2.')$/u', - $term_array[$i+1])) { - $ques_words=$term_array[$i]. - " ".$term_array[$i+1]; - $types=$value2; - $found_after=true; + if (preg_match('/^(' . $key2 . ')$/u', + $term_array[$i + 1])) { + $ques_words = $term_array[$i]. + " " . $term_array[$i + 1]; + $types = $value2; + $found_after = true; break; } } } - if (!$found_after && isset($type_list["other"]) - && $value["other"]) { - $ques_words=$term_array[$i]; - $types=$value["other"]; + if (!$found_after && isset($type_list["other"]) && + $value["other"]) { + $ques_words = $term_array[$i]; + $types = $value["other"]; } - } elseif(isset($value["any"])) { - $t=self::questionType($term_array,$value); - $ques_words[]=$term_array[$i]; - $types=$t["types"]; + } elseif (isset($value["any"])) { + $t = self::questionType($term_array,$value); + $ques_words[] = $term_array[$i]; + $types = $t["types"]; } } elseif ($value) { - $ques_words=$term_array[$i]; - $types=$value; + $ques_words = $term_array[$i]; + $types = $value; } } } } - if ($types == "" && isset($type_list["other"])){ + if ($types == "" && isset($type_list["other"])) { if (is_array($type_list["other"])) { - $t=self::questionType($term_array, $type_list["other"]); - $ques_words=$t["ques_words"]; - $types=$t["types"]; - } elseif ( $type_list["other"]) { - $types=$type_list["other"]; + $t = self::questionType($term_array, $type_list["other"]); + $ques_words = $t["ques_words"]; 
+ $types = $t["types"]; + } elseif ($type_list["other"]) { + $types = $type_list["other"]; } } - return ["ques_words"=>$ques_words,"types"=>$types]; + return ["ques_words" => $ques_words, "types" => $types]; } } diff --git a/src/locale/zh_CN/resources/ner_weight.txt.gz b/src/locale/zh_CN/resources/nect_weights.txt.gz similarity index 100% rename from src/locale/zh_CN/resources/ner_weight.txt.gz rename to src/locale/zh_CN/resources/nect_weights.txt.gz diff --git a/src/locale/zh_CN/resources/pos_weight.txt.gz b/src/locale/zh_CN/resources/pos_weights.txt.gz similarity index 100% rename from src/locale/zh_CN/resources/pos_weight.txt.gz rename to src/locale/zh_CN/resources/pos_weights.txt.gz diff --git a/src/locale/zh_CN/resources/term_weight.txt.gz b/src/locale/zh_CN/resources/term_weights.txt.gz similarity index 100% rename from src/locale/zh_CN/resources/term_weight.txt.gz rename to src/locale/zh_CN/resources/term_weights.txt.gz diff --git a/src/models/GroupModel.php b/src/models/GroupModel.php index fe9abf374..2b9d6992f 100644 --- a/src/models/GroupModel.php +++ b/src/models/GroupModel.php @@ -40,8 +40,8 @@ use seekquarry\yioop\library\processors\ImageProcessor; use seekquarry\yioop\models\ImpressionModel; /** - * This is class is used to handle - * db results related to Group Administration. Groups are collections of + * This class is used to handle db results related to Group Administration. + * Groups are collections of * users who might access a common blog/news feed and set of pages. This * method also controls adding and deleting entries to a group feed and * does limited access control checks of these operations. @@ -777,10 +777,10 @@ class GroupModel extends Model implements MediaConstants * @param int $type flag saying what kind of group item this is. 
One of * STANDARD_GROUP_ITEM, WIKI_GROUP_ITEM (used for threads discussing * a wiki page) - * @param string $url a url associated with this group item (mainly for - * search group) * @param int $post_time timstamp for when this group item was created * default to the current time + * @param string $url a url associated with this group item (mainly for + * search group) * @return int $id of item added */ public function addGroupItem($parent_id, $group_id, $user_id, $title, @@ -1107,7 +1107,9 @@ class GroupModel extends Model implements MediaConstants return $row['NUM'] ?? false; } /** - * + * Returns the most recent post posted to a group + * @param int $group_id id of the group to get the most recent post for + * @return array associative array of post details */ public function getMostRecentGroupPost($group_id) { @@ -1141,7 +1143,9 @@ class GroupModel extends Model implements MediaConstants return $db->fetchArray($result); } /** - * + * Returns the number of distinct threads in a group's feed + * @param int $group_id id of the group to get thread count for + * @return int number of threads */ public function getGroupThreadCount($group_id) { @@ -1157,7 +1161,9 @@ class GroupModel extends Model implements MediaConstants return $row['NUM'] ?? 
0; } /** - * + * Returns the number of posts to a group + * @param int $group_id id of the group to get post count for + * @return int number of posts */ public function getGroupPostCount($group_id) { diff --git a/src/models/LocaleModel.php b/src/models/LocaleModel.php index e2c31491c..db54aa209 100755 --- a/src/models/LocaleModel.php +++ b/src/models/LocaleModel.php @@ -138,7 +138,7 @@ class LocaleModel extends Model * @param string $locale one getRows row corresponding to a given locale * @param mixed $args additional arguments that might be used for this * method (none used for this sub-class) - * @return $locale row with PERCENT_WITH_STRINGS field added + * @return array $locale row with PERCENT_WITH_STRINGS field added */ public function rowCallback($locale, $args) { diff --git a/src/models/Model.php b/src/models/Model.php index 5754563c1..8d886bfc2 100755 --- a/src/models/Model.php +++ b/src/models/Model.php @@ -594,7 +594,7 @@ class Model implements CrawlConstants * * @param int $limit starting row from the potential results to return * @param int $num number of rows after start row to return - * @param int& $total gets set with the total number of rows that + * @param int &$total gets set with the total number of rows that * can be returned by the given database query * @param array $search_array each element of this is a * quadruple name of a field, what comparison to perform, a value to diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 8f5afa79d..a7b6c44e9 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -529,7 +529,7 @@ class PhraseModel extends ParallelModel * phrases, the weight that should be put on these query results, and * which archive to use. * - * @param string& $phrase string to extract struct from, if the phrase + * @param string &$phrase string to extract struct from, if the phrase * semantics is guessed or an if condition is processed the value of * phrase will be altered. 
(Helps for feeding to network queries) * @return array struct representing the conjunctive query @@ -1446,8 +1446,8 @@ class PhraseModel extends ParallelModel * are HTTP Location redirect page's then looks these up in turn. * This method handles robot meta tags which might forbid indexing. * - * @param array& $pages of page data without text summaries - * @param array& $queue_servers array of queue server to find data on + * @param array &$pages of page data without text summaries + * @param array &$queue_servers array of queue server to find data on * @param int $raw only lookup locations if 0 * @param bool $groups_with_docs whether to return only groups that * contain at least one doc as opposed to a groups with only links @@ -1660,7 +1660,7 @@ class PhraseModel extends ParallelModel * when making iterator get sub-iterators to advance to gen doc_offset * stored with respect to save_timestamp if exists. * - * @return &object an iterator for iterating through results to the + * @return object an iterator for iterating through results to the * query */ public function getQueryIterator($word_structs, $filter, $raw, diff --git a/src/models/SearchverticalsModel.php b/src/models/SearchverticalsModel.php index 413c14533..4ec2bb695 100644 --- a/src/models/SearchverticalsModel.php +++ b/src/models/SearchverticalsModel.php @@ -143,14 +143,14 @@ class SearchverticalsModel extends GroupModel * the ordered pair is used later for the PARENT_ID and USER_ID in the * (both of which have indexes) Search group look-up. * - * @param string an input to be hash to a pair of integers - * @param bool $compute_hash flag to chheck if a crawlHash is done - * before converting the result to an ordered pair. In some situations - * the url or host of the url has already been hashed so don't want - * to hash it again. 
- * @return array [int, int] that corresponds to the hash of the input - * to keep postgres happy (no unsigned ints) we make the value - * of this function signed + * @param string $input to be hashed to a pair of integers + * @param bool $compute_hash flag to check if a crawlHash is done + * before converting the result to an ordered pair. In some situations + * the url or host of the url has already been hashed so don't want + * to hash it again. + * @return array [int, int] that corresponds to the hash of the input + * to keep postgres happy (no unsigned ints) we make the value + * of this function signed */ public function hashIntPair($input, $compute_hash = true) { @@ -164,7 +164,14 @@ class SearchverticalsModel extends GroupModel return [$front, $back]; } /** + * Given a $query and a $locale_tag returns an ordered set + * of urls to put at the top of the search results for that query + * if such a map has been defined. * + * @param string $query user supplied query + * @param string $locale_tag language that the lookup of urls should + * be done for + * @return array of urls that correspond to the query */ public function getQueryMap($query, $locale_tag) { @@ -187,7 +194,13 @@ class SearchverticalsModel extends GroupModel return $map_urls; } /** - * + * Stores a query map into the public database. + * A query map associates a $query in a $locale_tag language to a set of + * urls desired to be at the top of the search results. + * @param string $query that triggers the mapping + * @param array $map_urls urls that should appear at the top of the search + * results + * @param string $locale_tag for the language the map should apply to */ public function setQueryMap($query, $map_urls, $locale_tag) { @@ -211,13 +224,13 @@ class SearchverticalsModel extends GroupModel * This is used in wiki read mode for search result verticals or edit mode * (the wiki info is not pre-parsed) for editing the knowledge wiki page. 
* - * @param string $query to get knowledge wiki results for - * @param string $locale the locale tag language that one want the results - * for - * @param bool $edit_mode whether the wiki page should be pre-parsed - * (suitable for display in query results) or left unparsed (suitable - * for editing). - * @return array knowledge wiki page info + * @param string $query to get knowledge wiki results for + * @param string $locale_tag the locale tag language that one wants the + * results for + * @param bool $edit_mode whether the wiki page should be pre-parsed + * (suitable for display in query results) or left unparsed (suitable + * for editing). + * @return array knowledge wiki page info */ public function getKnowledgeWiki($query, $locale_tag, $edit_mode = false) { diff --git a/src/models/SourceModel.php b/src/models/SourceModel.php index 2d8e38435..e29c3b90b 100644 --- a/src/models/SourceModel.php +++ b/src/models/SourceModel.php @@ -384,7 +384,12 @@ class SourceModel extends ParallelModel $db->execute($sql, [$locale_string]); } /** + * Used to delete any feed data (IndexDataFeed bundle) and trending + * data in this Yioop installation. * + * @param array $machine_urls a list of machines which are running + * MediaUpdaters for this instance of Yioop. If empty assume is just + * the Name Server */ public function clearFeedData($machine_urls = null) { diff --git a/src/models/TrendingModel.php b/src/models/TrendingModel.php index 64e5cfe08..9084f7425 100644 --- a/src/models/TrendingModel.php +++ b/src/models/TrendingModel.php @@ -120,6 +120,8 @@ class TrendingModel extends Model implements MediaConstants * to category supplied. 
* * @param string $locale_tag language to get random trending terms for + * @param string $category category to compute trending terms for + * @param int $num_terms number of trending terms to return * @return array terms which are trending */ public function randomTrends($locale_tag, $category = 'news', $num_terms = diff --git a/src/views/AdminView.php b/src/views/AdminView.php index 21500e37f..c4d53dd01 100755 --- a/src/views/AdminView.php +++ b/src/views/AdminView.php @@ -31,6 +31,7 @@ namespace seekquarry\yioop\views; /** + * View used to draw activity list and current activity for a logged in user * * @author Chris Pollett */ diff --git a/src/views/ComponentView.php b/src/views/ComponentView.php index b13f72dd8..185097677 100755 --- a/src/views/ComponentView.php +++ b/src/views/ComponentView.php @@ -37,7 +37,8 @@ use seekquarry\yioop\library\UrlParser; use seekquarry\yioop\views\elements\Element; /** - * + * Base class for views created by adding elements to top, sub-top, same, + * opposite, center columns, or bottom positions * * @author Chris Pollett */ @@ -53,7 +54,8 @@ class ComponentView extends View */ private $containers = []; /** - * + * Method used to draw the components of this ComponentView + * @param array $data containing fields to render the elements on this view */ public function renderView($data) { diff --git a/src/views/CrawlstatusView.php b/src/views/CrawlstatusView.php index 355329368..19bda48b0 100755 --- a/src/views/CrawlstatusView.php +++ b/src/views/CrawlstatusView.php @@ -32,7 +32,6 @@ namespace seekquarry\yioop\views; use seekquarry\yioop as B; use seekquarry\yioop\configs as C; - /** * This view is used to display information about * crawls that have been made by this seek_quarry instance @@ -47,7 +46,7 @@ class CrawlstatusView extends View * about the currently active crawl.The $data is supplied by the crawlStatus * method of the AdminController. 
* - * @param array $data info about the current crawl status + * @param array $data info about the current crawl status */ public function renderView($data) { @@ -353,7 +352,9 @@ class CrawlstatusView extends View <?php } /** - * + * Draws the form used to start a new crawl + * @param array $data containing CSRF_TOKEN field and other field used + * to draw this form */ public function renderCrawlForm($data) {?> diff --git a/src/views/elements/AdminElement.php b/src/views/elements/AdminElement.php index 79132dd08..ab2b47848 100644 --- a/src/views/elements/AdminElement.php +++ b/src/views/elements/AdminElement.php @@ -37,6 +37,7 @@ use seekquarry\yioop\library\UrlParser; use seekquarry\yioop\views\elements\Element; /** + * Element used to render the admin interface for a logged in user of Yioop * * @author Chris Pollett */ diff --git a/src/views/elements/AdminbarElement.php b/src/views/elements/AdminbarElement.php index 7e51841fd..3abb6d97e 100644 --- a/src/views/elements/AdminbarElement.php +++ b/src/views/elements/AdminbarElement.php @@ -43,7 +43,7 @@ class AdminbarElement extends Element * Used to draw the navigation bar on the admin portion * of the yioop website * - * @param array $data contains antiCSRF token, as well as data on + * @param array $data contains anti-CSRF token, as well as data on * used to render what the current admin activity is */ public function render($data) @@ -82,6 +82,13 @@ class AdminbarElement extends Element </div> <?php } + /** + * Used to draw the hamburger menu symbol and associated link to the + * settings menu + * + * @param bool $logged_in whether or not the user is logged in. 
If so, + * the hamburger menu symbol draws the users name + */ public function renderSettingsToggle($logged_in) { ?> <div class="settings" id="settings-toggle" diff --git a/src/views/elements/BotstoryElement.php b/src/views/elements/BotstoryElement.php index f428d7c17..2bc3e1389 100644 --- a/src/views/elements/BotstoryElement.php +++ b/src/views/elements/BotstoryElement.php @@ -139,7 +139,9 @@ class BotstoryElement extends Element <?php } /** - * + * Used to draw the form to add or update a bot story + * @param array $data containing field values that have already + * been filled in and the anti-CSRF attack token */ public function renderBotStoryForm($data) { diff --git a/src/views/elements/GroupfeedElement.php b/src/views/elements/GroupfeedElement.php index 263ad0501..9a4936670 100644 --- a/src/views/elements/GroupfeedElement.php +++ b/src/views/elements/GroupfeedElement.php @@ -134,7 +134,7 @@ class GroupfeedElement extends Element implements CrawlConstants * * @param string $paging_query stem for all links * drawn in view - * @param array& $data fields used to draw the queue + * @param array &$data fields used to draw the queue */ public function renderGroupedView($paging_query, &$data) { @@ -174,7 +174,7 @@ class GroupfeedElement extends Element implements CrawlConstants * @param string $base_query url that serves as the stem for all links * drawn in view * @param string $paging_query base_query concatenated with limit and num - * @param array& $data fields used to draw the queue + * @param array &$data fields used to draw the queue * @return array $page last feed item processed */ public function renderUngroupedView($logged_in, $base_query, $paging_query, @@ -455,9 +455,13 @@ class GroupfeedElement extends Element implements CrawlConstants return $page; } /** + * Used to slightly clean up hypertext links before drawing them + * (get rid of empty queries, avoid double encoding) * + * @param string $url to clean up + * @return string cleaned url */ - public 
function formatHref($url) + private function formatHref($url) { return rtrim(html_entity_decode($url), '?'); } diff --git a/src/views/elements/ManageclassifiersElement.php b/src/views/elements/ManageclassifiersElement.php index d36756d91..f588f4dc5 100644 --- a/src/views/elements/ManageclassifiersElement.php +++ b/src/views/elements/ManageclassifiersElement.php @@ -58,6 +58,7 @@ class ManageclassifiersElement extends Element <?php } /** + * Draws the table of currently defined classifiers for the Yioop system * @param array $data info about current users and current mixes, CSRF token */ public function renderClassifiersTable($data) diff --git a/src/views/elements/ManagecreditsElement.php b/src/views/elements/ManagecreditsElement.php index b8c9e4b31..a8b535672 100644 --- a/src/views/elements/ManagecreditsElement.php +++ b/src/views/elements/ManagecreditsElement.php @@ -43,7 +43,8 @@ class ManagecreditsElement extends Element { /** * Draws create advertisement form and existing campaigns - * @param array $data + * @param array $data containing field values that have already + * been filled in, data about existing campaigns and the anti-CSRF attack token */ public function render($data) { @@ -102,7 +103,10 @@ class ManagecreditsElement extends Element <?php } /** - * + * Draws the form used to create or edit a keyword + * advertisement + * @param array $data containing field values that have already + * been filled in and the anti-CSRF attack token */ public function renderCreditsForm($data) { ?> diff --git a/src/views/elements/ManagerolesElement.php b/src/views/elements/ManagerolesElement.php index bea8b7002..f09080a3d 100644 --- a/src/views/elements/ManagerolesElement.php +++ b/src/views/elements/ManagerolesElement.php @@ -49,13 +49,15 @@ class ManagerolesElement extends Element * available roles or which activity has what role */ public function render($data) - { ?> + {?> <div class="current-activity"> <?= $this->renderRoleTable($data); ?> </div> <?php } 
/** + * Draws the table to display the currently available roles + * and their properties in this Yioop system * @param array $data info about current users and current roles, CSRF token */ public function renderRoleTable($data) diff --git a/src/views/elements/ManageusersElement.php b/src/views/elements/ManageusersElement.php index 573809eef..eb867cba0 100644 --- a/src/views/elements/ManageusersElement.php +++ b/src/views/elements/ManageusersElement.php @@ -62,6 +62,8 @@ class ManageusersElement extends Element <?php } /** + * Draws the table that displays the users and their properties for + * the Yioop system * @param array $data info about current users and current roles, CSRF token */ public function renderUserTable($data) @@ -534,32 +536,31 @@ class ManageusersElement extends Element ?> <div class="center"> <?php - $action_url = $base_url. "&user_name=". - $data['CURRENT_USER']['user_name']. - "&role_filter=".$data['ROLE_FILTER']. - "&group_filter=".$data['GROUP_FILTER']; - if ($limit >= C\NUM_RESULTS_PER_PAGE ) { - ?><a href='<?= - "$action_url&arg=edituser&$context" . - "&group_limit=". - ($limit - C\NUM_RESULTS_PER_PAGE) ?>' - ><<</a><?php - } - ?> + $action_url = $base_url. "&user_name=". + $data['CURRENT_USER']['user_name']. + "&role_filter=".$data['ROLE_FILTER']. + "&group_filter=".$data['GROUP_FILTER']; + if ($limit >= C\NUM_RESULTS_PER_PAGE ) { + ?><a href='<?= + "$action_url&arg=edituser&$context" . + "&group_limit=". + ($limit - C\NUM_RESULTS_PER_PAGE) ?>' + ><<</a><?php + } ?> <input class="very-narrow-field center" name="group_filter" type="text" maxlength="<?= C\SHORT_TITLE_LEN ?>" value='<?= $data['GROUP_FILTER'] ?>' /> <?php - if ($data['NUM_USER_GROUPS'] > $limit + - C\NUM_RESULTS_PER_PAGE) { - ?><a href='<?= - "$action_url&arg=edituser&$context" . - "&group_limit=". - ($limit + C\NUM_RESULTS_PER_PAGE) - ?>'>>></a> + if ($data['NUM_USER_GROUPS'] > $limit + + C\NUM_RESULTS_PER_PAGE) { + ?><a href='<?= + "$action_url&arg=edituser&$context" . 
+ "&group_limit=". + ($limit + C\NUM_RESULTS_PER_PAGE) + ?>'>>></a> <?php - } + } ?><br /> <button type="submit" name="change_filter" value="group"><?= tl('manageusers_element_filter') diff --git a/src/views/elements/MediajobsElement.php b/src/views/elements/MediajobsElement.php index 4dba0780f..2084ce8e7 100644 --- a/src/views/elements/MediajobsElement.php +++ b/src/views/elements/MediajobsElement.php @@ -34,14 +34,19 @@ use seekquarry\yioop as B; use seekquarry\yioop\configs as C; /** + * Element used to draw toggles indicating which jobs the Media Updater + * will run and letting the user turn these jobs on/off. * * @author Chris Pollett */ class MediajobsElement extends Element { /** + * Draws interface to allow users to say which jobs will run in the + * MediaUpdater. Also used to draw the nameserver/distributed mode toggle * - * @param array $data holds data on + * @param array $data with field containing the nonstatic values needed + * to draw this element */ public function render($data) { diff --git a/src/views/elements/MixcrawlsElement.php b/src/views/elements/MixcrawlsElement.php index ede130eb2..d0ce407a3 100644 --- a/src/views/elements/MixcrawlsElement.php +++ b/src/views/elements/MixcrawlsElement.php @@ -56,6 +56,7 @@ class MixcrawlsElement extends Element <?php } /** + * Draw the table that displays the currently defined crawl mixes * @param array $data info about current users and current mixes, CSRF token */ public function renderMixesTable($data) diff --git a/src/views/elements/PaginationElement.php b/src/views/elements/PaginationElement.php index 6332075c4..62c99e740 100644 --- a/src/views/elements/PaginationElement.php +++ b/src/views/elements/PaginationElement.php @@ -32,13 +32,19 @@ namespace seekquarry\yioop\views\elements; use seekquarry\yioop\configs as C; /** - * + * Element responsible for drawing the sequence of available pages for + * search results. 
* @author Chris Pollett */ class PaginationElement extends Element { /** - * + * Draws the sequence of available pages for + * search results. (next/prev links and group of pages) + * @param array $data containing fields with info about the total + * number of search results for the query, the subsearch the query + * is for, the desired number of results per page, which page we are on, + * etc. */ public function render($data) { diff --git a/src/views/elements/SearchElement.php b/src/views/elements/SearchElement.php index e9c541d49..d92c8dbe4 100644 --- a/src/views/elements/SearchElement.php +++ b/src/views/elements/SearchElement.php @@ -391,7 +391,9 @@ class SearchElement extends Element implements CrawlConstants <?php } /** - * + * Draws the landing page for this instance of Yioop when the default + * big search bar (rather than the Main public wiki page) is used + * @param array $data containing fields used to draw landing page */ public function renderSearchLanding($data) { diff --git a/src/views/elements/SearchsourcesElement.php b/src/views/elements/SearchsourcesElement.php index 9c2310bb4..012c07e62 100644 --- a/src/views/elements/SearchsourcesElement.php +++ b/src/views/elements/SearchsourcesElement.php @@ -34,7 +34,7 @@ use seekquarry\yioop as B; use seekquarry\yioop\configs as C; /** - * Contains the forms for managing search sources for news, etc. + * This element renders the forms for managing search sources for news, etc. 
* Also, contains form for managing subsearches which appear in SearchView * * @author Chris Pollett diff --git a/src/views/elements/SideadvertisementElement.php b/src/views/elements/SideadvertisementElement.php index bbf0150c7..1696393d0 100644 --- a/src/views/elements/SideadvertisementElement.php +++ b/src/views/elements/SideadvertisementElement.php @@ -32,13 +32,18 @@ namespace seekquarry\yioop\views\elements; use seekquarry\yioop\configs as C; /** + * Element used to draw an external server advertisement (if there is one) as + * a column on the opposite side of a search results page * * @author Chris Pollett */ class SideadvertisementElement extends Element { /** - * + * Draws an external server advertisement (if there is one) as a column + * on the opposite side of a search results page + * @param array $data with a field SIDE_ADSCRIPT that should contain + * the advertisement text */ public function render($data) { diff --git a/src/views/elements/TopadvertisementElement.php b/src/views/elements/TopadvertisementElement.php index 47ce1d2a0..06b240469 100644 --- a/src/views/elements/TopadvertisementElement.php +++ b/src/views/elements/TopadvertisementElement.php @@ -31,14 +31,20 @@ namespace seekquarry\yioop\views\elements; use seekquarry\yioop\configs as C; + /** + * This element is used to draw the keyword advertisement above search + * results (if present) * * @author Chris Pollett */ class TopadvertisementElement extends Element { /** - * + * Draws a keyword advertisement (if there is one) at the top + * of a search results page + * @param array $data with a field TOP_ADSCRIPT that should contain + * the advertisement text */ public function render($data) { diff --git a/src/views/helpers/FeedsHelper.php b/src/views/helpers/FeedsHelper.php index 701393723..5d5d2342e 100644 --- a/src/views/helpers/FeedsHelper.php +++ b/src/views/helpers/FeedsHelper.php @@ -42,7 +42,6 @@ use seekquarry\yioop\library\UrlParser; */ class FeedsHelper extends Helper implements 
CrawlConstants { - /** * Takes page summaries for RSS pages and the current query * and draws list of news links and a link to the news link subsearch @@ -51,6 +50,8 @@ class FeedsHelper extends Helper implements CrawlConstants * @param array $feed_pages page data from news feeds * @param string $csrf_token token to prevent cross site request forgeries * @param string $query the current search query + * @param string $subsearch the name of the subsearch of this feed + * For example, one could have sports feed, a news feed, etc * @param boolean $open_in_tabs whether new links should be opened in * tabs */ diff --git a/tests/PdfProcessorTest.php b/tests/PdfProcessorTest.php index fe25bd561..16e255cda 100644 --- a/tests/PdfProcessorTest.php +++ b/tests/PdfProcessorTest.php @@ -80,7 +80,7 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants "Word Extraction 3"); } /** - * + * Tests Tesseract text extraction from Images */ public function textFromImageTestCase() {