viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 01735b4e9..22aa9a3be 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -15,7 +15,7 @@ * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU General Public License for more details.89 * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. @@ -559,6 +559,7 @@ class Fetcher implements CrawlConstants $local_archives = [""]; while (CrawlDaemon::processHandler()) { $start_time = microtime(true); + $info = []; $fetcher_message_file = C\CRAWL_DIR. "/schedules/{$prefix}FetcherMessages.txt"; if (file_exists($fetcher_message_file)) { @@ -577,6 +578,8 @@ class Fetcher implements CrawlConstants if ($info[self::CRAWL_TIME] == 0) { $info[self::STATUS] = self::NO_DATA_STATE; $this->to_crawl = []; + } else { + L\crawlLog("Crawl time is now " . $this->crawl_time); } } else if ($this->crawl_type == self::ARCHIVE_CRAWL && $this->arc_type != "WebArchiveBundle" && @@ -981,6 +984,11 @@ class Fetcher implements CrawlConstants if (isset($info[self::CRAWL_TIME]) && ($info[self::CRAWL_TIME] != $this->crawl_time || $info[self::CRAWL_TIME] == 0)) { + if ($info[self::CRAWL_TIME] > 0) { + L\crawlLog("New Crawl Time Found: {$info[self::CRAWL_TIME]}"); + } else { + L\crawlLog("Crawl Time Changing to 0"); + } $dir = C\CRAWL_DIR."/schedules"; $time_change = true; /* Zero out the crawl. If haven't done crawl before, then scheduler @@ -1033,6 +1041,8 @@ class Fetcher implements CrawlConstants "{$this->crawl_time}.txt") && file_exists( "$dir/$prefix".self::fetch_batch_name. "{$this->crawl_time}.txt")) { + L\crawlLog("Loading old batches for ". + "{$this->crawl_time}."); $info = unserialize(file_get_contents( "$dir/$prefix".self::fetch_crawl_info. "{$this->crawl_time}.txt")); @@ -1762,6 +1772,9 @@ class Fetcher implements CrawlConstants } $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]); + if (C\FETCHER_PROCESS_DELAY > 0 ) { + usleep(C\FETCHER_PROCESS_DELAY); + } if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) { $site[self::URL] = $tmp_url_store; diff --git a/src/library/TripletExtractor.php b/src/library/TripletExtractor.php index 2dd8bbc2f..6bb38874f 100644 --- a/src/library/TripletExtractor.php +++ b/src/library/TripletExtractor.php @@ -23,10 +23,9 @@ * @author Chris Pollett chris@pollett.org * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ - * @copyright 2009 - 2015 + * @copyright 2009 - 2016 * @filesource */ - namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; @@ -49,10 +48,10 @@ class TripletExtractor /** * Takes a phrase and tags each term in it with its part of speech. * So each term in the original phrase gets mapped to term~part_of_speech - * This tagger is based on a Brill tagger. It makes uses a lexicon + * This tagger is based on a Brill tagger. It uses a lexicon * consisting of words from the Brown corpus together with a list of * part of speech tags that that word had in the Brown Corpus. These are - * used to get an initial part of speech (in word was not present than + * used to get an initial part of speech (if word was not present than * we assume it is a noun). From this a fixed set of rules is used to modify * the initial tag if necessary. * @@ -86,13 +85,13 @@ class TripletExtractor } preg_match_all("/[\w\d]+/", $text, $matches); $tokens = $matches[0]; - $nouns = array('NN', 'NNS', 'NNP'); - $verbs = array('VBD', 'VBP', 'VB'); - $result = array(); - $previous = array('token' => -1, 'tag' => -1); + $nouns = ['NN', 'NNS', 'NNP']; + $verbs = ['VBD', 'VBP', 'VB']; + $result = []; + $previous = ['token' => -1, 'tag' => -1]; $previous_token = -1; sort($tokens); - $dictionary = array(); + $dictionary = []; /* Notice we sorted the tokens, and notice how we use $cur_pos so only advance forward through $lex_string. So the @@ -118,9 +117,9 @@ class TripletExtractor $tag_list = array(); foreach ($matches[0] as $token) { $prev_tag_list = $tag_list; - $tag_list = array(); + $tag_list = []; // default to a common noun - $current = array('token' => $token, 'tag' => 'NN'); + $current = ['token' => $token, 'tag' => 'NN']; // remove trailing full stops $token = strtolower(rtrim($token, ".")); if (isset($dictionary[$token])) { @@ -208,7 +207,7 @@ class TripletExtractor public static function taggedPartOfSpeechTokensToString($tagged_tokens) { $tagged_phrase = ""; - $simplified_parts_of_speech = array( + $simplified_parts_of_speech = [ "NN" => "NN", "NNS" => "NN", "NNP" => "NN", @@ -228,7 +227,7 @@ class TripletExtractor "RBR" => "AV", "RBS" => "AV", "WRB" => "AV" - ); + ]; foreach ($tagged_tokens as $t) { $tag = trim($t['tag']); $tag = (isset($simplified_parts_of_speech[$tag])) ? @@ -257,12 +256,12 @@ class TripletExtractor fclose($fh); } preg_match_all("/[\w\d\.]+/", $text, $matches); - $nouns = array('NN', 'NNS'); - $return = array(); + $nouns = ['NN', 'NNS']; + $return = []; $i = 0; foreach ($matches[0] as $token) { // default to a common noun - $return[$i] = array('token' => $token, 'tag' => 'NN'); + $return[$i] = ['token' => $token, 'tag' => 'NN']; // remove trailing full stops if (substr($token, -1) == '.') { $token = preg_replace('/\.+$/', '', $token); @@ -275,7 +274,7 @@ class TripletExtractor if ($i > 0) { if ($return[$i - 1]['tag'] == 'DT' && in_array($return[$i]['tag'], - array('VBD', 'VBP', 'VB')) + ['VBD', 'VBP', 'VB']) ) { $return[$i]['tag'] = 'NN'; } @@ -346,7 +345,7 @@ class TripletExtractor */ public static function generateParseTreeUsingRDP($tagger_array) { - $tree = array(); + $tree = []; $tree = ["cur_node" => 0]; $tree_np = TripletExtractor::extractNPUsingRDP($tagger_array, $tree); $tree = ["cur_node" => $tree_np['cur_node']]; @@ -578,7 +577,7 @@ class TripletExtractor */ public static function extractTriplet($tree) { - $triplet = array(); + $triplet = []; $triplet['subject'] = TripletExtractor::extractSubjectFromTree($tree); $triplet['predicate'] = TripletExtractor::extractPredicateFromTree($tree); @@ -592,7 +591,7 @@ class TripletExtractor */ public static function processTripletForStorage($triplet_tree) { - $processed_triplet = array(); + $processed_triplet = []; $processed_triplet['RAW'] = TripletExtractor::getRawTripletForStorage($triplet_tree); $processed_triplet['FEATURED'] = @@ -606,8 +605,8 @@ class TripletExtractor */ public static function getRawTripletForStorage($triplet_tree) { - $raw_triplet = array(); - $question_answer_triplet = array(); + $raw_triplet = []; + $question_answer_triplet = []; if (isset($triplet_tree['subject']['RAW']) && isset($triplet_tree['predicate']['RAW']) && isset($triplet_tree['object']['RAW']) @@ -641,8 +640,8 @@ class TripletExtractor */ public static function getFeaturedTripletForStorage($triplet_tree) { - $featured_triplet = array(); - $question_answer_triplet = array(); + $featured_triplet = []; + $question_answer_triplet = []; if (isset($triplet_tree['subject']['FEATURED']) && isset($triplet_tree['predicate']['FEATURED']) && isset($triplet_tree['object']['FEATURED']) @@ -685,7 +684,7 @@ class TripletExtractor */ public static function extractSubjectFromTree($tree) { - $subject = array(); + $subject = []; if (isset($tree['NP']) && $tree['NP'] != null) { $tree_np = $tree['NP']; $value = TripletExtractor::extractFirstNounFromNPTree($tree_np); @@ -709,7 +708,7 @@ class TripletExtractor */ public static function extractPredicateFromTree($tree) { - $predicate = array(); + $predicate = []; if (isset($tree['VP']) && $tree['VP'] != null) { $tree_vp = $tree['VP']; $value = TripletExtractor::extractDeepestVerbFromVBTree($tree_vp); @@ -736,7 +735,7 @@ class TripletExtractor */ public static function extractObjectFromTree($tree) { - $object = array(); + $object = []; if (isset($tree['VP']) && $tree['VP'] != null) { $tree_vp = $tree['VP']; if (isset($tree_vp['NP']) && $tree_vp['NP'] != null) { @@ -798,7 +797,7 @@ class TripletExtractor */ public static function extractAttributes($tree) { - $attribute_map = array(); + $attribute_map = []; if (isset($tree['JJ']) && count($tree['JJ']) > 0) { $attribute_map['JJ'] = $tree['JJ']['JJ']; } @@ -846,8 +845,8 @@ class TripletExtractor */ public static function storeStatementArraysAsTriplet($statement_array) { - $triplets_list = array(); - $question_list = array(); + $triplets_list = []; + $question_list = []; $question_answer_list = array(); foreach ($statement_array as $key => $value) { try { @@ -894,7 +893,7 @@ class TripletExtractor $question_string_tagged = TripletExtractor::partOfSpeechTagger_Brill( $question_string); $index = 0; - $generated_question_array = array(); + $generated_question_array = []; if (isset($question_string_tagged[$index]) && ("WRB" == trim($question_string_tagged[$index]['tag']) || "WP" == trim($question_string_tagged[$index]['tag'])) @@ -934,10 +933,10 @@ class TripletExtractor */ public static function parseWHOQuestion($question_string_tagged, $index) { - $generated_question_array = array(); + $generated_question_array = []; $tree = ["cur_node" => $index]; $tree['NP'] = "WHO"; - $triplet = array(); + $triplet = []; $tree_vp = TripletExtractor::extractVPUsingRDP( $question_string_tagged, $tree); $triplet['predicate'] = TripletExtractor::extractPredicateFromTree( @@ -989,7 +988,7 @@ class TripletExtractor */ public static function parseWHPlusQuestion($question_string_tagged, $index) { - $generated_question_array = array(); + $generated_question_array = []; $aux_verb = ""; while (isset($question_string_tagged[$index]) && ("VB" == trim($question_string_tagged[$index]['tag']) || @@ -1003,7 +1002,7 @@ class TripletExtractor } $tree = ["cur_node" => $index]; $tree['NP'] = "WHPlus"; - $triplet = array(); + $triplet = []; $tree_np = TripletExtractor::extractNPUsingRDP( $question_string_tagged, $tree); $triplet['subject'] = TripletExtractor::extractSubjectFromTree(