Fix regex parsing error in FeedsUpdateJob, fix file name issue in WikiMediaJob, a=chris

Chris Pollett [2019-01-28 01:??:??]
Fix regex parsing error in FeedsUpdateJob, fix file name issue in WikiMediaJob, a=chris
Filename
src/configs/Createdb.php
src/controllers/Controller.php
src/data/public_default.db
src/library/media_jobs/AnalyticsJob.php
src/library/media_jobs/FeedsUpdateJob.php
src/library/media_jobs/WikiMediaJob.php
diff --git a/src/configs/Createdb.php b/src/configs/Createdb.php
index c01c32cf0..2c160c3da 100755
--- a/src/configs/Createdb.php
+++ b/src/configs/Createdb.php
@@ -458,7 +458,7 @@ $media_sources = [
     ['100000004', 'National Weather Service 4', 'regex', 'weather',
         'http://forecast.weather.gov/product.php?'.
         'site=NWS&issuedby=04&product=SCS&format=txt&version=1&glossary=0',
-        '/WEA\s+LO/HI\n+([^<]+)\n+NATIONAL/mi###/\n/###'.
+        '/WEA\s+LO\/HI\s*\n+([^<]+)\n+NATIONAL/mi###/\n/###'.
         '/^(.+?)\s\s\s+/###/\s\s\s+(.+?)$/###http://www.weather.gov/###',
         'en-US'],
     ['100000005', 'Ted', 'feed_podcast', '2592000',
diff --git a/src/controllers/Controller.php b/src/controllers/Controller.php
index a1124ddd0..258858a86 100755
--- a/src/controllers/Controller.php
+++ b/src/controllers/Controller.php
@@ -764,10 +764,10 @@ abstract class Controller
             case "file_name":
                 if (isset($value)) {
                     $value = str_replace("&amp;", "&", $value);
-                    $value = str_replace("/", "", $value);
-                    $value = str_replace("\\", "", $value);
-                    $value = str_replace("*", "", $value);
-                    $clean_value = str_replace(":", "", $value);
+                    $value = str_replace("/", "-", $value);
+                    $value = str_replace("\\", "-", $value);
+                    $value = str_replace("*", "-", $value);
+                    $clean_value = str_replace(":", "-", $value);
                 } else {
                     $clean_value = $default;
                 }
diff --git a/src/data/public_default.db b/src/data/public_default.db
index 37d2e814f..ce2b4c51d 100644
Binary files a/src/data/public_default.db and b/src/data/public_default.db differ
diff --git a/src/library/media_jobs/AnalyticsJob.php b/src/library/media_jobs/AnalyticsJob.php
index 2af3b856c..b515b7cf1 100644
--- a/src/library/media_jobs/AnalyticsJob.php
+++ b/src/library/media_jobs/AnalyticsJob.php
@@ -33,6 +33,7 @@ namespace seekquarry\yioop\library\media_jobs;
 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\library as L;
 use seekquarry\yioop\library\CrawlConstants;
+use seekquarry\yioop\library\UrlParser;
 use seekquarry\yioop\library\processors\PageProcessor;
 use seekquarry\yioop\models\ImpressionModel;
 use seekquarry\yioop\models\MachineModel;
@@ -330,8 +331,6 @@ class AnalyticsJob extends MediaJob
         $results = $this->phrase_model->getPhrasePageResults(
             "$query i:$index_timestamp", 0,
             1, true, null, false, 0, $machine_urls);
-        echo $query."\n";
-        print_r($results);
         return (isset($results["TOTAL_ROWS"])) ? $results["TOTAL_ROWS"] : -1;
     }
 }
diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php
index 8c994f357..d75ff8cb5 100644
--- a/src/library/media_jobs/FeedsUpdateJob.php
+++ b/src/library/media_jobs/FeedsUpdateJob.php
@@ -387,6 +387,7 @@ class FeedsUpdateJob extends MediaJob
                     $log_function("----Scraped channel is:", "h3");
                 }
                 $channel = "";
+                $nodes = [];
                 if (!empty($matches[1])) {
                     if ($test_mode) {
                         $log_function($matches[1]);
diff --git a/src/library/media_jobs/WikiMediaJob.php b/src/library/media_jobs/WikiMediaJob.php
index 2610fc83a..20bfd9662 100644
--- a/src/library/media_jobs/WikiMediaJob.php
+++ b/src/library/media_jobs/WikiMediaJob.php
@@ -39,6 +39,7 @@ use seekquarry\yioop\library\IndexShard;
 use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\UrlParser;
 use seekquarry\yioop\models\GroupModel;
+use seekquarry\yioop\controllers\CrawlController;

 /**
  * A media job to download and index feeds from various search sources (RSS,
@@ -588,6 +589,7 @@ class WikiMediaJob extends MediaJob
     public function downloadPodcastItemIfNew($item, &$podcast, $age)
     {
         $group_model = $this->group_model;
+        $controller = new CrawlController(); //only need for clean() method
         $pubdate = (empty($item['pubdate'])) ? time():
             (is_int($item['pubdate']) ? $item['pubdate'] :
             strtotime($item['pubdate']));
@@ -613,6 +615,7 @@ class WikiMediaJob extends MediaJob
             $podcast['LANGUAGE']);
         $file_name = $this->makeFileNamePattern($file_name, $file_pattern,
             substr($item['title'], 0, C\NAME_LEN), $pubdate);
+        $file_name = $controller->clean($file_name, "file_name");
         $type = UrlParser::getDocumentType($file_name);
         $data = $this->downloadPodcastItem($item['link'], $type);
         if ($data) {
ViewGit