viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/configs/PublicHelpPages.php | |
src/controllers/FetchController.php | |
src/data/public_default.db | |
src/error.php | |
src/executables/Fetcher.php | |
src/library/CrawlConstants.php |
diff --git a/src/configs/PublicHelpPages.php b/src/configs/PublicHelpPages.php index 4b7aed4d1..31644329b 100644 --- a/src/configs/PublicHelpPages.php +++ b/src/configs/PublicHelpPages.php @@ -39,21 +39,8 @@ namespace seekquarry\yioop\configs; * @var array */ $public_pages = []; -$public_pages["en-US"]["404"] = <<< 'EOD' -title=Page Not Found -description=The page you requested cannot be found on our server -END_HEAD_VARS -==The page you requested cannot be found.== -EOD; -$public_pages["en-US"]["409"] = <<< 'EOD' -title=Conflict - -description=Your request would result in an edit conflict. -END_HEAD_VARS -==Your request would result in an edit conflict, so will not be processed.== -EOD; -$public_pages["en-US"]["Podcast_Examples"] = <<< 'EOD' -page_type=media_list +$public_pages["en-US"]["400"] = <<< 'EOD' +page_type=standard page_alias= @@ -61,13 +48,13 @@ page_border=solid-border toc=true -title=Podcast+Examples +title= -author=Yioop+Team +author= -robots=NOINDEX%2C+NOFOLLOW +robots= -description=Used+to+save+example+feed+podcasts+listed+on+search+sources. +description= alternative_path= @@ -77,7 +64,24 @@ page_footer= sort=aname +END_HEAD_VARS==Bad Request== +Your request couldn't be processed by the server! +EOD; +$public_pages["en-US"]["404"] = <<< 'EOD' +title=Page Not Found +description=The page you requested cannot be found on our server END_HEAD_VARS +==The page you requested cannot be found.== +EOD; +$public_pages["en-US"]["409"] = <<< 'EOD' +title=Conflict + +description=Your request would result in an edit conflict. 
+END_HEAD_VARS +==Your request would result in an edit conflict, so will not be processed.== +EOD; +$public_pages["en-US"]["Podcast_Examples"] = <<< 'EOD' + EOD; $public_pages["en-US"]["Syntax"] = <<< 'EOD' page_type=standard @@ -2059,13 +2063,16 @@ If a regex is used rather than an xpath, then the first capture group of the reg Language: English Aux Url XPaths: /(https\:\/\/cdn.somenetwork.com\/nightly-news-netcast\/video\/nightly-[^\"]+)\"/ - /window\.\_\_data\s*\=\s*([^\n]+\}\;)/json|video|current|0|publicUrl + /window\.\_\_data\s*\=\s*([^ +]+\}\;)/json|video|current|0|publicUrl Download Xpath: //video[contains(@height,'540')] Wiki Destination: My Private Group@Podcasts/%Y-%m-%d.mp4 The initial page to be download will be: https://www.somenetwork.com/nightly-news. On this page, we will use the first Aux Path to find a string in the page that matches /(https\:\/\/www.somenetwork.com\/nightly-news-netcast\/video\/nightly-[^\"]+)\"/. The contents matching between the parentheses is the first capture group and will be the next url to download. SO for example, one might get a url: https://cdn.somenetwork.com/nightly-news-netcast/video/nightly-safghdsjfg -This url is then downloaded and a string matching the pattern /window\.\_\_data\s*\=\s*([^\n]+\}\;)/ is found. The capture group portion of this string consists of what matches ([^\n]+\}\;) is then converted to a JSON object, becausee of the json| in the Aux Url XPath. From this JSON object, we look at the video field, then the current subfields, its 0 subfield, and finally, the publicUrl field. This is the url we download next. Lastly, the download Xpath is then used to actually get the final video link from this downloaded page. +This url is then downloaded and a string matching the pattern /window\.\_\_data\s*\=\s*([^ +]+\}\;)/ is found. The capture group portion of this string consists of what matches ([^ +]+\}\;) is then converted to a JSON object, becausee of the json| in the Aux Url XPath. 
From this JSON object, we look at the video field, then the current subfields, its 0 subfield, and finally, the publicUrl field. This is the url we download next. Lastly, the download Xpath is then used to actually get the final video link from this downloaded page. Once this video is downloaded, it is stored in the Podcasts page's resource folder of the the My Private Group wiki group in a file with a name in the format: %Y-%m-%d.mp4. EOD; $help_pages["en-US"]["Monetization"] = <<< EOD diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php index 88aaddd09..73243a928 100755 --- a/src/controllers/FetchController.php +++ b/src/controllers/FetchController.php @@ -81,6 +81,9 @@ class FetchController extends Controller implements CrawlConstants */ if (!$this->checkRequest()) { $this->web_site->header("HTTP/1.0 400 Bad Request"); + $_REQUEST['p'] = "400"; + $_REQUEST['c'] = "static"; + \seekquarry\yioop\bootstrap($this->web_site, false); return; } $activity = $_REQUEST['a']; @@ -133,7 +136,7 @@ class FetchController extends Controller implements CrawlConstants } else { $crawl_time = 0; } - $schedule_filename = C\CRAWL_DIR."/schedules/". + $schedule_filename = C\CRAWL_DIR . "/schedules/". self::schedule_name . 
"$crawl_time.txt"; if (file_exists($schedule_filename)) { $data['MESSAGE'] = file_get_contents($schedule_filename); diff --git a/src/data/public_default.db b/src/data/public_default.db index 45f0cd5c1..37d2e814f 100644 Binary files a/src/data/public_default.db and b/src/data/public_default.db differ diff --git a/src/error.php b/src/error.php index 82a3d155b..ec66f91c9 100755 --- a/src/error.php +++ b/src/error.php @@ -41,7 +41,7 @@ use seekquarry\yioop\controllers\StaticController; function webError() { if (!isset($_REQUEST['p']) || - !in_array($_REQUEST['p'], ["404", "409"])) { + !in_array($_REQUEST['p'], ["400", "404", "409"])) { $_REQUEST['p'] = "404"; } $_REQUEST['c'] = "static"; diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 5e0d0d165..e0666e470 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1160,7 +1160,14 @@ class Fetcher implements CrawlConstants } $info_string = trim($info_string); $tok = strtok($info_string, "\n"); - $info = unserialize(base64_decode($tok)); + $decode_token = base64_decode($tok); + L\crawlLog("First 256 bytes of first token in response:"); + L\crawlLog(substr($decode_token, 0, 256)); + if ($decode_token[0] != '{') { + L\crawlLog("Error in decoding response, requeest failed!!!!"); + return false; + } + $info = unserialize($decode_token); $this->setCrawlParamsFromArray($info); if (isset($info[self::SITES])) { $tok = strtok("\n"); //skip meta info diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index a9a806476..b0631a0d7 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -56,7 +56,7 @@ interface CrawlConstants const fetch_archive_iterator = "FetchArchiveIterator"; const save_point = "SavePoint"; const schedule_data_base_name = "ScheduleData"; - const schedule_name = "FetchSchedule"; + const schedule_name = "FetchSchedule"; //obtained from scheduler const robot_data_base_name = "RobotData"; const 
etag_expires_data_base_name = "EtagExpiresData"; const index_data_base_name = "IndexData"; @@ -65,7 +65,7 @@ interface CrawlConstants const network_crawllist_base_name = "NetworkCrawlList"; const statistics_base_name = "Statistics"; const index_closed_name = "IndexClosed"; - const fetch_batch_name = "FetchBatch"; + const fetch_batch_name = "FetchBatch"; //used to continue if fetcher halted const fetch_crawl_info = "FetchInfo"; const fetch_closed_name = "FetchClosed"; const data_base_name = "At";