viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
Filename | |
---|---|
src/configs/PublicHelpPages.php | |
src/controllers/FetchController.php | |
src/data/public_default.db | |
src/error.php | |
src/executables/Fetcher.php | |
src/library/CrawlConstants.php |
diff --git a/src/configs/PublicHelpPages.php b/src/configs/PublicHelpPages.php index 4b7aed4d1..31644329b 100644 --- a/src/configs/PublicHelpPages.php +++ b/src/configs/PublicHelpPages.php @@ -39,21 +39,8 @@ namespace seekquarry\yioop\configs; * @var array */ $public_pages = []; -$public_pages["en-US"]["404"] = <<< 'EOD' -title=Page Not Found -description=The page you requested cannot be found on our server -END_HEAD_VARS -==The page you requested cannot be found.== -EOD; -$public_pages["en-US"]["409"] = <<< 'EOD' -title=Conflict - -description=Your request would result in an edit conflict. -END_HEAD_VARS -==Your request would result in an edit conflict, so will not be processed.== -EOD; -$public_pages["en-US"]["Podcast_Examples"] = <<< 'EOD' -page_type=media_list +$public_pages["en-US"]["400"] = <<< 'EOD' +page_type=standard page_alias= @@ -61,13 +48,13 @@ page_border=solid-border toc=true -title=Podcast+Examples +title= -author=Yioop+Team +author= -robots=NOINDEX%2C+NOFOLLOW +robots= -description=Used+to+save+example+feed+podcasts+listed+on+search+sources. +description= alternative_path= @@ -77,7 +64,24 @@ page_footer= sort=aname +END_HEAD_VARS==Bad Request== +Your request couldn't be processed by the server! +EOD; +$public_pages["en-US"]["404"] = <<< 'EOD' +title=Page Not Found +description=The page you requested cannot be found on our server END_HEAD_VARS +==The page you requested cannot be found.== +EOD; +$public_pages["en-US"]["409"] = <<< 'EOD' +title=Conflict + +description=Your request would result in an edit conflict. 
+END_HEAD_VARS +==Your request would result in an edit conflict, so will not be processed.== +EOD; +$public_pages["en-US"]["Podcast_Examples"] = <<< 'EOD' + EOD; $public_pages["en-US"]["Syntax"] = <<< 'EOD' page_type=standard @@ -2059,13 +2063,16 @@ If a regex is used rather than an xpath, then the first capture group of the reg Language: English Aux Url XPaths: /(https\:\/\/cdn.somenetwork.com\/nightly-news-netcast\/video\/nightly-[^\"]+)\"/ - /window\.\_\_data\s*\=\s*([^\n]+\}\;)/json|video|current|0|publicUrl + /window\.\_\_data\s*\=\s*([^ +]+\}\;)/json|video|current|0|publicUrl Download Xpath: //video[contains(@height,'540')] Wiki Destination: My Private Group@Podcasts/%Y-%m-%d.mp4 The initial page to be download will be: https://www.somenetwork.com/nightly-news. On this page, we will use the first Aux Path to find a string in the page that matches /(https\:\/\/www.somenetwork.com\/nightly-news-netcast\/video\/nightly-[^\"]+)\"/. The contents matching between the parentheses is the first capture group and will be the next url to download. SO for example, one might get a url: https://cdn.somenetwork.com/nightly-news-netcast/video/nightly-safghdsjfg -This url is then downloaded and a string matching the pattern /window\.\_\_data\s*\=\s*([^\n]+\}\;)/ is found. The capture group portion of this string consists of what matches ([^\n]+\}\;) is then converted to a JSON object, becausee of the json| in the Aux Url XPath. From this JSON object, we look at the video field, then the current subfields, its 0 subfield, and finally, the publicUrl field. This is the url we download next. Lastly, the download Xpath is then used to actually get the final video link from this downloaded page. +This url is then downloaded and a string matching the pattern /window\.\_\_data\s*\=\s*([^ +]+\}\;)/ is found. The capture group portion of this string consists of what matches ([^ +]+\}\;) is then converted to a JSON object, becausee of the json| in the Aux Url XPath. 
From this JSON object, we look at the video field, then the current subfields, its 0 subfield, and finally, the publicUrl field. This is the url we download next. Lastly, the download Xpath is then used to actually get the final video link from this downloaded page. Once this video is downloaded, it is stored in the Podcasts page's resource folder of the the My Private Group wiki group in a file with a name in the format: %Y-%m-%d.mp4. EOD; $help_pages["en-US"]["Monetization"] = <<< EOD diff --git a/src/controllers/FetchController.php b/src/controllers/FetchController.php index 88aaddd09..73243a928 100755 --- a/src/controllers/FetchController.php +++ b/src/controllers/FetchController.php @@ -81,6 +81,9 @@ class FetchController extends Controller implements CrawlConstants */ if (!$this->checkRequest()) { $this->web_site->header("HTTP/1.0 400 Bad Request"); + $_REQUEST['p'] = "400"; + $_REQUEST['c'] = "static"; + \seekquarry\yioop\bootstrap($this->web_site, false); return; } $activity = $_REQUEST['a']; @@ -133,7 +136,7 @@ class FetchController extends Controller implements CrawlConstants } else { $crawl_time = 0; } - $schedule_filename = C\CRAWL_DIR."/schedules/". + $schedule_filename = C\CRAWL_DIR . "/schedules/". self::schedule_name . 
"$crawl_time.txt"; if (file_exists($schedule_filename)) { $data['MESSAGE'] = file_get_contents($schedule_filename); diff --git a/src/data/public_default.db b/src/data/public_default.db index 45f0cd5c1..37d2e814f 100644 Binary files a/src/data/public_default.db and b/src/data/public_default.db differ diff --git a/src/error.php b/src/error.php index 82a3d155b..ec66f91c9 100755 --- a/src/error.php +++ b/src/error.php @@ -41,7 +41,7 @@ use seekquarry\yioop\controllers\StaticController; function webError() { if (!isset($_REQUEST['p']) || - !in_array($_REQUEST['p'], ["404", "409"])) { + !in_array($_REQUEST['p'], ["400", "404", "409"])) { $_REQUEST['p'] = "404"; } $_REQUEST['c'] = "static"; diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 5e0d0d165..e0666e470 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1160,7 +1160,14 @@ class Fetcher implements CrawlConstants } $info_string = trim($info_string); $tok = strtok($info_string, "\n"); - $info = unserialize(base64_decode($tok)); + $decode_token = base64_decode($tok); + L\crawlLog("First 256 bytes of first token in response:"); + L\crawlLog(substr($decode_token, 0, 256)); + if ($decode_token[0] != '{') { + L\crawlLog("Error in decoding response, requeest failed!!!!"); + return false; + } + $info = unserialize($decode_token); $this->setCrawlParamsFromArray($info); if (isset($info[self::SITES])) { $tok = strtok("\n"); //skip meta info diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index a9a806476..b0631a0d7 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -56,7 +56,7 @@ interface CrawlConstants const fetch_archive_iterator = "FetchArchiveIterator"; const save_point = "SavePoint"; const schedule_data_base_name = "ScheduleData"; - const schedule_name = "FetchSchedule"; + const schedule_name = "FetchSchedule"; //obtained from scheduler const robot_data_base_name = "RobotData"; const 
etag_expires_data_base_name = "EtagExpiresData"; const index_data_base_name = "IndexData"; @@ -65,7 +65,7 @@ interface CrawlConstants const network_crawllist_base_name = "NetworkCrawlList"; const statistics_base_name = "Statistics"; const index_closed_name = "IndexClosed"; - const fetch_batch_name = "FetchBatch"; + const fetch_batch_name = "FetchBatch"; //used to continue if fetcher halted const fetch_crawl_info = "FetchInfo"; const fetch_closed_name = "FetchClosed"; const data_base_name = "At";