| Filename |
| --- |
| src/executables/QueueServer.php |
| src/library/MailServer.php |
| src/library/media_jobs/BulkEmailJob.php |
| src/scripts/suggest.js |
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 2c383b466..04bf85727 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -401,7 +401,6 @@ class QueueServer implements CrawlConstants, Join
                 $remove = true;
             }
         }
-
         if ($remove == true) {
             L\crawlLog("Remove old messages..", $this->process_name);
         }
@@ -736,7 +735,7 @@ class QueueServer implements CrawlConstants, Join
         $count = $this->web_queue->to_crawl_queue->count;
         $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
         if ($count < C\NUM_URLS_QUEUE_RAM -
-            C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links){
+            C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
             $info = $this->processQueueUrls();
         }
         if ($count > 0) {
@@ -2526,12 +2525,12 @@ class QueueServer implements CrawlConstants, Join
                 $next_slot = $this->getEarliestSlot($current_crawl_index,
                     $sites);
                 if ($next_slot < C\MAX_FETCH_SIZE) {
-                    $sites[$next_slot] = [$url, $weight, 0];
-                    $delete_urls[$i] = $url;
                     /* note don't add to seen url filter since check robots
                        every 24 hours as needed
                      */
+                    $sites[$next_slot] = [$url, $weight, 0];
                     $current_crawl_index = $next_slot;
+                    $delete_urls[$i] = $url;
                     $fetch_size++;
                     $i++;
                 } else { //no more available slots so prepare to bail
@@ -2546,114 +2545,124 @@ class QueueServer implements CrawlConstants, Join
             }
             //Now handle the non-robots.txt url case
             $robots_okay = true;
-            if ($has_robots) {
-                if ($no_flags) {
-                    if ($this->robots_txt == C\IGNORE_ROBOTS ||
-                        ($this->robots_txt == C\ALLOW_LANDING_ROBOTS &&
-                        rtrim($url, "/") == rtrim($host_url, "/"))) {
-                        $robots_okay = true;
-                    } else if (!isset($hard_coded) || !$hard_coded) {
-                        $robots_okay = $this->web_queue->checkRobotOkay($url);
+            if (!$has_robots) {
+                $i++;
+                continue;
+            }
+            if ($no_flags) {
+                if ($this->robots_txt == C\IGNORE_ROBOTS ||
+                    ($this->robots_txt == C\ALLOW_LANDING_ROBOTS &&
+                    rtrim($url, "/") == rtrim($host_url, "/"))) {
+                    $robots_okay = true;
+                } else if (!isset($hard_coded) || !$hard_coded) {
+                    $robots_okay = $this->web_queue->checkRobotOkay($url);
+                } else {
+                    $robots_okay = true;
+                }
+                if (!$this->allowedToCrawlSite($url) ||
+                    $this->disallowedToCrawlSite($url)) {
+                    /* This is checked when added to queue,
+                       we check again here in case allowed and disallowed
+                       sites have changed since then
+                     */
+                    $robots_okay = false;
+                }
+                if (!$robots_okay) {
+                    $delete_urls[$i] = $url;
+                    $this->web_queue->addSeenUrlFilter($url);
+                    $i++;
+                    continue;
+                }
+                $delay = $this->web_queue->getCrawlDelay($host_url);
+            }
+            if (!$this->withinQuota($url)) {
+                //we've not allowed to schedule $url till next hour
+                $delete_urls[$i] = $url;
+                //delete from queue (so no clog) but don't mark seen
+                $i++;
+                continue;
+            }
+            //each host has two entries in $this->waiting_hosts
+            $num_waiting = floor(count($this->waiting_hosts)/2);
+            if ($delay > 0) {
+                // handle adding a url if there is a crawl delay
+                $hash_host = L\crawlHash($host_url);
+                $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
+                /*
+                   To ensure that crawl-delay isn't violated by two separate
+                   fetchers crawling the same host, if a host has a crawl
+                   delay we only let it appear in one outstanding schedule
+                   at a time. When data appears back from the fetcher handling
+                   a crawl-delayed host, we'll clear it to appear in another
+                   schedule
+                 */
+                if ((!$is_waiting_host
+                    && $num_waiting < C\MAX_WAITING_HOSTS) ||
+                    $is_waiting_host && $this->waiting_hosts[$hash_host] ==
+                    $schedule_time) {
+                    $this->waiting_hosts[$hash_host] =
+                        $schedule_time;
+                    $this->waiting_hosts[$schedule_time][] =
+                        $hash_host;
+                    $request_batches_per_delay =
+                        ceil($delay/$time_per_request_guess);
+                    if (!isset($crawl_delay_hosts[$hash_host])) {
+                        $next_earliest_slot = $current_crawl_index;
+                        $crawl_delay_hosts[$hash_host] = $next_earliest_slot;
                     } else {
-                        $robots_okay = true;
-                    }
-                    if (!$this->allowedToCrawlSite($url) ||
-                        $this->disallowedToCrawlSite($url)) {
-                        /* This is checked when added to queue,
-                           we check again here in case allowed and disallowed
-                           sites have changed since then
-                         */
-                        $robots_okay = false;
+                        $next_earliest_slot = $crawl_delay_hosts[$hash_host]
+                            + $request_batches_per_delay
+                            * C\NUM_MULTI_CURL_PAGES;
                     }
-                    if (!$robots_okay) {
+                    if (($next_slot =
+                        $this->getEarliestSlot($next_earliest_slot,
+                        $sites)) < C\MAX_FETCH_SIZE) {
+                        $crawl_delay_hosts[$hash_host] = $next_slot;
                         $delete_urls[$i] = $url;
+                        $sites[$next_slot] = [$url, $weight, $delay];
                         $this->web_queue->addSeenUrlFilter($url);
-                        $i++;
-                        continue;
+                        /* we might miss some sites by marking them
+                           seen after only scheduling them
+                         */
+                        $fetch_size++;
+                    } else if ($no_flags) {
+                        $this->web_queue->setQueueFlag($url,
+                            $delay + WebQueueBundle::SCHEDULABLE);
                     }
-                    $delay = $this->web_queue->getCrawlDelay($host_url);
-                }
-                if (!$this->withinQuota($url)) {
-                    //we've not allowed to schedule $url till next hour
+                } else if (!$is_waiting_host) {
+                    // has crawl delay but too many already waiting
                     $delete_urls[$i] = $url;
                     //delete from queue (so no clog) but don't mark seen
                     $i++;
                     continue;
                 }
-                //each host has two entries in $this->waiting_hosts
-                $num_waiting = floor(count($this->waiting_hosts)/2);
-                if ($delay > 0 ) {
-                    // handle adding a url if there is a crawl delay
-                    $hash_host = L\crawlHash($host_url);
-                    $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
-                    if ((!$is_waiting_host
-                        && $num_waiting < C\MAX_WAITING_HOSTS) ||
-                        $is_waiting_host && $this->waiting_hosts[$hash_host] ==
-                        $schedule_time) {
-                        $this->waiting_hosts[$hash_host] =
-                            $schedule_time;
-                        $this->waiting_hosts[$schedule_time][] =
-                            $hash_host;
-                        $request_batches_per_delay =
-                            ceil($delay/$time_per_request_guess);
-                        if (!isset($crawl_delay_hosts[$hash_host])) {
-                            $next_earliest_slot = $current_crawl_index;
-                            $crawl_delay_hosts[$hash_host]= $next_earliest_slot;
-                        } else {
-                            $next_earliest_slot = $crawl_delay_hosts[$hash_host]
-                                + $request_batches_per_delay
-                                * C\NUM_MULTI_CURL_PAGES;
-                        }
-                        if (($next_slot =
-                            $this->getEarliestSlot( $next_earliest_slot,
-                            $sites)) < C\MAX_FETCH_SIZE) {
-                            $crawl_delay_hosts[$hash_host] = $next_slot;
-                            $delete_urls[$i] = $url;
-                            $sites[$next_slot] = [$url, $weight, $delay];
-                            $this->web_queue->addSeenUrlFilter($url);
-                            /* we might miss some sites by marking them
-                               seen after only scheduling them
-                             */
-                            $fetch_size++;
-                        } else if ($no_flags) {
-                            $this->web_queue->setQueueFlag($url,
-                                $delay + WebQueueBundle::SCHEDULABLE);
-                        }
-                    } else if (!$is_waiting_host) {
-                        // has crawl delay but too many already waiting
-                        $delete_urls[$i] = $url;
-                        //delete from queue (so no clog) but don't mark seen
-                        $i++;
-                        continue;
-                    }
-                } else { // add a url no crawl delay
-                    $next_slot = $this->getEarliestSlot(
-                        $current_crawl_index, $sites);
-                    if ($next_slot < C\MAX_FETCH_SIZE) {
-                        $sites[$next_slot] = [$url, $weight, 0];
-                        $delete_urls[$i] = $url;
-                        $this->web_queue->addSeenUrlFilter($url);
-                        /* we might miss some sites by marking them
-                           seen after only scheduling them
-                         */
-                        $current_crawl_index = $next_slot;
-                        $fetch_size++;
-                    } else { //no more available slots so prepare to bail
-                        $i = $count;
-                        if ($no_flags) {
-                            $this->web_queue->setQueueFlag($url,
-                                WebQueueBundle::SCHEDULABLE);
-                        }
+            } else { // add a url no crawl delay
+                $next_slot = $this->getEarliestSlot($current_crawl_index,
+                    $sites);
+                if ($next_slot < C\MAX_FETCH_SIZE) {
+                    $sites[$next_slot] = [$url, $weight, 0];
+                    $delete_urls[$i] = $url;
+                    $this->web_queue->addSeenUrlFilter($url);
+                    /* we might miss some sites by marking them
+                       seen after only scheduling them
+                     */
+                    $current_crawl_index = $next_slot;
+                    $fetch_size++;
+                } else { //no more available slots so prepare to bail
+                    $i = $count;
+                    if ($no_flags) {
+                        $this->web_queue->setQueueFlag($url,
+                            WebQueueBundle::SCHEDULABLE);
                     }
-                } //if delay else
-            } // if containsGotRobotTxt
-            // handle robots.txt urls
+                }
+            } //no crawl-delay else
             $i++;
         } //end while
         $this->web_queue->closeUrlArchive($fh);
         $new_time = microtime(true);
         L\crawlLog("...Scheduler: Done selecting URLS for fetch batch time ".
             "so far:". L\changeInMicrotime($start_time));
+        L\crawlLog("...Scheduler: Examined urls while making fetch batch: $i");
         $num_deletes = count($delete_urls);
         $k = 0;
         foreach ($delete_urls as $delete_url) {
@@ -2669,7 +2678,7 @@ class QueueServer implements CrawlConstants, Join
             }
         }
         L\crawlLog("...Scheduler: Removed $k URLS for fetch batch from ".
-            "queue in time: ".L\changeInMicrotime($new_time));
+            "queue in time: " . L\changeInMicrotime($new_time));
         $new_time = microtime(true);
         if (isset($sites) && count($sites) > 0 ) {
             $dummy_slot = [self::DUMMY, 0.0, 0];
@@ -2693,7 +2702,7 @@ class QueueServer implements CrawlConstants, Join
             //write schedule to disk
             $fh = fopen(C\CRAWL_DIR. "/schedules/".
-                self::schedule_name.$this->crawl_time.".txt", "wb");
+                self::schedule_name.$this->crawl_time . ".txt", "wb");
             fwrite($fh, $first_line);
             $num_sites = count($sites);
             $k = 0;
diff --git a/src/library/MailServer.php b/src/library/MailServer.php
index e0b5783bc..c3fc121b4 100644
--- a/src/library/MailServer.php
+++ b/src/library/MailServer.php
@@ -37,7 +37,7 @@ use seekquarry\yioop\library\MediaConstants;
 /**
  * Timing functions
  */
-require_once __DIR__."/Utility.php";
+require_once __DIR__ . "/Utility.php";
 /**
  * A small class for communicating with an SMTP server. Used to avoid
  * configuration issues that might be needed with PHP's built-in mail()
@@ -144,7 +144,7 @@ class MailServer implements MediaConstants
             "dev.null";
         $this->server = $server;
         if ($secure == "ssl") {
-            'ssl://'.$server;
+            'ssl://' . $server;
         }
         $this->port = $port;
         $this->login = $login;
@@ -239,7 +239,7 @@ class MailServer implements MediaConstants
         return $this->readResponseGetCode();
     }
     /**
-     * Sends (or queues for media updater)an email
+     * Sends (or queues for media updater) an email
      * (much like PHP's mail command, but not requiring
      * a configured smtp server on the current machine)
      *
@@ -340,14 +340,14 @@ class MailServer implements MediaConstants
                 webExit();
             }
         }
-        $files = glob($mail_directory."/*.txt");
+        $files = glob($mail_directory . "/*.txt");
         $file_count = count($files);
         $current_count = 0;
         $current_time = time();
         $diff = 0;
         if ($file_count > 0) {
             $file = end($files);
-            $file_name = str_replace($mail_directory."/", "", $file);
+            $file_name = str_replace($mail_directory . "/", "", $file);
             $last_file_time = substr($file_name, 0, -4);
             $diff = $current_time - $last_file_time;
         }
@@ -371,7 +371,7 @@ class MailServer implements MediaConstants
                 " for $file_time.txt!\n");
             }
         } else {
-            $fp = fopen($mail_directory."/".$last_file_time.".txt", "a+");
+            $fp = fopen($mail_directory . "/" . $last_file_time . ".txt", "a+");
             if (flock($fp, LOCK_EX | LOCK_NB)) {
                 crawlLog("....Lock acquired! Sending emails now!\n");
                 fwrite($fp, $mail_details);
diff --git a/src/library/media_jobs/BulkEmailJob.php b/src/library/media_jobs/BulkEmailJob.php
index c3511e8dc..b5dc6a79a 100644
--- a/src/library/media_jobs/BulkEmailJob.php
+++ b/src/library/media_jobs/BulkEmailJob.php
@@ -94,7 +94,7 @@ class BulkEmailJob extends MediaJob
         if (!$sendable_file) {
             return;
         }
-        L\crawlLog("Using Mail Directory:". $mail_directory);
+        L\crawlLog("Using Mail Directory:" . $mail_directory);
         $emails_string = file_get_contents($sendable_file);
         unlink($sendable_file);
         $emails = explode(self::MESSAGE_SEPARATOR, $emails_string);
@@ -150,7 +150,7 @@ class BulkEmailJob extends MediaJob
         if (!file_exists($mail_directory)) {
             return false;
         }
-        $files = glob($mail_directory."/*.txt");
+        $files = glob($mail_directory . "/*.txt");
         $sendable_file = false;
         foreach ($files as $email_file) {
             if (time() - filemtime($email_file) >
diff --git a/src/scripts/suggest.js b/src/scripts/suggest.js
index 52b2ee2ac..684135a44 100644
--- a/src/scripts/suggest.js
+++ b/src/scripts/suggest.js
@@ -733,7 +733,7 @@ function spellCheck()
     if (query.length > MIN_SPELL_CHECK_WIDTH) {
         return;
     }
-    if (corrected_query.trim() != query) {
+    if (corrected_query.trim() != query.toLowerCase()) {
         if (logged_in) {
             var token_name = csrf_name;
             var spell_link = "?" + token_name + "=" + csrf_token + "&q="