Chris Pollett [2019-07-02]
Cosmetic tweaks: fix toLowerCase issue on spell correct, add log message about number of urls examined when making a fetch batch, a=chris
Filename
src/executables/QueueServer.php
src/library/MailServer.php
src/library/media_jobs/BulkEmailJob.php
src/scripts/suggest.js
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 2c383b466..04bf85727 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -401,7 +401,6 @@ class QueueServer implements CrawlConstants, Join
                 $remove = true;
             }
         }
-
         if ($remove == true) {
             L\crawlLog("Remove old messages..", $this->process_name);
         }
@@ -736,7 +735,7 @@ class QueueServer implements CrawlConstants, Join
                 $count = $this->web_queue->to_crawl_queue->count;
                 $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
                 if ($count < C\NUM_URLS_QUEUE_RAM -
-                    C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links){
+                    C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
                     $info = $this->processQueueUrls();
                 }
                 if ($count > 0) {
@@ -2526,12 +2525,12 @@ class QueueServer implements CrawlConstants, Join
                     $next_slot = $this->getEarliestSlot($current_crawl_index,
                         $sites);
                     if ($next_slot < C\MAX_FETCH_SIZE) {
-                        $sites[$next_slot] = [$url, $weight, 0];
-                        $delete_urls[$i] = $url;
                         /* note don't add to seen url filter
                            since check robots every 24 hours as needed
                          */
+                        $sites[$next_slot] = [$url, $weight, 0];
                         $current_crawl_index = $next_slot;
+                        $delete_urls[$i] = $url;
                         $fetch_size++;
                         $i++;
                     } else { //no more available slots so prepare to bail
@@ -2546,114 +2545,124 @@ class QueueServer implements CrawlConstants, Join
             }
             //Now handle the non-robots.txt url case
             $robots_okay = true;
-            if ($has_robots) {
-                if ($no_flags) {
-                    if ($this->robots_txt == C\IGNORE_ROBOTS ||
-                        ($this->robots_txt == C\ALLOW_LANDING_ROBOTS &&
-                        rtrim($url, "/") == rtrim($host_url, "/"))) {
-                        $robots_okay = true;
-                    } else if (!isset($hard_coded) || !$hard_coded) {
-                        $robots_okay = $this->web_queue->checkRobotOkay($url);
+            if (!$has_robots) {
+                $i++;
+                continue;
+            }
+            if ($no_flags) {
+                if ($this->robots_txt == C\IGNORE_ROBOTS ||
+                    ($this->robots_txt == C\ALLOW_LANDING_ROBOTS &&
+                    rtrim($url, "/") == rtrim($host_url, "/"))) {
+                    $robots_okay = true;
+                } else if (!isset($hard_coded) || !$hard_coded) {
+                    $robots_okay = $this->web_queue->checkRobotOkay($url);
+                } else {
+                    $robots_okay = true;
+                }
+                if (!$this->allowedToCrawlSite($url) ||
+                    $this->disallowedToCrawlSite($url)) {
+                    /* This is checked when added to queue,
+                       we check again here in case allowed and disallowed
+                       sites have changed since then
+                     */
+                    $robots_okay = false;
+                }
+                if (!$robots_okay) {
+                    $delete_urls[$i] = $url;
+                    $this->web_queue->addSeenUrlFilter($url);
+                    $i++;
+                    continue;
+                }
+                $delay = $this->web_queue->getCrawlDelay($host_url);
+            }
+            if (!$this->withinQuota($url)) {
+                //we're not allowed to schedule $url till next hour
+                $delete_urls[$i] = $url;
+                //delete from queue (so no clog) but don't mark seen
+                $i++;
+                continue;
+            }
+            //each host has two entries in $this->waiting_hosts
+            $num_waiting = floor(count($this->waiting_hosts)/2);
+            if ($delay > 0) {
+                // handle adding a url if there is a crawl delay
+                $hash_host = L\crawlHash($host_url);
+                $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
+                /*
+                  To ensure that crawl-delay isn't violated by two separate
+                  fetchers crawling the same host, if a host has a crawl
+                  delay we only let it appear in one outstanding schedule
+                  at a time. When data appears back from the fetcher handling
+                  a crawl-delayed host, we'll clear it to appear in another
+                  schedule
+                 */
+                if ((!$is_waiting_host
+                    && $num_waiting < C\MAX_WAITING_HOSTS) ||
+                    $is_waiting_host && $this->waiting_hosts[$hash_host] ==
+                    $schedule_time) {
+                    $this->waiting_hosts[$hash_host] =
+                       $schedule_time;
+                    $this->waiting_hosts[$schedule_time][] =
+                        $hash_host;
+                    $request_batches_per_delay =
+                        ceil($delay/$time_per_request_guess);
+                    if (!isset($crawl_delay_hosts[$hash_host])) {
+                        $next_earliest_slot = $current_crawl_index;
+                        $crawl_delay_hosts[$hash_host] = $next_earliest_slot;
                     } else {
-                        $robots_okay = true;
-                    }
-                    if (!$this->allowedToCrawlSite($url) ||
-                        $this->disallowedToCrawlSite($url)) {
-                        /* This is checked when added to queue,
-                           we check again here in case allowed and disallowed
-                           sites have changed since then
-                         */
-                        $robots_okay = false;
+                        $next_earliest_slot = $crawl_delay_hosts[$hash_host]
+                            + $request_batches_per_delay
+                            * C\NUM_MULTI_CURL_PAGES;
                     }
-                    if (!$robots_okay) {
+                    if (($next_slot =
+                        $this->getEarliestSlot($next_earliest_slot,
+                            $sites)) < C\MAX_FETCH_SIZE) {
+                        $crawl_delay_hosts[$hash_host] = $next_slot;
                         $delete_urls[$i] = $url;
+                        $sites[$next_slot] = [$url, $weight, $delay];
                         $this->web_queue->addSeenUrlFilter($url);
-                        $i++;
-                        continue;
+                        /* we might miss some sites by marking them
+                           seen after only scheduling them
+                         */
+                        $fetch_size++;
+                    } else if ($no_flags) {
+                        $this->web_queue->setQueueFlag($url,
+                            $delay + WebQueueBundle::SCHEDULABLE);
                     }
-                    $delay = $this->web_queue->getCrawlDelay($host_url);
-                }
-                if (!$this->withinQuota($url)) {
-                    //we've not allowed to schedule $url till next hour
+                } else if (!$is_waiting_host) {
+                    // has crawl delay but too many already waiting
                     $delete_urls[$i] = $url;
                     //delete from queue (so no clog) but don't mark seen
                     $i++;
                     continue;
                 }
-                //each host has two entries in $this->waiting_hosts
-                $num_waiting = floor(count($this->waiting_hosts)/2);
-                if ($delay > 0 ) {
-                    // handle adding a url if there is a crawl delay
-                    $hash_host = L\crawlHash($host_url);
-                    $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
-                    if ((!$is_waiting_host
-                        && $num_waiting < C\MAX_WAITING_HOSTS) ||
-                        $is_waiting_host && $this->waiting_hosts[$hash_host] ==
-                        $schedule_time) {
-                        $this->waiting_hosts[$hash_host] =
-                           $schedule_time;
-                        $this->waiting_hosts[$schedule_time][] =
-                            $hash_host;
-                        $request_batches_per_delay =
-                            ceil($delay/$time_per_request_guess);
-                        if (!isset($crawl_delay_hosts[$hash_host])) {
-                            $next_earliest_slot = $current_crawl_index;
-                            $crawl_delay_hosts[$hash_host]= $next_earliest_slot;
-                        } else {
-                            $next_earliest_slot = $crawl_delay_hosts[$hash_host]
-                                + $request_batches_per_delay
-                                * C\NUM_MULTI_CURL_PAGES;
-                        }
-                        if (($next_slot =
-                            $this->getEarliestSlot( $next_earliest_slot,
-                                $sites)) < C\MAX_FETCH_SIZE) {
-                            $crawl_delay_hosts[$hash_host] = $next_slot;
-                            $delete_urls[$i] = $url;
-                            $sites[$next_slot] = [$url, $weight, $delay];
-                            $this->web_queue->addSeenUrlFilter($url);
-                            /* we might miss some sites by marking them
-                               seen after only scheduling them
-                             */
-                            $fetch_size++;
-                        } else if ($no_flags) {
-                            $this->web_queue->setQueueFlag($url,
-                                $delay + WebQueueBundle::SCHEDULABLE);
-                        }
-                    } else if (!$is_waiting_host) {
-                        // has crawl delay but too many already waiting
-                        $delete_urls[$i] = $url;
-                        //delete from queue (so no clog) but don't mark seen
-                        $i++;
-                        continue;
-                    }
-                } else { // add a url no crawl delay
-                    $next_slot = $this->getEarliestSlot(
-                        $current_crawl_index, $sites);
-                    if ($next_slot < C\MAX_FETCH_SIZE) {
-                        $sites[$next_slot] = [$url, $weight, 0];
-                        $delete_urls[$i] = $url;
-                        $this->web_queue->addSeenUrlFilter($url);
-                            /* we might miss some sites by marking them
-                               seen after only scheduling them
-                             */
-                        $current_crawl_index = $next_slot;
-                        $fetch_size++;
-                    } else { //no more available slots so prepare to bail
-                        $i = $count;
-                        if ($no_flags) {
-                            $this->web_queue->setQueueFlag($url,
-                                WebQueueBundle::SCHEDULABLE);
-                        }
+            } else { // add a url no crawl delay
+                $next_slot = $this->getEarliestSlot($current_crawl_index,
+                    $sites);
+                if ($next_slot < C\MAX_FETCH_SIZE) {
+                    $sites[$next_slot] = [$url, $weight, 0];
+                    $delete_urls[$i] = $url;
+                    $this->web_queue->addSeenUrlFilter($url);
+                        /* we might miss some sites by marking them
+                           seen after only scheduling them
+                         */
+                    $current_crawl_index = $next_slot;
+                    $fetch_size++;
+                } else { //no more available slots so prepare to bail
+                    $i = $count;
+                    if ($no_flags) {
+                        $this->web_queue->setQueueFlag($url,
+                            WebQueueBundle::SCHEDULABLE);
                     }
-                } //if delay else
-            } // if containsGotRobotTxt
-            // handle robots.txt urls
+                }
+            } //no crawl-delay else
             $i++;
         } //end while
         $this->web_queue->closeUrlArchive($fh);
         $new_time = microtime(true);
         L\crawlLog("...Scheduler: Done selecting URLS for fetch batch time ".
             "so far:". L\changeInMicrotime($start_time));
+        L\crawlLog("...Scheduler: Examined urls while making fetch batch: $i");
         $num_deletes = count($delete_urls);
         $k = 0;
         foreach ($delete_urls as $delete_url) {
@@ -2669,7 +2678,7 @@ class QueueServer implements CrawlConstants, Join
             }
         }
         L\crawlLog("...Scheduler: Removed $k URLS for fetch batch from ".
-            "queue in time: ".L\changeInMicrotime($new_time));
+            "queue in time: " . L\changeInMicrotime($new_time));
         $new_time = microtime(true);
         if (isset($sites) && count($sites) > 0 ) {
             $dummy_slot = [self::DUMMY, 0.0, 0];
@@ -2693,7 +2702,7 @@ class QueueServer implements CrawlConstants, Join
             //write schedule to disk
             $fh = fopen(C\CRAWL_DIR.
                 "/schedules/".
-                self::schedule_name.$this->crawl_time.".txt", "wb");
+                self::schedule_name.$this->crawl_time . ".txt", "wb");
             fwrite($fh, $first_line);
             $num_sites = count($sites);
             $k = 0;
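
The QueueServer hunk above is mostly a re-indent: the old `if ($has_robots) { ... }` wrapper becomes an early `$i++; continue;` guard, flattening the crawl-delay logic by one level. The invariant that logic maintains, per its comments, is that a crawl-delayed host appears in at most one outstanding schedule, tracked with two entries per host in $waiting_hosts (host-to-schedule and schedule-to-hosts). Below is a minimal sketch of that bookkeeping, not Yioop's actual code; names are modeled on the diff and MAX_WAITING_HOSTS is an assumed stand-in for C\MAX_WAITING_HOSTS.

    <?php
    const MAX_WAITING_HOSTS = 250; // assumed value, not Yioop's

    /**
     * Try to reserve $hash_host for the schedule made at $schedule_time.
     * Two entries are stored per host, which is why the scheduler above
     * computes floor(count($waiting_hosts) / 2) for the number waiting.
     */
    function markHostWaiting(array &$waiting_hosts, string $hash_host,
        int $schedule_time): bool
    {
        $num_waiting = floor(count($waiting_hosts) / 2);
        $is_waiting_host = isset($waiting_hosts[$hash_host]);
        /* A crawl-delayed host may sit in only one outstanding schedule:
           either it is not waiting yet (and there is room), or it is
           already waiting on this same schedule. */
        if ((!$is_waiting_host && $num_waiting < MAX_WAITING_HOSTS) ||
            ($is_waiting_host &&
            $waiting_hosts[$hash_host] == $schedule_time)) {
            $waiting_hosts[$hash_host] = $schedule_time;   // host => schedule
            $waiting_hosts[$schedule_time][] = $hash_host; // schedule => hosts
            return true;
        }
        return false;
    }

    /* When a fetcher returns results for a schedule, both kinds of
       entries are cleared so its hosts can appear in later schedules. */
    function releaseSchedule(array &$waiting_hosts, int $schedule_time): void
    {
        foreach ($waiting_hosts[$schedule_time] ?? [] as $hash_host) {
            unset($waiting_hosts[$hash_host]);
        }
        unset($waiting_hosts[$schedule_time]);
    }
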
diff --git a/src/library/MailServer.php b/src/library/MailServer.php
index e0b5783bc..c3fc121b4 100644
--- a/src/library/MailServer.php
+++ b/src/library/MailServer.php
@@ -37,7 +37,7 @@ use seekquarry\yioop\library\MediaConstants;
 /**
  * Timing functions
  */
-require_once __DIR__."/Utility.php";
+require_once __DIR__ . "/Utility.php";
 /**
  * A small class for communicating with an SMTP server. Used to avoid
  * configuration issues that might be needed with PHP's built-in mail()
@@ -144,7 +144,7 @@ class MailServer implements MediaConstants
             "dev.null";
         $this->server = $server;
         if ($secure == "ssl") {
-            'ssl://'.$server;
+            'ssl://' . $server;
         }
         $this->port = $port;
         $this->login = $login;
@@ -239,7 +239,7 @@ class MailServer implements MediaConstants
         return $this->readResponseGetCode();
     }
     /**
-     * Sends (or queues for media updater)an email
+     * Sends (or queues for media updater) an email
      * (much like PHP's mail command, but not requiring
      * a configured smtp server on the current machine)
      *
@@ -340,14 +340,14 @@ class MailServer implements MediaConstants
                 webExit();
             }
         }
-        $files = glob($mail_directory."/*.txt");
+        $files = glob($mail_directory . "/*.txt");
         $file_count = count($files);
         $current_count = 0;
         $current_time = time();
         $diff = 0;
         if ($file_count > 0) {
             $file = end($files);
-            $file_name = str_replace($mail_directory."/", "", $file);
+            $file_name = str_replace($mail_directory . "/", "", $file);
             $last_file_time = substr($file_name, 0, -4);
             $diff = $current_time - $last_file_time;
         }
@@ -371,7 +371,7 @@ class MailServer implements MediaConstants
                     " for $file_time.txt!\n");
             }
         } else {
-            $fp = fopen($mail_directory."/".$last_file_time.".txt", "a+");
+            $fp = fopen($mail_directory . "/" . $last_file_time . ".txt", "a+");
             if (flock($fp, LOCK_EX | LOCK_NB)) {
                 crawlLog("....Lock acquired! Sending emails now!\n");
                 fwrite($fp, $mail_details);
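
The MailServer changes are spacing cleanups around string concatenation, but the last hunk also shows the pattern the queued-mail path relies on: emails are appended to a timestamped spool file under a non-blocking exclusive lock, so whichever process wins the lock sends, and everyone else moves on. A stand-alone sketch of that flock pattern follows; the spool directory and message body here are illustrative, and Yioop derives $last_file_time from its newest spool file rather than time().

    <?php
    $mail_directory = sys_get_temp_dir() . "/mail_queue"; // placeholder path
    if (!file_exists($mail_directory)) {
        mkdir($mail_directory, 0777, true);
    }
    $last_file_time = time();
    $mail_details = "To: user@example.com ...\n"; // placeholder payload

    $fp = fopen($mail_directory . "/" . $last_file_time . ".txt", "a+");
    /* LOCK_NB makes flock() return at once instead of blocking, so a
       second media-updater process simply skips this batch rather than
       stalling behind the lock holder. */
    if (flock($fp, LOCK_EX | LOCK_NB)) {
        fwrite($fp, $mail_details);
        fflush($fp);
        flock($fp, LOCK_UN);
    }
    fclose($fp);
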
diff --git a/src/library/media_jobs/BulkEmailJob.php b/src/library/media_jobs/BulkEmailJob.php
index c3511e8dc..b5dc6a79a 100644
--- a/src/library/media_jobs/BulkEmailJob.php
+++ b/src/library/media_jobs/BulkEmailJob.php
@@ -94,7 +94,7 @@ class BulkEmailJob extends MediaJob
         if (!$sendable_file) {
             return;
         }
-        L\crawlLog("Using Mail Directory:". $mail_directory);
+        L\crawlLog("Using Mail Directory:" . $mail_directory);
         $emails_string = file_get_contents($sendable_file);
         unlink($sendable_file);
         $emails = explode(self::MESSAGE_SEPARATOR, $emails_string);
@@ -150,7 +150,7 @@ class BulkEmailJob extends MediaJob
         if (!file_exists($mail_directory)) {
             return false;
         }
-        $files = glob($mail_directory."/*.txt");
+        $files = glob($mail_directory . "/*.txt");
         $sendable_file = false;
         foreach ($files as $email_file) {
             if (time() - filemtime($email_file) >
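
The BulkEmailJob hunk is the matching consumer: it globs the spool directory for .txt files and treats one as sendable once its modification time is old enough that no MailServer process should still be appending to it. The hunk is cut off before the age constant appears, so in this sketch MAIL_AGE_THRESHOLD is an assumed stand-in, and the pick-the-candidate logic is one plausible reading of the truncated loop.

    <?php
    const MAIL_AGE_THRESHOLD = 300; // seconds; assumed value

    function findSendableFile(string $mail_directory)
    {
        if (!file_exists($mail_directory)) {
            return false;
        }
        $files = glob($mail_directory . "/*.txt");
        $sendable_file = false;
        foreach ($files as $email_file) {
            /* A spool file no one has written to for longer than the
               threshold is assumed complete and safe to send. */
            if (time() - filemtime($email_file) > MAIL_AGE_THRESHOLD) {
                $sendable_file = $email_file;
            }
        }
        return $sendable_file;
    }
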
diff --git a/src/scripts/suggest.js b/src/scripts/suggest.js
index 52b2ee2ac..684135a44 100644
--- a/src/scripts/suggest.js
+++ b/src/scripts/suggest.js
@@ -733,7 +733,7 @@ function spellCheck()
         if (query.length > MIN_SPELL_CHECK_WIDTH) {
             return;
         }
-        if (corrected_query.trim() != query) {
+        if (corrected_query.trim() != query.toLowerCase()) {
             if (logged_in) {
                 var token_name = csrf_name;
                 var spell_link = "?" + token_name + "=" + csrf_token + "&q="
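
The one-line suggest.js change is the toLowerCase fix named in the commit message: the spell-corrected query comes back lowercased, so the raw query must be lowercased too before the did-anything-change comparison, or any mixed-case query looks misspelled. A PHP analogue of the same check, illustrative only (the shipped fix is the JavaScript line above):

    <?php
    /* Returns true only when the spell checker actually changed the
       query, once case differences are ruled out. */
    function wasCorrected(string $corrected_query, string $query): bool
    {
        return trim($corrected_query) != strtolower($query);
    }

    // "Photo" is spelled fine; without strtolower a naive comparison of
    // "photo" != "Photo" would wrongly offer a spelling suggestion.
    var_dump(wasCorrected("photo", "Photo")); // bool(false)
    var_dump(wasCorrected("photo", "fotoh")); // bool(true)
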