[autotest] Send an email if a drone is hitting process limit

Add a setting 'max_processes_warning_threshold' to global_config.ini.
If the ratio of active processes to max processes reaches or exceeds
the threshold, DroneManager will send an email to
chromeos-lab-infrastructure@.

To prevent it from spamming our mailing list, only one email
will be sent for each drone within 24 hours.

BUG=chromium:277184
TEST=Test locally and confirm email is sent out when
the threshold is reached.
DEPLOY=scheduler

Change-Id: Id4a883ff6c26e9bba384974c255a0ce0f3cb4056
Reviewed-on: https://chromium-review.googlesource.com/168147
Reviewed-by: Dan Shi <dshi@chromium.org>
Reviewed-by: Alex Miller <milleral@chromium.org>
Tested-by: Fang Deng <fdeng@chromium.org>
Commit-Queue: Fang Deng <fdeng@chromium.org>
diff --git a/global_config.ini b/global_config.ini
index bdc634d..9dedf31 100644
--- a/global_config.ini
+++ b/global_config.ini
@@ -76,6 +76,7 @@
 notify_email_from: chromeos-autotest@google.com
 notify_email_statuses: Completed,Failed,Aborted
 max_processes_per_drone: 1000
+max_processes_warning_threshold: 0.8
 max_hostless_processes: 500
 max_jobs_started_per_cycle: 100
 max_parse_processes: 100
diff --git a/scheduler/drone_manager.py b/scheduler/drone_manager.py
index 0550f0f..520dff7 100644
--- a/scheduler/drone_manager.py
+++ b/scheduler/drone_manager.py
@@ -137,6 +137,13 @@
     All paths going into and out of this class are relative to the full results
     directory, except for those returns by absolute_path().
     """
+
+
+    # Minimum time to wait before next email
+    # about a drone hitting process limit is sent.
+    NOTIFY_INTERVAL = 60 * 60 * 24 # one day
+
+
     def __init__(self):
         # absolute path of base results dir
         self._results_dir = None
@@ -159,6 +166,9 @@
         self._attached_files = {}
         # heapq of _DroneHeapWrappers
         self._drone_queue = []
+        # map drone hostname to time stamp of email that
+        # has been sent about the drone hitting process limit.
+        self._notify_record = {}
 
 
     def initialize(self, base_results_dir, drone_hostnames,
@@ -243,6 +253,8 @@
             drone.allowed_users = allowed_users
 
         self._reorder_drone_queue() # max_processes may have changed
+        # Clear notification record about reaching max_processes limit.
+        self._notify_record = {}
 
 
     def get_drones(self):
@@ -348,6 +360,28 @@
                     drone.active_processes += info.num_processes
 
 
+    def _check_drone_process_limit(self, drone):
+        """
+        Notify if the number of processes on |drone| is approaching limit.
+
+        @param drone: A Drone object.
+        """
+        percent = float(drone.active_processes) / drone.max_processes
+        max_percent = scheduler_config.config.max_processes_warning_threshold
+        if percent >= max_percent:
+            message = ('Drone %s is hitting %s of process limit.' %
+                       (drone.hostname, format(percent, '.2%')))
+            logging.warning(message)
+            last_notified = self._notify_record.get(drone.hostname, 0)
+            now = time.time()
+            if last_notified + BaseDroneManager.NOTIFY_INTERVAL < now:
+                body = ('Active processes/Process limit: %d/%d (%s)' %
+                        (drone.active_processes, drone.max_processes,
+                         format(percent, '.2%')))
+                email_manager.manager.enqueue_notify_email(message, body)
+                self._notify_record[drone.hostname] = now
+
+
     def refresh(self):
         """
         Called at the beginning of a scheduler cycle to refresh all process
@@ -377,6 +411,7 @@
             self._compute_active_processes(drone)
             if drone.enabled:
                 self._enqueue_drone(drone)
+                self._check_drone_process_limit(drone)
 
 
     def execute_actions(self):
diff --git a/scheduler/scheduler_config.py b/scheduler/scheduler_config.py
index 91c8fb1..3dfb927 100644
--- a/scheduler/scheduler_config.py
+++ b/scheduler/scheduler_config.py
@@ -8,6 +8,8 @@
     Contains configuration that can be changed during scheduler execution.
     """
     FIELDS = {'max_processes_per_drone': 'max_processes_per_drone',
+              'max_processes_warning_threshold':
+                  'max_processes_warning_threshold',
               'max_processes_started_per_cycle': 'max_jobs_started_per_cycle',
               'clean_interval': 'clean_interval_minutes',
               'max_parse_processes': 'max_parse_processes',
@@ -31,9 +33,13 @@
         config = global_config.global_config
         config.parse_config_file()
         for field, config_option in self.FIELDS.iteritems():
+            if field == 'max_processes_warning_threshold':
+                data_type = float
+            else:
+                data_type = int
             setattr(self, field, config.get_config_value(CONFIG_SECTION,
                                                          config_option,
-                                                         type=int))
+                                                         type=data_type))
 
 
 config = SchedulerConfig()