[autotest] Send an email if a drone is hitting process limit
Add a setting 'max_processes_warning_threshold' to global_config.ini.
If the ratio of active processes to max processes goes over
the threshold, DroneManager will send an email to
chromeos-lab-infrastructure@.
To prevent it from spamming our mailing list, only one email
will be sent for each drone within 24 hours.
BUG=chromium:277184
TEST=Test locally and confirm email is sent out when
the threshold is reached.
DEPLOY=scheduler
Change-Id: Id4a883ff6c26e9bba384974c255a0ce0f3cb4056
Reviewed-on: https://chromium-review.googlesource.com/168147
Reviewed-by: Dan Shi <dshi@chromium.org>
Reviewed-by: Alex Miller <milleral@chromium.org>
Tested-by: Fang Deng <fdeng@chromium.org>
Commit-Queue: Fang Deng <fdeng@chromium.org>
diff --git a/global_config.ini b/global_config.ini
index bdc634d..9dedf31 100644
--- a/global_config.ini
+++ b/global_config.ini
@@ -76,6 +76,7 @@
notify_email_from: chromeos-autotest@google.com
notify_email_statuses: Completed,Failed,Aborted
max_processes_per_drone: 1000
+max_processes_warning_threshold: 0.8
max_hostless_processes: 500
max_jobs_started_per_cycle: 100
max_parse_processes: 100
diff --git a/scheduler/drone_manager.py b/scheduler/drone_manager.py
index 0550f0f..520dff7 100644
--- a/scheduler/drone_manager.py
+++ b/scheduler/drone_manager.py
@@ -137,6 +137,13 @@
All paths going into and out of this class are relative to the full results
directory, except for those returns by absolute_path().
"""
+
+
+ # Minimum time to wait before next email
+ # about a drone hitting process limit is sent.
+ NOTIFY_INTERVAL = 60 * 60 * 24 # one day
+
+
def __init__(self):
# absolute path of base results dir
self._results_dir = None
@@ -159,6 +166,9 @@
self._attached_files = {}
# heapq of _DroneHeapWrappers
self._drone_queue = []
+ # map drone hostname to time stamp of email that
+ # has been sent about the drone hitting process limit.
+ self._notify_record = {}
def initialize(self, base_results_dir, drone_hostnames,
@@ -243,6 +253,8 @@
drone.allowed_users = allowed_users
self._reorder_drone_queue() # max_processes may have changed
+ # Clear notification record about reaching max_processes limit.
+ self._notify_record = {}
def get_drones(self):
@@ -348,6 +360,28 @@
drone.active_processes += info.num_processes
+ def _check_drone_process_limit(self, drone):
+ """
+ Notify if the number of processes on |drone| is approaching limit.
+
+ @param drone: A Drone object.
+ """
+ percent = float(drone.active_processes) / drone.max_processes
+ max_percent = scheduler_config.config.max_processes_warning_threshold
+ if percent >= max_percent:
+ message = ('Drone %s is hitting %s of process limit.' %
+ (drone.hostname, format(percent, '.2%')))
+ logging.warning(message)
+ last_notified = self._notify_record.get(drone.hostname, 0)
+ now = time.time()
+ if last_notified + BaseDroneManager.NOTIFY_INTERVAL < now:
+ body = ('Active processes/Process limit: %d/%d (%s)' %
+ (drone.active_processes, drone.max_processes,
+ format(percent, '.2%')))
+ email_manager.manager.enqueue_notify_email(message, body)
+ self._notify_record[drone.hostname] = now
+
+
def refresh(self):
"""
Called at the beginning of a scheduler cycle to refresh all process
@@ -377,6 +411,7 @@
self._compute_active_processes(drone)
if drone.enabled:
self._enqueue_drone(drone)
+ self._check_drone_process_limit(drone)
def execute_actions(self):
diff --git a/scheduler/scheduler_config.py b/scheduler/scheduler_config.py
index 91c8fb1..3dfb927 100644
--- a/scheduler/scheduler_config.py
+++ b/scheduler/scheduler_config.py
@@ -8,6 +8,8 @@
Contains configuration that can be changed during scheduler execution.
"""
FIELDS = {'max_processes_per_drone': 'max_processes_per_drone',
+ 'max_processes_warning_threshold':
+ 'max_processes_warning_threshold',
'max_processes_started_per_cycle': 'max_jobs_started_per_cycle',
'clean_interval': 'clean_interval_minutes',
'max_parse_processes': 'max_parse_processes',
@@ -31,9 +33,13 @@
config = global_config.global_config
config.parse_config_file()
for field, config_option in self.FIELDS.iteritems():
+ if field == 'max_processes_warning_threshold':
+ data_type = float
+ else:
+ data_type = int
setattr(self, field, config.get_config_value(CONFIG_SECTION,
config_option,
- type=int))
+ type=data_type))
config = SchedulerConfig()