autotest: delete some email alerts; replace some with monarch metrics
For email alerts that seem (based on searching my email) to never be
sent, I simply deleted them.
For those that are sent sometimes and seem easily amenable to a monarch
metric instead, I changed them to a metric.
This is a first step; there are still many remaining unnecessary email
alerts.
BUG=chromium:672726
TEST=None
Change-Id: Ib1d3715e618623faa16f3faaceabf4218dbad49a
Reviewed-on: https://chromium-review.googlesource.com/420468
Commit-Ready: Aviv Keshet <akeshet@chromium.org>
Tested-by: Aviv Keshet <akeshet@chromium.org>
Reviewed-by: Aviv Keshet <akeshet@chromium.org>
diff --git a/global_config.ini b/global_config.ini
index 51f22b7..0079dd2 100644
--- a/global_config.ini
+++ b/global_config.ini
@@ -183,7 +183,6 @@
notify_email_from: chromeos-autotest@google.com
notify_email_statuses: Completed,Failed,Aborted
max_processes_per_drone: 1000
-max_processes_warning_threshold: 0.8
max_parse_processes: 100
max_transfer_processes: 50
tick_pause_sec: 5
diff --git a/scheduler/drone_manager.py b/scheduler/drone_manager.py
index f44865c..fb0aad2 100644
--- a/scheduler/drone_manager.py
+++ b/scheduler/drone_manager.py
@@ -1,14 +1,12 @@
import heapq
import os
-import time
-import traceback
import logging
from chromite.lib import metrics
import common
from autotest_lib.client.common_lib import error, global_config, utils
-from autotest_lib.scheduler import email_manager, drone_utility, drones
+from autotest_lib.scheduler import drone_utility, drones
from autotest_lib.scheduler import drone_task_queue
from autotest_lib.scheduler import scheduler_config
from autotest_lib.scheduler import thread_lib
@@ -183,9 +181,6 @@
self._attached_files = {}
# heapq of _DroneHeapWrappers
self._drone_queue = []
- # map drone hostname to time stamp of email that
- # has been sent about the drone hitting process limit.
- self._notify_record = {}
# A threaded task queue used to refresh drones asynchronously.
if _THREADED_DRONE_MANAGER:
self._refresh_task_queue = thread_lib.ThreadedTaskQueue(
@@ -437,20 +432,8 @@
percent = float(drone.active_processes) / drone.max_processes
except ZeroDivisionError:
percent = 100
- max_percent = scheduler_config.config.max_processes_warning_threshold
- if percent >= max_percent:
- message = ('Drone %s is hitting %s of process limit.' %
- (drone.hostname, format(percent, '.2%')))
- logging.warning(message)
- last_notified = self._notify_record.get(drone.hostname, 0)
- now = time.time()
- if last_notified + BaseDroneManager.NOTIFY_INTERVAL < now:
- body = ('Active processes/Process limit: %d/%d (%s)' %
- (drone.active_processes, drone.max_processes,
- format(percent, '.2%')))
- email_manager.manager.enqueue_notify_email(message, body)
- self._notify_record[drone.hostname] = now
-
+ metrics.Gauge('chromeos/autotest/drone/active_process_percentage'
+ ).set(percent, fields={'drone_hostname': drone.hostname})
def trigger_refresh(self):
"""Triggers a drone manager refresh.
@@ -564,10 +547,9 @@
try:
self._results_drone.execute_queued_calls()
except error.AutoservError:
- warning = ('Results repository failed to execute calls:\n' +
- traceback.format_exc())
- email_manager.manager.enqueue_notify_email(
- 'Results repository error', warning)
+ m = 'chromeos/autotest/errors/results_repository_failed'
+ metrics.Counter(m).increment(
+ fields={'drone_hostname': self._results_drone.hostname})
self._results_drone.clear_call_queue()
diff --git a/scheduler/monitor_db.py b/scheduler/monitor_db.py
index 0be6292..0dbce9a 100755
--- a/scheduler/monitor_db.py
+++ b/scheduler/monitor_db.py
@@ -661,12 +661,13 @@
def _check_for_remaining_orphan_processes(self, orphans):
+ m = 'chromeos/autotest/errors/unrecovered_orphan_processes'
+ metrics.Gauge(m).set(len(orphans))
+
if not orphans:
return
subject = 'Unrecovered orphan autoserv processes remain'
message = '\n'.join(str(process) for process in orphans)
- email_manager.manager.enqueue_notify_email(subject, message)
-
die_on_orphans = global_config.global_config.get_config_value(
scheduler_config.CONFIG_SECTION, 'die_on_orphans', type=bool)
@@ -811,13 +812,6 @@
"""
if self.host_has_agent(host):
host_agent_task = list(self._host_agents.get(host.id))[0].task
- subject = 'Host with agents assigned to an HQE'
- message = ('HQE: %s assigned host %s, but the host has '
- 'agent: %s for queue_entry %s. The HQE '
- 'will have to try and acquire a host next tick ' %
- (queue_entry, host.hostname, host_agent_task,
- host_agent_task.queue_entry))
- email_manager.manager.enqueue_notify_email(subject, message)
else:
self._host_scheduler.schedule_host_job(host, queue_entry)
@@ -854,15 +848,10 @@
metrics.Counter(
'chromeos/autotest/scheduler/scheduled_jobs_hostless'
).increment_by(new_hostless_jobs)
+
if not host_jobs:
return
- if not _inline_host_acquisition:
- message = ('Found %s jobs that need hosts though '
- '_inline_host_acquisition=%s. Will acquire hosts.' %
- ([str(job) for job in host_jobs],
- _inline_host_acquisition))
- email_manager.manager.enqueue_notify_email(
- 'Processing unexpected host acquisition requests', message)
+
jobs_with_hosts = self._host_scheduler.find_hosts_for_jobs(host_jobs)
for host_assignment in jobs_with_hosts:
self._schedule_host_job(host_assignment.host, host_assignment.job)
diff --git a/scheduler/monitor_db_cleanup.py b/scheduler/monitor_db_cleanup.py
index 51d3567..fd5d73b 100644
--- a/scheduler/monitor_db_cleanup.py
+++ b/scheduler/monitor_db_cleanup.py
@@ -147,13 +147,10 @@
first_model, first_field, second_model, second_field))
if errors:
- subject = ('%s relationships to invalid models, cleaned all' %
- len(errors))
- message = '\n'.join(errors)
- logging.warning(subject)
- logging.warning(message)
- email_manager.manager.enqueue_notify_email(subject, message)
-
+ m = 'chromeos/autotest/scheduler/cleanup/invalid_models_cleaned'
+ metrics.Counter(m).increment_by(len(errors))
+ logging.warn('Cleaned invalid models due to errors: %s'
+ % ('\n'.join(errors)))
def _clear_inactive_blocks(self):
msg = 'Clear out blocks for all completed jobs.'
diff --git a/scheduler/scheduler_config.py b/scheduler/scheduler_config.py
index 63e4589..69e476f 100644
--- a/scheduler/scheduler_config.py
+++ b/scheduler/scheduler_config.py
@@ -9,7 +9,6 @@
"""
FIELDS = [
('max_processes_per_drone', int),
- ('max_processes_warning_threshold', float),
('clean_interval_minutes', int),
('max_parse_processes', int),
('tick_pause_sec', float),