autotest: delete some email alerts; replace some with monarch metrics. For email alerts that seem (based on searching my email) to never be sent, I simply deleted them. For those that are sent sometimes and seem easily amenable to a monarch metric instead, I changed them to a metric. This is a first step; there are still many remaining unnecessary email alerts. BUG=chromium:672726 TEST=None Change-Id: Ib1d3715e618623faa16f3faaceabf4218dbad49a Reviewed-on: https://chromium-review.googlesource.com/420468 Commit-Ready: Aviv Keshet <akeshet@chromium.org> Tested-by: Aviv Keshet <akeshet@chromium.org> Reviewed-by: Aviv Keshet <akeshet@chromium.org>

commit: c29b4c7ec10db41f38e0361febe9846a95629b5a [log] [tgz]
author: Aviv Keshet <akeshet@chromium.org> Wed Dec 14 22:27:35 2016 -0800
committer: chrome-bot <chrome-bot@chromium.org> Mon Dec 19 12:24:43 2016 -0800
tree: 682286a0a00db657cf3a57cd6442faa20fe19516
parent: 383ef9c9c6cc3c08f82fbcfa43b1f3f492283321 [diff]
diff --git a/global_config.ini b/global_config.ini
index 51f22b7..0079dd2 100644
--- a/global_config.ini
+++ b/global_config.ini

@@ -183,7 +183,6 @@
 notify_email_from: chromeos-autotest@google.com
 notify_email_statuses: Completed,Failed,Aborted
 max_processes_per_drone: 1000
-max_processes_warning_threshold: 0.8
 max_parse_processes: 100
 max_transfer_processes: 50
 tick_pause_sec: 5

diff --git a/scheduler/drone_manager.py b/scheduler/drone_manager.py
index f44865c..fb0aad2 100644
--- a/scheduler/drone_manager.py
+++ b/scheduler/drone_manager.py

@@ -1,14 +1,12 @@
 import heapq
 import os
-import time
-import traceback
 import logging
 
 from chromite.lib import metrics
 
 import common
 from autotest_lib.client.common_lib import error, global_config, utils
-from autotest_lib.scheduler import email_manager, drone_utility, drones
+from autotest_lib.scheduler import drone_utility, drones
 from autotest_lib.scheduler import drone_task_queue
 from autotest_lib.scheduler import scheduler_config
 from autotest_lib.scheduler import thread_lib
@@ -183,9 +181,6 @@
         self._attached_files = {}
         # heapq of _DroneHeapWrappers
         self._drone_queue = []
-        # map drone hostname to time stamp of email that
-        # has been sent about the drone hitting process limit.
-        self._notify_record = {}
         # A threaded task queue used to refresh drones asynchronously.
         if _THREADED_DRONE_MANAGER:
             self._refresh_task_queue = thread_lib.ThreadedTaskQueue(
@@ -437,20 +432,8 @@
             percent = float(drone.active_processes) / drone.max_processes
         except ZeroDivisionError:
             percent = 100
-        max_percent = scheduler_config.config.max_processes_warning_threshold
-        if percent >= max_percent:
-            message = ('Drone %s is hitting %s of process limit.' %
-                       (drone.hostname, format(percent, '.2%')))
-            logging.warning(message)
-            last_notified = self._notify_record.get(drone.hostname, 0)
-            now = time.time()
-            if last_notified + BaseDroneManager.NOTIFY_INTERVAL < now:
-                body = ('Active processes/Process limit: %d/%d (%s)' %
-                        (drone.active_processes, drone.max_processes,
-                         format(percent, '.2%')))
-                email_manager.manager.enqueue_notify_email(message, body)
-                self._notify_record[drone.hostname] = now
-
+        metrics.Gauge('chromeos/autotest/drone/active_process_percentage'
+                      ).set(percent, fields={'drone_hostname': drone.hostname})
 
     def trigger_refresh(self):
         """Triggers a drone manager refresh.
@@ -564,10 +547,9 @@
         try:
             self._results_drone.execute_queued_calls()
         except error.AutoservError:
-            warning = ('Results repository failed to execute calls:\n' +
-                       traceback.format_exc())
-            email_manager.manager.enqueue_notify_email(
-                'Results repository error', warning)
+            m = 'chromeos/autotest/errors/results_repository_failed'
+            metrics.Counter(m).increment(
+                fields={'drone_hostname': self._results_drone.hostname})
             self._results_drone.clear_call_queue()
 
 

diff --git a/scheduler/monitor_db.py b/scheduler/monitor_db.py
index 0be6292..0dbce9a 100755
--- a/scheduler/monitor_db.py
+++ b/scheduler/monitor_db.py

@@ -661,12 +661,13 @@
 
 
     def _check_for_remaining_orphan_processes(self, orphans):
+        m = 'chromeos/autotest/errors/unrecovered_orphan_processes'
+        metrics.Gauge(m).set(len(orphans))
+
         if not orphans:
             return
         subject = 'Unrecovered orphan autoserv processes remain'
         message = '\n'.join(str(process) for process in orphans)
-        email_manager.manager.enqueue_notify_email(subject, message)
-
         die_on_orphans = global_config.global_config.get_config_value(
             scheduler_config.CONFIG_SECTION, 'die_on_orphans', type=bool)
 
@@ -811,13 +812,6 @@
         """
         if self.host_has_agent(host):
             host_agent_task = list(self._host_agents.get(host.id))[0].task
-            subject = 'Host with agents assigned to an HQE'
-            message = ('HQE: %s assigned host %s, but the host has '
-                       'agent: %s for queue_entry %s. The HQE '
-                       'will have to try and acquire a host next tick ' %
-                       (queue_entry, host.hostname, host_agent_task,
-                        host_agent_task.queue_entry))
-            email_manager.manager.enqueue_notify_email(subject, message)
         else:
             self._host_scheduler.schedule_host_job(host, queue_entry)
 
@@ -854,15 +848,10 @@
         metrics.Counter(
             'chromeos/autotest/scheduler/scheduled_jobs_hostless'
         ).increment_by(new_hostless_jobs)
+
         if not host_jobs:
             return
-        if not _inline_host_acquisition:
-            message = ('Found %s jobs that need hosts though '
-                       '_inline_host_acquisition=%s. Will acquire hosts.' %
-                       ([str(job) for job in host_jobs],
-                        _inline_host_acquisition))
-            email_manager.manager.enqueue_notify_email(
-                    'Processing unexpected host acquisition requests', message)
+
         jobs_with_hosts = self._host_scheduler.find_hosts_for_jobs(host_jobs)
         for host_assignment in jobs_with_hosts:
             self._schedule_host_job(host_assignment.host, host_assignment.job)

diff --git a/scheduler/monitor_db_cleanup.py b/scheduler/monitor_db_cleanup.py
index 51d3567..fd5d73b 100644
--- a/scheduler/monitor_db_cleanup.py
+++ b/scheduler/monitor_db_cleanup.py

@@ -147,13 +147,10 @@
                 first_model, first_field, second_model, second_field))
 
         if errors:
-            subject = ('%s relationships to invalid models, cleaned all' %
-                       len(errors))
-            message = '\n'.join(errors)
-            logging.warning(subject)
-            logging.warning(message)
-            email_manager.manager.enqueue_notify_email(subject, message)
-
+            m = 'chromeos/autotest/scheduler/cleanup/invalid_models_cleaned'
+            metrics.Counter(m).increment_by(len(errors))
+            logging.warn('Cleaned invalid models due to errors: %s'
+                         % ('\n'.join(errors)))
 
     def _clear_inactive_blocks(self):
         msg = 'Clear out blocks for all completed jobs.'

diff --git a/scheduler/scheduler_config.py b/scheduler/scheduler_config.py
index 63e4589..69e476f 100644
--- a/scheduler/scheduler_config.py
+++ b/scheduler/scheduler_config.py

@@ -9,7 +9,6 @@
     """
     FIELDS = [
                 ('max_processes_per_drone', int),
-                ('max_processes_warning_threshold', float),
                 ('clean_interval_minutes', int),
                 ('max_parse_processes', int),
                 ('tick_pause_sec', float),
commit	c29b4c7ec10db41f38e0361febe9846a95629b5a	[log] [tgz]
author	Aviv Keshet <akeshet@chromium.org>	Wed Dec 14 22:27:35 2016 -0800
committer	chrome-bot <chrome-bot@chromium.org>	Mon Dec 19 12:24:43 2016 -0800
tree	682286a0a00db657cf3a57cd6442faa20fe19516
parent	383ef9c9c6cc3c08f82fbcfa43b1f3f492283321 [diff]