[autotest] Report repair loops in inventory runs. This adds an option to the lab inventory script to detect and report DUTs stuck in repair loops. BUG=chromium:775199 TEST=run the script locally with --debug Change-Id: I2f7972054e632906207279c08a32bd691864eb56 Reviewed-on: https://chromium-review.googlesource.com/737070 Commit-Ready: Richard Barnette <jrbarnette@google.com> Tested-by: Richard Barnette <jrbarnette@google.com> Reviewed-by: Richard Barnette <jrbarnette@google.com>

commit: cf5d8346c98566f33923ac946845f6e1c7a4eb06 [log] [tgz]
author: Richard Barnette <jrbarnette@chromium.org> Tue Oct 24 18:13:11 2017 -0700
committer: chrome-bot <chrome-bot@chromium.org> Fri Dec 01 20:32:41 2017 -0800
tree: d505e3ada0306a65aaab6f5337beb361113046c8
parent: 78e829a75ca10d1fbcbeca836e7cf6bbc047960d [diff] [blame]
diff --git a/site_utils/lab_inventory.py b/site_utils/lab_inventory.py
index a5d9c40..7922844 100755
--- a/site_utils/lab_inventory.py
+++ b/site_utils/lab_inventory.py

@@ -29,14 +29,18 @@
     When generating the "board status" e-mail, included a list of
     <number> specific DUTs to be recommended for repair.
 
+--repair-loops
+    Scan the inventory for DUTs stuck in repair loops, and report them
+    via a Monarch presence metric.
+
 --logdir <directory>
     Log progress and actions in a file under this directory.  Text
     of any e-mail sent will also be logged in a timestamped file in
     this directory.
 
 --debug
-    Suppress all logging and sending e-mail.  Instead, write the
-    output that would be generated onto stdout.
+    Suppress all logging, metrics reporting, and sending e-mail.
+    Instead, write the output that would be generated onto stdout.
 
 <board> arguments:
     With no arguments, gathers the status for all boards in the lab.
@@ -58,12 +62,14 @@
 import common
 from autotest_lib.client.bin import utils
 from autotest_lib.client.common_lib import time_utils
+from autotest_lib.server import site_utils
 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
 from autotest_lib.server.hosts import servo_host
 from autotest_lib.server.lib import status_history
 from autotest_lib.site_utils import gmail_lib
 from autotest_lib.site_utils.suite_scheduler import constants
 from autotest_lib.utils import labellib
+from chromite.lib import metrics
 
 
 CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
@@ -86,10 +92,9 @@
 _DEFAULT_DURATION = 24
 
 # _LOGDIR:
-#     Relative path used in the calculation of the default setting
-#     for the --logdir option.  The full path path is relative to
-#     the root of the autotest directory, as determined from
-#     sys.argv[0].
+#     Relative path used in the calculation of the default setting for
+#     the --logdir option.  The full path is relative to the root of the
+#     autotest directory, as determined from sys.argv[0].
 # _LOGFILE:
 #     Basename of a file to which general log information will be
 #     written.
@@ -115,6 +120,12 @@
 
 _MANAGED_POOL_DEFAULT = 'all_pools'
 
+# _REPAIR_LOOP_THRESHOLD:
+#    The number of repeated Repair tasks that must be seen to declare
+#    that a DUT is stuck in a repair loop.
+
+_REPAIR_LOOP_THRESHOLD = 4
+
 
 class _CachedHostJobHistories(object):
     """Maintains a set of `HostJobHistory` objects for a pool.
@@ -1023,10 +1034,10 @@
 
     @param arguments  Command-line arguments as returned by
                       `ArgumentParser`
-    @param inventory  _LabInventory object with the inventory to
-                      be reported.
-    @param timestamp  A string used to identify this run's timestamp
-                      in logs and email output.
+    @param inventory  _LabInventory object with the inventory to be
+                      reported.
+    @param timestamp  A string used to identify this run's timestamp in
+                      logs and email output.
     """
     pool_message = _generate_pool_inventory_message(inventory)
     idle_message = _generate_idle_inventory_message(inventory)
@@ -1037,6 +1048,87 @@
                 pool_message + '\n\n\n' + idle_message)
 
 
+def _dut_in_repair_loop(history):
+    """Return whether a DUT's history indicates a repair loop.
+
+    A DUT is considered looping if it runs no tests, and no tasks pass
+    other than repair tasks.
+
+    @param history  An instance of `status_history.HostJobHistory` to be
+                    scanned for a repair loop.  The caller guarantees
+                    that this history corresponds to a working DUT.
+    @returns  Return a true value if the DUT's most recent history
+              indicates a repair loop.
+    """
+    # Our caller passes only histories for working DUTs; that means
+    # we've already paid the cost of fetching the diagnosis task, and
+    # we know that the task was successful.  The diagnosis task will be
+    # one of the tasks we must scan to find a loop, so if the task isn't
+    # a repair task, then our history includes a successful non-repair
+    # task, and we're not looping.
+    #
+    # The for loop below is very expensive, because it must fetch the
+    # full history, regardless of how many tasks we examine.  At the
+    # time of this writing, this check against the diagnosis task
+    # reduces the cost of finding loops in the full inventory from hours
+    # to minutes.
+    if history.last_diagnosis()[1].name != 'Repair':
+        return False
+    repair_ok_count = 0
+    for task in history:
+        if not task.is_special:
+            # This is a test, so we're not looping.
+            return False
+        if task.diagnosis == status_history.BROKEN:
+            # Failed a repair, so we're not looping.
+            return False
+        if (task.diagnosis == status_history.WORKING
+                and task.name != 'Repair'):
+            # Non-repair task succeeded, so we're not looping.
+            return False
+        # At this point, we have either a failed non-repair task, or
+        # a successful repair.
+        if task.name == 'Repair':
+            repair_ok_count += 1
+            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
+                return True
+
+
+def _perform_repair_loop_report(arguments, inventory):
+    """Scan the inventory for DUTs stuck in a repair loop.
+
+    This routine walks through the given inventory looking for DUTs
+    where the most recent history shows that the DUT is regularly
+    passing repair tasks, but has not run any tests.
+
+    @param arguments  Command-line arguments as returned by
+                      `ArgumentParser`
+    @param inventory  _LabInventory object with the inventory to be
+                      reported.
+    """
+    loop_presence = metrics.BooleanMetric(
+        'chromeos/autotest/inventory/repair_loops',
+        'DUTs stuck in repair loops')
+    logging.info('Scanning for DUTs in repair loops.')
+    for counts in inventory.by_board.itervalues():
+        for history in counts.get_working_list():
+            # Managed DUTs with names that don't match
+            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
+            # don't want arbitrary strings being attached to the
+            # 'dut_hostname' field, so for safety, we exclude all
+            # anomalies.
+            if not _HOSTNAME_PATTERN.match(history.hostname):
+                continue
+            if _dut_in_repair_loop(history):
+                fields = {'dut_hostname': history.hostname,
+                          'board': history.host_board,
+                          'pool': history.host_pool}
+                logging.info('Looping DUT: %(dut_hostname)s, '
+                             'board: %(board)s, pool: %(pool)s',
+                             fields)
+                loop_presence.set(True, fields=fields)
+
+
 def _log_startup(arguments, startup_time):
     """Log the start of this inventory run.
 
@@ -1077,6 +1169,28 @@
     return inventory
 
 
+def _perform_inventory_reports(arguments):
+    """Perform all inventory checks requested on the command line.
+
+    Create the initial inventory and run through the inventory reports
+    as called for by the parsed command-line arguments.
+
+    @param arguments  Command-line arguments as returned by
+                      `ArgumentParser`.
+    """
+    startup_time = time.time()
+    timestamp = _log_startup(arguments, startup_time)
+    inventory = _create_inventory(arguments, startup_time)
+    if arguments.debug:
+        _populate_board_counts(inventory)
+    if arguments.board_notify:
+        _perform_board_inventory(arguments, inventory, timestamp)
+    if arguments.pool_notify:
+        _perform_pool_inventory(arguments, inventory, timestamp)
+    if arguments.repair_loops:
+        _perform_repair_loop_report(arguments, inventory)
+
+
 def _separate_email_addresses(address_list):
     """Parse a list of comma-separated lists of e-mail addresses.
 
@@ -1174,6 +1288,8 @@
                         help=('Specify how many DUTs should be '
                               'recommended for repair (default: no '
                               'recommendation)'))
+    parser.add_argument('--repair-loops', action='store_true',
+                        help='Check for devices stuck in repair loops.')
     parser.add_argument('--debug', action='store_true',
                         help='Print e-mail messages on stdout '
                              'without sending them.')
@@ -1200,11 +1316,14 @@
         ~3 months worth of history.
       * With the option, we expect stdout to contain other
         human-readable output (including the contents of the e-mail
-        messages), so we restrict the output.
+        messages), so we restrict the output to INFO level.
+
+    For convenience, when `--debug` is on, the logging format has
+    no adornments, so that a call like `logging.info(msg)` simply writes
+    `msg` to stdout, plus a trailing newline.
 
     @param arguments  Command-line arguments as returned by
                       `ArgumentParser`
-
     """
     root_logger = logging.getLogger()
     if arguments.debug:
@@ -1233,22 +1352,20 @@
 
 def main(argv):
     """Standard main routine.
-    @param argv  Command line arguments including `sys.argv[0]`.
+
+    @param argv  Command line arguments, including `sys.argv[0]`.
     """
     arguments = _parse_command(argv)
     if not arguments:
         sys.exit(1)
     _configure_logging(arguments)
     try:
-        startup_time = time.time()
-        timestamp = _log_startup(arguments, startup_time)
-        inventory = _create_inventory(arguments, startup_time)
-        if arguments.debug:
-            _populate_board_counts(inventory)
-        if arguments.board_notify:
-            _perform_board_inventory(arguments, inventory, timestamp)
-        if arguments.pool_notify:
-            _perform_pool_inventory(arguments, inventory, timestamp)
+        if not arguments.debug:
+            with site_utils.SetupTsMonGlobalState(
+                    'repair_loops', short_lived=True, auto_flush=False):
+                _perform_inventory_reports(arguments)
+        else:
+            _perform_inventory_reports(arguments)
     except KeyboardInterrupt:
         pass
     except EnvironmentError as e:
commit	cf5d8346c98566f33923ac946845f6e1c7a4eb06	[log] [tgz]
author	Richard Barnette <jrbarnette@chromium.org>	Tue Oct 24 18:13:11 2017 -0700
committer	chrome-bot <chrome-bot@chromium.org>	Fri Dec 01 20:32:41 2017 -0800
tree	d505e3ada0306a65aaab6f5337beb361113046c8
parent	78e829a75ca10d1fbcbeca836e7cf6bbc047960d [diff] [blame]