[autotest] Update lab_inventory to report idle DUTs with metrics. This updates the lab_inventory script to report DUTs that are idle and unlocked via a new Monarch metric. The new metric supports reporting both idle DUTs and DUTs in repair loops. The old, single-purpose metric for reporting repair loops is being deprecated in favor of the new metric. CQ-DEPEND=CL:*605029 BUG=chromium:804625 TEST=run the inventory command locally Change-Id: Id4d5eba265902cb84072bd0141d9657bce0f47d0 Reviewed-on: https://chromium-review.googlesource.com/1003599 Commit-Ready: Richard Barnette <jrbarnette@chromium.org> Tested-by: Richard Barnette <jrbarnette@chromium.org> Reviewed-by: Allen Li <ayatane@chromium.org>

commit: 1ca30e67e846aa12e9d523763a3829b298b5ce1d [log] [tgz]
author: Richard Barnette <jrbarnette@chromium.org> Mon Apr 09 16:45:58 2018 -0700
committer: chrome-bot <chrome-bot@chromium.org> Fri Apr 13 15:47:52 2018 -0700
tree: 69dd771e334babe84889430bb266a3d35d7b80a0
parent: 16b74bef5599bc5cdaf305cde9fe3efde3ed973c [diff]
diff --git a/contrib/inventory_options b/contrib/inventory_options
index 67d1887..3d329b3 100644
--- a/contrib/inventory_options
+++ b/contrib/inventory_options

@@ -35,19 +35,19 @@
 # Options to be used for different script invocations.  Inventory
 # runs are relatively expensive, so operations that happen rarely
 # also bundle operations that happen more frequently.
-#   + REPAIR_LOOP_DETECT happens with every run.  It looks for
-#     and reports DUTs that do no work other than to fail, then repair
-#     successfully.
+#   + UNTESTABLE_DETECT happens with every run.  It looks for
+#     and reports DUTs that for never run tests even though their
+#     status indicates that they should.
 #   + MODEL_NOTIFY happens less often.  This adds a full model
 #     inventory count to REPAIR_LOOP_DETECT.
 #   + POOL_NOTIFY happens least often.  It adds per-pool inventory
 #     counts, as well as individual DUT repair recommendations to
 #     MODEL_NOTIFY.
 
-REPAIR_LOOP_DETECT=( --repair-loops )
+UNTESTABLE_DETECT=( --report-untestable )
 
 MODEL_NOTIFY=(
-  "${REPAIR_LOOP_DETECT[@]}"
+  "${UNTESTABLE_DETECT[@]}"
   --model-notify $(echo "${MODEL_INTEREST[@]}" | sed 's/ /,/g')
 )
 

diff --git a/contrib/run-loop-detection b/contrib/run-loop-detection
index 4a938cf..7938175 100755
--- a/contrib/run-loop-detection
+++ b/contrib/run-loop-detection

@@ -7,4 +7,4 @@
 cd $SCRIPT_DIR/..
 . contrib/inventory_options
 
-site_utils/lab_inventory.py $OPTIONS "${REPAIR_LOOP_DETECT[@]}"
+site_utils/lab_inventory.py $OPTIONS "${UNTESTABLE_DETECT[@]}"

diff --git a/site_utils/lab_inventory.py b/site_utils/lab_inventory.py
index 39503a9..26dadf6 100755
--- a/site_utils/lab_inventory.py
+++ b/site_utils/lab_inventory.py

@@ -29,9 +29,9 @@
     When generating the "model status" e-mail, include a list of
     <number> specific DUTs to be recommended for repair.
 
---repair-loops
-    Scan the inventory for DUTs stuck in repair loops, and report them
-    via a Monarch presence metric.
+--report-untestable
+    Scan the inventory for DUTs that can't test because they're stuck in
+    repair loops, or because the scheduler can't give them work.
 
 --logdir <directory>
     Log progress and actions in a file under this directory.  Text
@@ -122,6 +122,11 @@
 _REPAIR_LOOP_THRESHOLD = 4
 
 
+_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
+    'chromeos/autotest/inventory/untestable',
+    'DUTs that cannot be scheduled for testing')
+
+
 class _HostSetInventory(object):
     """Maintains a set of related `HostJobHistory` objects.
 
@@ -1015,7 +1020,7 @@
     # a repair task, then our history includes a successful non-repair
     # task, and we're not looping.
     #
-    # The for loop below  is very expensive, because it must fetch the
+    # The for loop below is very expensive, because it must fetch the
     # full history, regardless of how many tasks we examine.  At the
     # time of this writing, this check against the diagnosis task
     # reduces the cost of finding loops in the full inventory from hours
@@ -1042,17 +1047,29 @@
                 return True
 
 
-def _perform_repair_loop_report(arguments, inventory):
-    """Scan the inventory for DUTs stuck in a repair loop.
+def _report_untestable_dut(history, state):
+    fields = {
+        'dut_hostname': history.hostname,
+        'model': history.host_model,
+        'pool': history.host_pool,
+        'state': state,
+    }
+    logging.info('Untestable DUT: %(dut_hostname)s, model: %(model)s, '
+                 'pool: %(pool)s', fields)
+    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
 
-    This routine walks through the given inventory looking for DUTs
-    where the most recent history shows that the DUT is regularly
-    passing repair tasks, but has not run any tests.
 
-    @param arguments  Command-line arguments as returned by
-                      `ArgumentParser`
+def _report_repair_loop_metrics(inventory):
+    """Find and report DUTs stuck in a repair loop.
+
+    Go through `inventory`, and find and report any DUT identified as
+    being in a repair loop.
+
     @param inventory  `_LabInventory` object to be reported on.
     """
+    # TODO(jrbarnette)  The `repair_loops` metric is being deprecated,
+    # and should be deleted as soon as the `untestable` metric has a
+    # dashboard that's working reliably.
     loop_presence = metrics.BooleanMetric(
         'chromeos/autotest/inventory/repair_loops',
         'DUTs stuck in repair loops')
@@ -1067,13 +1084,59 @@
             if not _HOSTNAME_PATTERN.match(history.hostname):
                 continue
             if _dut_in_repair_loop(history):
-                fields = {'dut_hostname': history.hostname,
-                          'model': history.host_model,
-                          'pool': history.host_pool}
-                logging.info('Looping DUT: %(dut_hostname)s, '
-                             'model: %(model)s, pool: %(pool)s',
-                             fields)
+                fields = {
+                    'dut_hostname': history.hostname,
+                    'model': history.host_model,
+                    'pool': history.host_pool
+                }
                 loop_presence.set(True, fields=fields)
+                _report_untestable_dut(history, 'repair_loop')
+
+
+def _report_idle_dut_metrics(inventory):
+    """Find and report idle, unlocked DUTs.
+
+    Go through `inventory`, and find and report any DUT identified as
+    "idle" that is not also locked.
+
+    @param inventory  `_LabInventory` object to be reported on.
+    """
+    logging.info('Scanning for idle, unlocked DUTs.')
+    for counts in inventory.itervalues():
+        for history in counts.get_idle_list():
+            # Managed DUTs with names that don't match
+            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
+            # don't want arbitrary strings being attached to the
+            # 'dut_hostname' field, so for safety, we exclude all
+            # anomalies.
+            if not _HOSTNAME_PATTERN.match(history.hostname):
+                continue
+            if not history.host.locked:
+                _report_untestable_dut(history, 'idle_unlocked')
+
+
+def _report_untestable_dut_metrics(inventory):
+    """Scan the inventory for DUTs unable to run tests.
+
+    DUTs in the inventory are judged "untestable" if they meet one of
+    two criteria:
+      * The DUT is stuck in a repair loop; that is, it regularly passes
+        repair, but never passes other operations.
+      * The DUT runs no tasks at all, but is not locked.
+
+    This routine walks through the given inventory looking for DUTs in
+    either of these states.  Results are reported via a Monarch presence
+    metric.
+
+    Note:  To make sure that DUTs aren't flagged as "idle" merely
+    because there's no work, a separate job runs prior to regular
+    inventory runs which schedules trivial work on any DUT that appears
+    idle.
+
+    @param inventory  `_LabInventory` object to be reported on.
+    """
+    _report_repair_loop_metrics(inventory)
+    _report_idle_dut_metrics(inventory)
 
 
 def _log_startup(arguments, startup_time):
@@ -1134,8 +1197,8 @@
         _perform_model_inventory(arguments, inventory, timestamp)
     if arguments.pool_notify:
         _perform_pool_inventory(arguments, inventory, timestamp)
-    if arguments.repair_loops:
-        _perform_repair_loop_report(arguments, inventory)
+    if arguments.report_untestable:
+        _report_untestable_dut_metrics(inventory)
 
 
 def _separate_email_addresses(address_list):
@@ -1176,11 +1239,11 @@
     arguments.pool_notify = _separate_email_addresses(
             arguments.pool_notify)
     if not any([arguments.model_notify, arguments.pool_notify,
-                arguments.repair_loops]):
+                arguments.report_untestable]):
         if not arguments.debug:
             sys.stderr.write('Must request at least one report via '
                              '--model-notify, --pool-notify, or '
-                             '--repair-loops\n')
+                             '--report-untestable\n')
             return False
         else:
             # We want to run all the e-mail reports.  An empty notify
@@ -1237,8 +1300,8 @@
                         help=('Specify how many DUTs should be '
                               'recommended for repair (default: no '
                               'recommendation)'))
-    parser.add_argument('--repair-loops', action='store_true',
-                        help='Check for devices stuck in repair loops.')
+    parser.add_argument('--report-untestable', action='store_true',
+                        help='Check for devices unable to run tests.')
     parser.add_argument('--debug-metrics', action='store_true',
                         help='Include debug information about the metrics '
                              'that would be reported ')
@@ -1315,7 +1378,7 @@
         if arguments.debug_metrics or not arguments.debug:
             metrics_file = None if not arguments.debug_metrics else '/dev/null'
             with site_utils.SetupTsMonGlobalState(
-                    'repair_loops', debug_file=metrics_file,
+                    'lab_inventory', debug_file=metrics_file,
                     auto_flush=False):
                 _perform_inventory_reports(arguments)
             metrics.Flush()

diff --git a/site_utils/lab_inventory_unittest.py b/site_utils/lab_inventory_unittest.py
index 487fa88..5418f7e 100755
--- a/site_utils/lab_inventory_unittest.py
+++ b/site_utils/lab_inventory_unittest.py

@@ -1077,7 +1077,9 @@
 
     # At least one of these options must be specified on every command
     # line; otherwise, the command line parsing will fail.
-    _REPORT_OPTIONS = ['--model-notify=', '--pool-notify=', '--repair-loops']
+    _REPORT_OPTIONS = [
+        '--model-notify=', '--pool-notify=', '--report-untestable'
+    ]
 
     def setUp(self):
         dirpath = '/usr/local/fubar'
@@ -1126,7 +1128,7 @@
         arguments = self._parse_arguments(['--model-notify='])
         self.assertEqual(arguments.model_notify, [''])
         self.assertEqual(arguments.pool_notify, [])
-        self.assertFalse(arguments.repair_loops)
+        self.assertFalse(arguments.report_untestable)
 
 
     def test_pool_notify_defaults(self):
@@ -1134,15 +1136,15 @@
         arguments = self._parse_arguments(['--pool-notify='])
         self.assertEqual(arguments.model_notify, [])
         self.assertEqual(arguments.pool_notify, [''])
-        self.assertFalse(arguments.repair_loops)
+        self.assertFalse(arguments.report_untestable)
 
 
-    def test_repair_loop_defaults(self):
-        """Test defaults when `--repair-loops` is specified alone."""
-        arguments = self._parse_arguments(['--repair-loops'])
+    def test_report_untestable_defaults(self):
+        """Test defaults when `--report-untestable` is specified alone."""
+        arguments = self._parse_arguments(['--report-untestable'])
         self.assertEqual(arguments.model_notify, [])
         self.assertEqual(arguments.pool_notify, [])
-        self.assertTrue(arguments.repair_loops)
+        self.assertTrue(arguments.report_untestable)
 
 
     def test_model_arguments(self):
commit	1ca30e67e846aa12e9d523763a3829b298b5ce1d	[log] [tgz]
author	Richard Barnette <jrbarnette@chromium.org>	Mon Apr 09 16:45:58 2018 -0700
committer	chrome-bot <chrome-bot@chromium.org>	Fri Apr 13 15:47:52 2018 -0700
tree	69dd771e334babe84889430bb266a3d35d7b80a0
parent	16b74bef5599bc5cdaf305cde9fe3efde3ed973c [diff]