[autotest] Update lab_inventory to report idle DUTs with metrics.
This updates the lab_inventory script to report DUTs that are idle
and unlocked via a new Monarch metric. The new metric supports
reporting both idle DUTs and DUTs in repair loops. The old,
single-purpose metric for reporting repair loops is being deprecated
in favor of the new metric.
CQ-DEPEND=CL:*605029
BUG=chromium:804625
TEST=run the inventory command locally
Change-Id: Id4d5eba265902cb84072bd0141d9657bce0f47d0
Reviewed-on: https://chromium-review.googlesource.com/1003599
Commit-Ready: Richard Barnette <jrbarnette@chromium.org>
Tested-by: Richard Barnette <jrbarnette@chromium.org>
Reviewed-by: Allen Li <ayatane@chromium.org>
diff --git a/contrib/inventory_options b/contrib/inventory_options
index 67d1887..3d329b3 100644
--- a/contrib/inventory_options
+++ b/contrib/inventory_options
@@ -35,19 +35,19 @@
# Options to be used for different script invocations. Inventory
# runs are relatively expensive, so operations that happen rarely
# also bundle operations that happen more frequently.
-# + REPAIR_LOOP_DETECT happens with every run. It looks for
-# and reports DUTs that do no work other than to fail, then repair
-# successfully.
+# + UNTESTABLE_DETECT happens with every run. It looks for
+# and reports DUTs that for never run tests even though their
+# status indicates that they should.
# + MODEL_NOTIFY happens less often. This adds a full model
# inventory count to REPAIR_LOOP_DETECT.
# + POOL_NOTIFY happens least often. It adds per-pool inventory
# counts, as well as individual DUT repair recommendations to
# MODEL_NOTIFY.
-REPAIR_LOOP_DETECT=( --repair-loops )
+UNTESTABLE_DETECT=( --report-untestable )
MODEL_NOTIFY=(
- "${REPAIR_LOOP_DETECT[@]}"
+ "${UNTESTABLE_DETECT[@]}"
--model-notify $(echo "${MODEL_INTEREST[@]}" | sed 's/ /,/g')
)
diff --git a/contrib/run-loop-detection b/contrib/run-loop-detection
index 4a938cf..7938175 100755
--- a/contrib/run-loop-detection
+++ b/contrib/run-loop-detection
@@ -7,4 +7,4 @@
cd $SCRIPT_DIR/..
. contrib/inventory_options
-site_utils/lab_inventory.py $OPTIONS "${REPAIR_LOOP_DETECT[@]}"
+site_utils/lab_inventory.py $OPTIONS "${UNTESTABLE_DETECT[@]}"
diff --git a/site_utils/lab_inventory.py b/site_utils/lab_inventory.py
index 39503a9..26dadf6 100755
--- a/site_utils/lab_inventory.py
+++ b/site_utils/lab_inventory.py
@@ -29,9 +29,9 @@
When generating the "model status" e-mail, include a list of
<number> specific DUTs to be recommended for repair.
---repair-loops
- Scan the inventory for DUTs stuck in repair loops, and report them
- via a Monarch presence metric.
+--report-untestable
+ Scan the inventory for DUTs that can't test because they're stuck in
+ repair loops, or because the scheduler can't give them work.
--logdir <directory>
Log progress and actions in a file under this directory. Text
@@ -122,6 +122,11 @@
_REPAIR_LOOP_THRESHOLD = 4
+_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
+ 'chromeos/autotest/inventory/untestable',
+ 'DUTs that cannot be scheduled for testing')
+
+
class _HostSetInventory(object):
"""Maintains a set of related `HostJobHistory` objects.
@@ -1015,7 +1020,7 @@
# a repair task, then our history includes a successful non-repair
# task, and we're not looping.
#
- # The for loop below is very expensive, because it must fetch the
+ # The for loop below is very expensive, because it must fetch the
# full history, regardless of how many tasks we examine. At the
# time of this writing, this check against the diagnosis task
# reduces the cost of finding loops in the full inventory from hours
@@ -1042,17 +1047,29 @@
return True
-def _perform_repair_loop_report(arguments, inventory):
- """Scan the inventory for DUTs stuck in a repair loop.
+def _report_untestable_dut(history, state):
+ fields = {
+ 'dut_hostname': history.hostname,
+ 'model': history.host_model,
+ 'pool': history.host_pool,
+ 'state': state,
+ }
+ logging.info('Untestable DUT: %(dut_hostname)s, model: %(model)s, '
+ 'pool: %(pool)s', fields)
+ _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
- This routine walks through the given inventory looking for DUTs
- where the most recent history shows that the DUT is regularly
- passing repair tasks, but has not run any tests.
- @param arguments Command-line arguments as returned by
- `ArgumentParser`
+def _report_repair_loop_metrics(inventory):
+ """Find and report DUTs stuck in a repair loop.
+
+ Go through `inventory`, and find and report any DUT identified as
+ being in a repair loop.
+
@param inventory `_LabInventory` object to be reported on.
"""
+ # TODO(jrbarnette) The `repair_loops` metric is being deprecated,
+ # and should be deleted as soon as the `untestable` metric has a
+ # dashboard that's working reliably.
loop_presence = metrics.BooleanMetric(
'chromeos/autotest/inventory/repair_loops',
'DUTs stuck in repair loops')
@@ -1067,13 +1084,59 @@
if not _HOSTNAME_PATTERN.match(history.hostname):
continue
if _dut_in_repair_loop(history):
- fields = {'dut_hostname': history.hostname,
- 'model': history.host_model,
- 'pool': history.host_pool}
- logging.info('Looping DUT: %(dut_hostname)s, '
- 'model: %(model)s, pool: %(pool)s',
- fields)
+ fields = {
+ 'dut_hostname': history.hostname,
+ 'model': history.host_model,
+ 'pool': history.host_pool
+ }
loop_presence.set(True, fields=fields)
+ _report_untestable_dut(history, 'repair_loop')
+
+
+def _report_idle_dut_metrics(inventory):
+ """Find and report idle, unlocked DUTs.
+
+ Go through `inventory`, and find and report any DUT identified as
+ "idle" that is not also locked.
+
+ @param inventory `_LabInventory` object to be reported on.
+ """
+ logging.info('Scanning for idle, unlocked DUTs.')
+ for counts in inventory.itervalues():
+ for history in counts.get_idle_list():
+ # Managed DUTs with names that don't match
+ # _HOSTNAME_PATTERN shouldn't be possible. However, we
+ # don't want arbitrary strings being attached to the
+ # 'dut_hostname' field, so for safety, we exclude all
+ # anomalies.
+ if not _HOSTNAME_PATTERN.match(history.hostname):
+ continue
+ if not history.host.locked:
+ _report_untestable_dut(history, 'idle_unlocked')
+
+
+def _report_untestable_dut_metrics(inventory):
+ """Scan the inventory for DUTs unable to run tests.
+
+ DUTs in the inventory are judged "untestable" if they meet one of
+ two criteria:
+ * The DUT is stuck in a repair loop; that is, it regularly passes
+ repair, but never passes other operations.
+ * The DUT runs no tasks at all, but is not locked.
+
+ This routine walks through the given inventory looking for DUTs in
+ either of these states. Results are reported via a Monarch presence
+ metric.
+
+ Note: To make sure that DUTs aren't flagged as "idle" merely
+ because there's no work, a separate job runs prior to regular
+ inventory runs which schedules trivial work on any DUT that appears
+ idle.
+
+ @param inventory `_LabInventory` object to be reported on.
+ """
+ _report_repair_loop_metrics(inventory)
+ _report_idle_dut_metrics(inventory)
def _log_startup(arguments, startup_time):
@@ -1134,8 +1197,8 @@
_perform_model_inventory(arguments, inventory, timestamp)
if arguments.pool_notify:
_perform_pool_inventory(arguments, inventory, timestamp)
- if arguments.repair_loops:
- _perform_repair_loop_report(arguments, inventory)
+ if arguments.report_untestable:
+ _report_untestable_dut_metrics(inventory)
def _separate_email_addresses(address_list):
@@ -1176,11 +1239,11 @@
arguments.pool_notify = _separate_email_addresses(
arguments.pool_notify)
if not any([arguments.model_notify, arguments.pool_notify,
- arguments.repair_loops]):
+ arguments.report_untestable]):
if not arguments.debug:
sys.stderr.write('Must request at least one report via '
'--model-notify, --pool-notify, or '
- '--repair-loops\n')
+ '--report-untestable\n')
return False
else:
# We want to run all the e-mail reports. An empty notify
@@ -1237,8 +1300,8 @@
help=('Specify how many DUTs should be '
'recommended for repair (default: no '
'recommendation)'))
- parser.add_argument('--repair-loops', action='store_true',
- help='Check for devices stuck in repair loops.')
+ parser.add_argument('--report-untestable', action='store_true',
+ help='Check for devices unable to run tests.')
parser.add_argument('--debug-metrics', action='store_true',
help='Include debug information about the metrics '
'that would be reported ')
@@ -1315,7 +1378,7 @@
if arguments.debug_metrics or not arguments.debug:
metrics_file = None if not arguments.debug_metrics else '/dev/null'
with site_utils.SetupTsMonGlobalState(
- 'repair_loops', debug_file=metrics_file,
+ 'lab_inventory', debug_file=metrics_file,
auto_flush=False):
_perform_inventory_reports(arguments)
metrics.Flush()
diff --git a/site_utils/lab_inventory_unittest.py b/site_utils/lab_inventory_unittest.py
index 487fa88..5418f7e 100755
--- a/site_utils/lab_inventory_unittest.py
+++ b/site_utils/lab_inventory_unittest.py
@@ -1077,7 +1077,9 @@
# At least one of these options must be specified on every command
# line; otherwise, the command line parsing will fail.
- _REPORT_OPTIONS = ['--model-notify=', '--pool-notify=', '--repair-loops']
+ _REPORT_OPTIONS = [
+ '--model-notify=', '--pool-notify=', '--report-untestable'
+ ]
def setUp(self):
dirpath = '/usr/local/fubar'
@@ -1126,7 +1128,7 @@
arguments = self._parse_arguments(['--model-notify='])
self.assertEqual(arguments.model_notify, [''])
self.assertEqual(arguments.pool_notify, [])
- self.assertFalse(arguments.repair_loops)
+ self.assertFalse(arguments.report_untestable)
def test_pool_notify_defaults(self):
@@ -1134,15 +1136,15 @@
arguments = self._parse_arguments(['--pool-notify='])
self.assertEqual(arguments.model_notify, [])
self.assertEqual(arguments.pool_notify, [''])
- self.assertFalse(arguments.repair_loops)
+ self.assertFalse(arguments.report_untestable)
- def test_repair_loop_defaults(self):
- """Test defaults when `--repair-loops` is specified alone."""
- arguments = self._parse_arguments(['--repair-loops'])
+ def test_report_untestable_defaults(self):
+ """Test defaults when `--report-untestable` is specified alone."""
+ arguments = self._parse_arguments(['--report-untestable'])
self.assertEqual(arguments.model_notify, [])
self.assertEqual(arguments.pool_notify, [])
- self.assertTrue(arguments.repair_loops)
+ self.assertTrue(arguments.report_untestable)
def test_model_arguments(self):