Autotest: Report idle devices for the Lab DUT inventory.
Idle DUTs, which are typically locked or wedged, are currently classified as
the same kind of non-working DUT as broken DUTs.
This fixes two things:
* For the board inventory, request repair for those `BROKEN` DUTs that have
actually failed repair. Add a column that reports on idle DUTs to
the message that is sent to englab-sys-cros.
* For the pool inventory (sent to the deputies) add a column that reports
on idle DUTs, and a detailed idle DUT list. Ask for the deputies' attention.
BUG=chromium:590386
TEST=ran lab_inventory_unittest.py locally with --debug, after adding idle
DUTs to the test.
Change-Id: I564dd79de69092276aabca6b6714dc175e37dfda
Reviewed-on: https://chromium-review.googlesource.com/332191
Commit-Ready: Xixuan Wu <xixuan@chromium.org>
Tested-by: Xixuan Wu <xixuan@chromium.org>
Reviewed-by: Xixuan Wu <xixuan@chromium.org>
diff --git a/site_utils/lab_inventory.py b/site_utils/lab_inventory.py
index ba34fe0..cddb489 100755
--- a/site_utils/lab_inventory.py
+++ b/site_utils/lab_inventory.py
@@ -141,16 +141,18 @@
* `get_working_list()`
* `get_broken()`
* `get_broken_list()`
+ * `get_idle()`
+ * `get_idle_list()`
The first time any one of these methods is called, it causes
multiple RPC calls with a relatively expensive set of database
queries. However, the results of the queries are cached in the
individual `HostJobHistory` objects, so only the first call
actually pays the full cost.
- Additionally, `get_working_list()` and `get_broken_list()` both
- cache their return values to avoid recalculating lists at every
- call; this caching is separate from the caching of RPC results
- described above.
+ Additionally, `get_working_list()`, `get_broken_list()` and
+ `get_idle_list()` cache their return values to avoid recalculating
+ lists at every call; this caching is separate from the caching of RPC
+ results described above.
This class is deliberately constructed to delay the RPC cost
until the accessor methods are called (rather than to query in
@@ -164,6 +166,7 @@
self._histories = []
self._working_list = None
self._broken_list = None
+ self._idle_list = None
def record_host(self, host_history):
@@ -175,6 +178,7 @@
"""
self._working_list = None
self._broken_list = None
+ self._idle_list = None
self._histories.append(host_history)
@@ -204,7 +208,7 @@
"""Return a list of all broken DUTs in the pool.
Filter `self._histories` for histories where the last
- diagnosis is not `WORKING`.
+ diagnosis is `BROKEN`.
Cache the result so that we only cacluate it once.
@@ -213,7 +217,7 @@
"""
if self._broken_list is None:
self._broken_list = [h for h in self._histories
- if h.last_diagnosis()[0] != status_history.WORKING]
+ if h.last_diagnosis()[0] == status_history.BROKEN]
return self._broken_list
@@ -222,6 +226,29 @@
return len(self.get_broken_list())
+ def get_idle_list(self):
+ """Return a list of all idle DUTs in the pool.
+
+ Filter `self._histories` for histories where the last
+ diagnosis is `UNUSED` or `UNKNOWN`.
+
+ Cache the result so that we only calculate it once.
+
+ @return A list of HostJobHistory objects.
+
+ """
+ idle_list = [status_history.UNUSED, status_history.UNKNOWN]
+ if self._idle_list is None:
+ self._idle_list = [h for h in self._histories
+ if h.last_diagnosis()[0] in idle_list]
+ return self._idle_list
+
+
+ def get_idle(self):
+ """Return the number of idle DUTs in the pool."""
+ return len(self.get_idle_list())
+
+
def get_total(self):
"""Return the total number of DUTs in the pool."""
return len(self._histories)
@@ -311,8 +338,7 @@
"""Return a list of all broken DUTs for the board.
Go through all HostJobHistory objects in the board's pools,
- selecting the ones where the last diagnosis is not
- `WORKING`.
+ selecting the ones where the last diagnosis is `BROKEN`.
@return A list of HostJobHistory objects.
@@ -334,6 +360,38 @@
return self._count_pool(_PoolCounts.get_broken, pool)
+ def get_idle_list(self, pool=None):
+ """Return a list of all idle DUTs for the board.
+
+ Go through all HostJobHistory objects in the board's pools,
+ selecting the ones where the last diagnosis is `UNUSED` or `UNKNOWN`.
+
+ @param pool: The pool to be counted. If `None`, return the total list
+ across all pools.
+
+ @return A list of HostJobHistory objects.
+
+ """
+ if pool is None:
+ l = []
+ for p in self._pools.values():
+ l.extend(p.get_idle_list())
+ return l
+ else:
+ return _PoolCounts.get_idle_list(self._pools[pool])
+
+
+ def get_idle(self, pool=None):
+ """Return the number of idle DUTs in a pool.
+
+ @param pool: The pool to be counted. If `None`, return the total
+ across all pools.
+
+ @return The total number of idle DUTs in the selected pool(s).
+ """
+ return self._count_pool(_PoolCounts.get_idle, pool)
+
+
def get_spares_buffer(self):
"""Return the the nominal number of working spares.
@@ -664,6 +722,7 @@
logging.debug('Creating board inventory')
nworking = 0
nbroken = 0
+ nidle = 0
nbroken_boards = 0
ntotal_boards = 0
summaries = []
@@ -672,11 +731,12 @@
counts = inventory[board]
# Summary elements laid out in the same order as the text
# headers:
- # Board Avail Bad Good Spare Total
- # e[0] e[1] e[2] e[3] e[4] e[5]
+ # Board Avail Bad Idle Good Spare Total
+ # e[0] e[1] e[2] e[3] e[4] e[5] e[6]
element = (board,
counts.get_spares_buffer(),
counts.get_broken(),
+ counts.get_idle(),
counts.get_working(),
counts.get_total(_SPARE_POOL),
counts.get_total())
@@ -685,15 +745,18 @@
nbroken_boards += 1
ntotal_boards += 1
nbroken += element[2]
- nworking += element[3]
- ntotal = nworking + nbroken
+ nidle += element[3]
+ nworking += element[4]
+ ntotal = nworking + nbroken + nidle
summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
broken_percent = int(round(100.0 * nbroken / ntotal))
- working_percent = 100 - broken_percent
+ idle_percent = int(round(100.0 * nidle / ntotal))
+ working_percent = 100 - broken_percent - idle_percent
message = ['Summary of DUTs in inventory:',
- '%10s %10s %6s' % ('Bad', 'Good', 'Total'),
- '%5d %3d%% %5d %3d%% %6d' % (
+ '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
+ '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
nbroken, broken_percent,
+ nidle, idle_percent,
nworking, working_percent,
ntotal),
'',
@@ -701,11 +764,11 @@
'Boards in inventory: %d' % ntotal_boards,
'', '',
'Full board inventory:\n',
- '%-22s %5s %5s %5s %5s %5s' % (
- 'Board', 'Avail', 'Bad', 'Good',
+ '%-22s %5s %5s %5s %5s %5s %5s' % (
+ 'Board', 'Avail', 'Bad', 'Idle', 'Good',
'Spare', 'Total')]
message.extend(
- ['%-22s %5d %5d %5d %5d %5d' % e for e in summaries])
+ ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
return '\n'.join(message)
@@ -741,28 +804,69 @@
message.append(
'%sStatus for pool:%s, by board:' % (newline, pool))
message.append(
- '%-20s %5s %5s %5s' % (
- 'Board', 'Bad', 'Good', 'Total'))
+ '%-20s %5s %5s %5s %5s' % (
+ 'Board', 'Bad', 'Idle', 'Good', 'Total'))
data_list = []
for board, counts in inventory.items():
logging.debug('Counting inventory for %s, %s',
board, pool)
broken = counts.get_broken(pool)
- if broken == 0:
+ idle = counts.get_idle(pool)
+ # boards at full strength are not reported
+ if broken == 0 and idle == 0:
continue
working = counts.get_working(pool)
total = counts.get_total(pool)
- data_list.append((board, broken, working, total))
+ data_list.append((board, broken, idle, working, total))
if data_list:
data_list = sorted(data_list, key=lambda d: -d[1])
message.extend(
- ['%-20s %5d %5d %5d' % t for t in data_list])
+ ['%-20s %5d %5d %5d %5d' % t for t in data_list])
else:
message.append('(All boards at full strength)')
newline = '\n'
return '\n'.join(message)
+_IDLE_INVENTORY_HEADER = '''\
+Notice to Infrastructure deputies: The hosts shown below haven't
+run any jobs for at least 24 hours. Please check each host; locked
+hosts should normally be unlocked; stuck jobs should normally be
+aborted.
+'''
+
+
+def _generate_idle_inventory_message(inventory):
+ """Generate the "idle inventory" e-mail message.
+
+ The idle inventory is a host list with corresponding pool and board,
+ where the hosts are idle (`UNKNOWN` or `UNUSED`).
+
+ N.B. For sample output text format as users can expect to
+ see it in e-mail and log files, refer to the unit tests.
+
+ @param inventory _LabInventory object with the inventory to
+ be reported on.
+ @return String with the inventory message to be sent.
+
+ """
+ logging.debug('Creating idle inventory')
+ message = [_IDLE_INVENTORY_HEADER]
+ message.append('Idle Host List:')
+ message.append('%-30s %-20s %s' % ('Hostname', 'Board', 'Pool'))
+ data_list = []
+ for pool in _MANAGED_POOLS:
+ for board, counts in inventory.items():
+ logging.debug('Counting inventory for %s, %s', board, pool)
+ data_list.extend([(dut.host.hostname, board, pool)
+ for dut in counts.get_idle_list(pool)])
+ if data_list:
+ message.extend(['%-30s %-20s %s' % t for t in data_list])
+ else:
+ message.append('(No idle DUTs)')
+ return '\n'.join(message)
+
+
def _send_email(arguments, tag, subject, recipients, body):
"""Send an inventory e-mail message.
@@ -1036,11 +1140,13 @@
recommend_message + board_message)
if arguments.pool_notify:
+ pool_message = _generate_pool_inventory_message(inventory)
+ idle_message = _generate_idle_inventory_message(inventory)
_send_email(arguments,
'pools-%s.txt' % timestamp,
'DUT pool inventory %s' % timestamp,
arguments.pool_notify,
- _generate_pool_inventory_message(inventory))
+ pool_message + '\n\n\n' + idle_message)
except KeyboardInterrupt:
pass
except EnvironmentError as e: