Autotest: Report idle devices for the Lab DUT inventory. Idle DUTs, which are typically locked or wedged, are currently classified as non-working DUTs, the same as broken DUTs. This fixes two things: * For the board inventory, request repair for those `BROKEN` DUTs that have actually failed repair. Add a column that reports on idle DUTs to the message that is sent to englab-sys-cros. * For the pool inventory (sent to the deputies) add a column that reports on idle DUTs, and a detailed idle DUT list. Ask for the deputy's attention. BUG=chromium:590386 TEST=ran lab_inventory_unittest.py locally with --debug, with idle DUTs added to the test. Change-Id: I564dd79de69092276aabca6b6714dc175e37dfda Reviewed-on: https://chromium-review.googlesource.com/332191 Commit-Ready: Xixuan Wu <xixuan@chromium.org> Tested-by: Xixuan Wu <xixuan@chromium.org> Reviewed-by: Xixuan Wu <xixuan@chromium.org>

commit: 12ce04fcc1a574815e5891512adc7d2f08aa32cc [log] [tgz]
author: xixuan <xixuan@google.com> Thu Mar 10 13:16:30 2016 -0800
committer: chrome-bot <chrome-bot@chromium.org> Fri Mar 18 22:16:52 2016 -0700
tree: 4dfad61aeefc8f0bd28baf065534430303176ad3
parent: 2dbd95a58b1866f849cc5230f140a0162eea6e15 [diff] [blame]
diff --git a/site_utils/lab_inventory.py b/site_utils/lab_inventory.py
index ba34fe0..cddb489 100755
--- a/site_utils/lab_inventory.py
+++ b/site_utils/lab_inventory.py

@@ -141,16 +141,18 @@
       * `get_working_list()`
       * `get_broken()`
       * `get_broken_list()`
+      * `get_idle()`
+      * `get_idle_list()`
     The first time any one of these methods is called, it causes
     multiple RPC calls with a relatively expensive set of database
     queries.  However, the results of the queries are cached in the
     individual `HostJobHistory` objects, so only the first call
     actually pays the full cost.
 
-    Additionally, `get_working_list()` and `get_broken_list()` both
-    cache their return values to avoid recalculating lists at every
-    call; this caching is separate from the caching of RPC results
-    described above.
+    Additionally, `get_working_list()`, `get_broken_list()` and
+    `get_idle_list()` cache their return values to avoid recalculating
+    lists at every call; this caching is separate from the caching of RPC
+    results described above.
 
     This class is deliberately constructed to delay the RPC cost
     until the accessor methods are called (rather than to query in
@@ -164,6 +166,7 @@
         self._histories = []
         self._working_list = None
         self._broken_list = None
+        self._idle_list = None
 
 
     def record_host(self, host_history):
@@ -175,6 +178,7 @@
         """
         self._working_list = None
         self._broken_list = None
+        self._idle_list = None
         self._histories.append(host_history)
 
 
@@ -204,7 +208,7 @@
         """Return a list of all broken DUTs in the pool.
 
         Filter `self._histories` for histories where the last
-        diagnosis is not `WORKING`.
+        diagnosis is `BROKEN`.
 
         Cache the result so that we only cacluate it once.
 
@@ -213,7 +217,7 @@
         """
         if self._broken_list is None:
             self._broken_list = [h for h in self._histories
-                    if h.last_diagnosis()[0] != status_history.WORKING]
+                    if h.last_diagnosis()[0] == status_history.BROKEN]
         return self._broken_list
 
 
@@ -222,6 +226,29 @@
         return len(self.get_broken_list())
 
 
+    def get_idle_list(self):
+        """Return a list of all idle DUTs in the pool.
+
+        Filter `self._histories` for histories where the last
+        diagnosis is `UNUSED` or `UNKNOWN`.
+
+        Cache the result so that we only calculate it once.
+
+        @return A list of HostJobHistory objects.
+
+        """
+        idle_list = [status_history.UNUSED, status_history.UNKNOWN]
+        if self._idle_list is None:
+            self._idle_list = [h for h in self._histories
+                    if h.last_diagnosis()[0] in idle_list]
+        return self._idle_list
+
+
+    def get_idle(self):
+        """Return the number of idle DUTs in the pool."""
+        return len(self.get_idle_list())
+
+
     def get_total(self):
         """Return the total number of DUTs in the pool."""
         return len(self._histories)
@@ -311,8 +338,7 @@
         """Return a list of all broken DUTs for the board.
 
         Go through all HostJobHistory objects in the board's pools,
-        selecting the ones where the last diagnosis is not
-        `WORKING`.
+        selecting the ones where the last diagnosis is `BROKEN`.
 
         @return A list of HostJobHistory objects.
 
@@ -334,6 +360,38 @@
         return self._count_pool(_PoolCounts.get_broken, pool)
 
 
+    def get_idle_list(self, pool=None):
+        """Return a list of all idle DUTs for the board.
+
+        Go through all HostJobHistory objects in the board's pools,
+        selecting the ones where the last diagnosis is `UNUSED` or `UNKNOWN`.
+
+        @param pool: The pool to be counted. If `None`, return the total list
+                     across all pools.
+
+        @return A list of HostJobHistory objects.
+
+        """
+        if pool is None:
+            l = []
+            for p in self._pools.values():
+                l.extend(p.get_idle_list())
+            return l
+        else:
+            return _PoolCounts.get_idle_list(self._pools[pool])
+
+
+    def get_idle(self, pool=None):
+        """Return the number of idle DUTs in a pool.
+
+        @param pool: The pool to be counted. If `None`, return the total
+                     across all pools.
+
+        @return The total number of idle DUTs in the selected pool(s).
+        """
+        return self._count_pool(_PoolCounts.get_idle, pool)
+
+
     def get_spares_buffer(self):
         """Return the the nominal number of working spares.
 
@@ -664,6 +722,7 @@
     logging.debug('Creating board inventory')
     nworking = 0
     nbroken = 0
+    nidle = 0
     nbroken_boards = 0
     ntotal_boards = 0
     summaries = []
@@ -672,11 +731,12 @@
         counts = inventory[board]
         # Summary elements laid out in the same order as the text
         # headers:
-        #     Board Avail   Bad  Good Spare Total
-        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]
+        #     Board Avail   Bad  Idle  Good  Spare Total
+        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
         element = (board,
                    counts.get_spares_buffer(),
                    counts.get_broken(),
+                   counts.get_idle(),
                    counts.get_working(),
                    counts.get_total(_SPARE_POOL),
                    counts.get_total())
@@ -685,15 +745,18 @@
             nbroken_boards += 1
         ntotal_boards += 1
         nbroken += element[2]
-        nworking += element[3]
-    ntotal = nworking + nbroken
+        nidle += element[3]
+        nworking += element[4]
+    ntotal = nworking + nbroken + nidle
     summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
     broken_percent = int(round(100.0 * nbroken / ntotal))
-    working_percent = 100 - broken_percent
+    idle_percent = int(round(100.0 * nidle / ntotal))
+    working_percent = 100 - broken_percent - idle_percent
     message = ['Summary of DUTs in inventory:',
-               '%10s %10s %6s' % ('Bad', 'Good', 'Total'),
-               '%5d %3d%% %5d %3d%% %6d' % (
+               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
+               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                    nbroken, broken_percent,
+                   nidle, idle_percent,
                    nworking, working_percent,
                    ntotal),
                '',
@@ -701,11 +764,11 @@
                'Boards in inventory:  %d' % ntotal_boards,
                '', '',
                'Full board inventory:\n',
-               '%-22s %5s %5s %5s %5s %5s' % (
-                   'Board', 'Avail', 'Bad', 'Good',
+               '%-22s %5s %5s %5s %5s %5s %5s' % (
+                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                    'Spare', 'Total')]
     message.extend(
-            ['%-22s %5d %5d %5d %5d %5d' % e for e in summaries])
+            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
     return '\n'.join(message)
 
 
@@ -741,28 +804,69 @@
         message.append(
             '%sStatus for pool:%s, by board:' % (newline, pool))
         message.append(
-            '%-20s   %5s %5s %5s' % (
-                'Board', 'Bad', 'Good', 'Total'))
+            '%-20s   %5s %5s %5s %5s' % (
+                'Board', 'Bad', 'Idle', 'Good', 'Total'))
         data_list = []
         for board, counts in inventory.items():
             logging.debug('Counting inventory for %s, %s',
                           board, pool)
             broken = counts.get_broken(pool)
-            if broken == 0:
+            idle = counts.get_idle(pool)
+            # boards at full strength are not reported
+            if broken == 0 and idle == 0:
                 continue
             working = counts.get_working(pool)
             total = counts.get_total(pool)
-            data_list.append((board, broken, working, total))
+            data_list.append((board, broken, idle, working, total))
         if data_list:
             data_list = sorted(data_list, key=lambda d: -d[1])
             message.extend(
-                ['%-20s   %5d %5d %5d' % t for t in data_list])
+                ['%-20s   %5d %5d %5d %5d' % t for t in data_list])
         else:
             message.append('(All boards at full strength)')
         newline = '\n'
     return '\n'.join(message)
 
 
+_IDLE_INVENTORY_HEADER = '''\
+Notice to Infrastructure deputies:  The hosts shown below haven't
+run any jobs for at least 24 hours. Please check each host; locked
+hosts should normally be unlocked; stuck jobs should normally be
+aborted.
+'''
+
+
+def _generate_idle_inventory_message(inventory):
+    """Generate the "idle inventory" e-mail message.
+
+    The idle inventory is a host list with corresponding pool and board,
+    where the hosts are idle (`UNKNOWN` or `UNUSED`).
+
+    N.B. For sample output text format as users can expect to
+    see it in e-mail and log files, refer to the unit tests.
+
+    @param inventory  _LabInventory object with the inventory to
+                      be reported on.
+    @return String with the inventory message to be sent.
+
+    """
+    logging.debug('Creating idle inventory')
+    message = [_IDLE_INVENTORY_HEADER]
+    message.append('Idle Host List:')
+    message.append('%-30s %-20s %s' % ('Hostname', 'Board', 'Pool'))
+    data_list = []
+    for pool in _MANAGED_POOLS:
+        for board, counts in inventory.items():
+            logging.debug('Counting inventory for %s, %s', board, pool)
+            data_list.extend([(dut.host.hostname, board, pool)
+                                  for dut in counts.get_idle_list(pool)])
+    if data_list:
+        message.extend(['%-30s %-20s %s' % t for t in data_list])
+    else:
+        message.append('(No idle DUTs)')
+    return '\n'.join(message)
+
+
 def _send_email(arguments, tag, subject, recipients, body):
     """Send an inventory e-mail message.
 
@@ -1036,11 +1140,13 @@
                         recommend_message + board_message)
 
         if arguments.pool_notify:
+            pool_message = _generate_pool_inventory_message(inventory)
+            idle_message = _generate_idle_inventory_message(inventory)
             _send_email(arguments,
                         'pools-%s.txt' % timestamp,
                         'DUT pool inventory %s' % timestamp,
                         arguments.pool_notify,
-                        _generate_pool_inventory_message(inventory))
+                        pool_message + '\n\n\n' + idle_message)
     except KeyboardInterrupt:
         pass
     except EnvironmentError as e:
commit	12ce04fcc1a574815e5891512adc7d2f08aa32cc	[log] [tgz]
author	xixuan <xixuan@google.com>	Thu Mar 10 13:16:30 2016 -0800
committer	chrome-bot <chrome-bot@chromium.org>	Fri Mar 18 22:16:52 2016 -0700
tree	4dfad61aeefc8f0bd28baf065534430303176ad3
parent	2dbd95a58b1866f849cc5230f140a0162eea6e15 [diff] [blame]