blob: 97944a78c22923f9f556d6ba3771db8a66ccb107 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage: lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
20--board-notify <address>[,<address>]
21 Send the "board status" e-mail to all the specified e-mail
22 addresses.
23
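--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend-notify <address>[,<address>]
    Send the repair recommendations e-mail to all the specified
    e-mail addresses.

--recommend / -r <count>
    How many DUTs to list in the repair recommendations
    (default: 10).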
27
28--logdir <directory>
29 Log progress and actions in a file under this directory. Text
30 of any e-mail sent will also be logged in a timestamped file in
31 this directory.
32
33--print
34 Suppress all logging and sending e-mail. Instead, write the
35 output that would be generated onto stdout.
36
37<board> arguments:
38 With no arguments, gathers the status for all boards in the lab.
39 With one or more named boards on the command line, restricts
40 reporting to just those boards.
41
42"""
43
44
45import argparse
46import logging
47import logging.handlers
48import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070049import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070050import sys
51import time
52
53import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070054from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070055from autotest_lib.client.common_lib import time_utils
56from autotest_lib.server import frontend
J. Richard Barnettef6839282015-06-01 16:00:35 -070057from autotest_lib.server.hosts import servo_host
J. Richard Barnette96db3492015-03-27 17:23:52 -070058from autotest_lib.site_utils import gmail_lib
59from autotest_lib.site_utils import status_history
60from autotest_lib.site_utils.suite_scheduler import constants
61
62
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing.  That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The set of all the general purpose pools
#     monitored by this script.

_CRITICAL_POOLS = ['bvt', 'cq']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time (in hours) to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# _DEFAULT_NUM_RECOMMEND:
#     The default setting for the --recommend option.  That option
#     determines how many DUTs will be listed in the output produced
#     by `_generate_repair_recommendation()`.
_DEFAULT_NUM_RECOMMEND = 10

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
#
# The four capture groups are, in order: lab name, row number, rack
# number, and host number.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
125
J. Richard Barnette96db3492015-03-27 17:23:52 -0700126
class _PoolCounts(object):
    """Collection of `HostJobHistory` objects belonging to one pool.

    All recorded histories are nominally members of the same
    scheduling pool of DUTs.  From them the class can produce the
    list of working DUTs, the list of broken DUTs, and the total
    number of DUTs.

    Performance note:  These methods are potentially expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
    Each of them calls `last_diagnosis()` on the recorded histories.
    The first such call triggers multiple RPC calls backed by
    relatively expensive database queries; the `HostJobHistory`
    objects cache those results, so only the first call pays the
    full cost.

    Separately from that RPC caching, `get_working_list()` and
    `get_broken_list()` remember the lists they computed, so the
    filtering isn't redone on every call.

    The RPC cost is deliberately deferred until an accessor is
    called (rather than being paid in `record_host()`), so that a
    complete `_LabInventory` can be built without making the
    expensive queries at creation time.  `_populate_board_counts()`
    relies on this behavior.

    """

    def __init__(self):
        self._histories = []
        # `None` means "not calculated yet" (or invalidated).
        self._working_list = None
        self._broken_list = None


    def record_host(self, host_history):
        """Remember one more `HostJobHistory` object.

        @param host_history  The `HostJobHistory` object to be
                             added to the pool.

        """
        self._histories.append(host_history)
        # Pool membership changed, so any cached lists are stale.
        self._working_list = self._broken_list = None


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the recorded histories whose last diagnosis is
        `WORKING`.  The result is cached until the next call to
        `record_host()`.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is not None:
            return self._working_list
        working = []
        for history in self._histories:
            diagnosis = history.last_diagnosis()[0]
            if diagnosis == status_history.WORKING:
                working.append(history)
        self._working_list = working
        return working


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the recorded histories whose last diagnosis is
        anything other than `WORKING`.  The result is cached until
        the next call to `record_host()`.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is not None:
            return self._broken_list
        broken = []
        for history in self._histories:
            diagnosis = history.last_diagnosis()[0]
            if diagnosis != status_history.WORKING:
                broken.append(history)
        self._broken_list = broken
        return broken


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)
224
225
class _BoardCounts(object):
    """Collection of `HostJobHistory` objects belonging to one board.

    All recorded histories are nominally for DUTs of the same
    board.  The histories are kept in one `_PoolCounts` object per
    pool, so that working, broken, and total counts are available
    either for a single pool or summed across all pools.

    Every recorded DUT must be assigned to one of the pools in
    `_MANAGED_POOLS`.

    `get_working()` and `get_broken()` delegate to the methods of
    the same name in `_PoolCounts`, so the performance note on that
    class applies here too.

    """

    def __init__(self):
        self._pools = {}
        for pool in _MANAGED_POOLS:
            self._pools[pool] = _PoolCounts()

    def record_host(self, host_history):
        """Remember one more `HostJobHistory` object.

        The history is filed under the pool named by its
        `host_pool` attribute.

        @param host_history  The `HostJobHistory` object to be
                             added to the board.

        """
        self._pools[host_history.host_pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        `get_pool_count` is called with a `_PoolCounts` object and
        returns whichever count is of interest for that pool.

        @param get_pool_count  Function to return a count from a
                               _PoolCount object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is not None:
            return get_pool_count(self._pools[pool])
        total = 0
        for counts in self._pools.values():
            total += get_pool_count(counts)
        return total


    def get_working_list(self):
        """Return a list of all working DUTs for the board.

        Concatenates the working lists of each of the board's
        pools.

        @return A list of HostJobHistory objects.

        """
        return [history
                for pool_counts in self._pools.values()
                for history in pool_counts.get_working_list()]


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_working, pool)


    def get_broken_list(self):
        """Return a list of all broken DUTs for the board.

        Concatenates the broken lists of each of the board's
        pools.

        @return A list of HostJobHistory objects.

        """
        return [history
                for pool_counts in self._pools.values()
                for history in pool_counts.get_broken_list()]


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_broken, pool)


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_total, pool)
338
339
class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board.  Indexing returns the
    _BoardCounts object associated with the board.

    The collection is also iterable.  The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `_MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        pool_labels = [constants.Labels.POOL_PREFIX + pool
                       for pool in _MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=pool_labels)
        if boardlist:
            boardhosts = []
            for board in boardlist:
                board_label = constants.Labels.BOARD_PREFIX + board
                boardhosts.extend(
                        [h for h in afehosts if board_label in h.labels])
            afehosts = boardhosts
        return cls([status_history.HostJobHistory(
                            afe, host, start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = set(h.host_board for h in histories)
        initval = {board: _BoardCounts() for board in boards}
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        # Cache for `get_board_counts()`; `None` until first use.
        self._board_counts = None
        for h in histories:
            self[h.host_board].record_host(h)


    def get_working_list(self):
        """Return a list of all working DUTs in the inventory.

        Go through all HostJobHistory objects in the inventory,
        selecting the ones where the last diagnosis is `WORKING`.

        @return A list of HostJobHistory objects.

        """
        l = []
        for counts in self.values():
            l.extend(counts.get_working_list())
        return l


    def get_broken_list(self):
        """Return a list of all broken DUTs in the inventory.

        Go through all HostJobHistory objects in the inventory,
        selecting the ones where the last diagnosis is not
        `WORKING`.

        @return A list of HostJobHistory objects.

        """
        l = []
        for counts in self.values():
            l.extend(counts.get_broken_list())
        return l


    def get_board_counts(self):
        """Calculate a summary of board counts.

        The summary is a list of tuples.  The tuple elements, in
        order, are:
          * board - The name of the board associated with the
            counts.
          * buffer - The buffer of working spares (the total number
            of spares, less the number of broken DUTs).
          * broken - The number of broken DUTs.
          * working - The number of working DUTs.
          * spares - The number of DUTs in the spares pool.
          * total - The total number of DUTs.

        Boards with no DUTs in the spares pool or no DUTs in a
        critical pool will be excluded from the listed counts.

        The ordering of the boards is unspecified.

        The summary is calculated on the first call and cached; the
        same list object is returned by every later call.

        @return A list of tuples with board data.

        """
        if self._board_counts is None:
            self._board_counts = []
            for board, counts in self.items():
                logging.debug('Counting inventory for %s', board)
                spares = counts.get_total(_SPARE_POOL)
                total = counts.get_total()
                # Skip boards with no spares, and boards that have
                # nothing but spares (no DUTs in a critical pool).
                if spares == 0 or spares == total:
                    continue
                working = counts.get_working()
                broken = counts.get_broken()
                spare_buffer = spares - broken
                element = (board, spare_buffer, broken, working,
                           spares, total)
                self._board_counts.append(element)
        return self._board_counts


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)
485
486
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are omitted from the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct regardless of how large the individual numbers are.

    @param inventory_list  A list of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; the sort is stable, so
        # DUTs with equal keys keep their input order.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([history for _, history in dut_list])
    return return_list
522
523
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    If `buffer_counts` is empty there is nothing to improve, so
    every repair set is equally good; the score is 0 in that case.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping boards to buffer
                          counts.
    @param repair_list    A list of `HostJobHistory` objects for
                          the DUTs to be repaired.
    @return A numeric score.

    """
    if not buffer_counts:
        return 0
    _NBOARDS = 1000
    # Group the proposed repairs by board, then calculate each
    # board's buffer count as it would be after the repairs.  The
    # board names are discarded, as they don't contribute to the
    # final score.
    repair_inventory = _LabInventory(repair_list)
    new_counts = []
    for board, count in buffer_counts.items():
        if board in repair_inventory:
            count += repair_inventory[board].get_total()
        new_counts.append(count)
    # Find the worst available spares count, and count how many
    # boards are at that worst case.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NBOARDS * worst_count - num_worst
574
575
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If the inventory has no broken DUTs in a recognized lab
    location, the message consists of the header line only.

    @param inventory      `_LabInventory` object with the inventory
                          to be reported on.
    @param num_recommend  Maximum number of DUTs to recommend.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_counts = inventory.get_board_counts()
    # t[0] - board name
    # t[1] - size of spares buffer
    # t[2] - number of broken devices
    board_buffer_counts = {t[0]: t[1] for t in board_counts
                           if t[2] != 0}
    recommendation = None
    best_score = None
    # N.B. The logic of this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    for lab_duts in _sort_by_location(inventory.get_broken_list()):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` physically adjacent
        # DUTs along the lab, keeping the best scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    message = ['%-30s %-16s %s' % (
            'Hostname', 'Board', 'Servo instructions')]
    # `recommendation` is still `None` if the loop above never ran,
    # i.e. there were no broken DUTs to choose from.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        if utils.host_is_in_lab_zone(servo_name):
            servo_message = 'Repair servo first'
        else:
            servo_message = 'No servo present'
        line = '%-30s %-16s %s' % (
                h.host.hostname, h.host_board, servo_message)
        message.append(line)
    return '\n'.join(message)
643
644
def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The message has one line per board, giving the board's buffer
    of available spares and its broken, working, spare, and total
    DUT counts.  Boards are listed from the smallest spares buffer
    to the largest; boards with equal buffers are listed with the
    largest number of broken DUTs first.

    Which boards appear is decided by
    `inventory.get_board_counts()`.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    header = '%-20s %5s %5s %5s %5s %5s' % (
            'Board', 'Avail', 'Bad', 'Good', 'Spare', 'Total')
    # Sort a copy, so that the inventory's own list is untouched.
    # row[1] is the spares buffer, row[2] the broken count.
    rows = list(inventory.get_board_counts())
    rows.sort(key=lambda row: (row[1], -row[2]))
    lines = [header]
    for row in rows:
        lines.append('%-20s %5d %5d %5d %5d %5d' % row)
    return '\n'.join(lines)
675
676
# Introductory text placed at the top of the "pool inventory" e-mail
# message.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: If any boards are shown at
less than full strength, please take action to resolve the issues.
Once you're satisfied that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
685
686
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The message starts with `_POOL_INVENTORY_HEADER`, followed by
    one section for each pool in `_CRITICAL_POOLS`.  A section
    lists, by board, the number of broken, working, and total DUTs
    in that pool, with the boards having the most broken DUTs
    first.  Only boards with at least one broken DUT in the pool
    are listed; if there are none, the section says so instead.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        message.append(
            '%-20s %5s %5s %5s' % ('Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken != 0:
                rows.append((board, broken,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            message.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        message.extend('%-20s %5d %5d %5d' % row for row in rows)
    return '\n'.join(message)
729
730
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name.

    If the --print option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are
    reported with `logging.error()`, and are otherwise ignored.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.print_:
        # With a single parenthesized argument, this prints the
        # same text whether `print` is a statement or a function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` guarantees the file is closed even if the write
        # fails part way through.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    # A failure to log the message must not prevent sending it.
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
769
770
def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Whitespace around each address is removed.  Empty entries,
    such as those produced by a trailing or doubled comma, are
    dropped rather than being returned as empty addresses.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    newlist = []
    for arg in address_list:
        for email in arg.split(','):
            email = email.strip()
            if email:
                newlist.append(email)
    return newlist
783
784
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--board-notify`,
    `--pool-notify`, and `--recommend-notify` in separate option
    arguments into a single list per option.

    The `arguments` object is updated in place.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    # `--recommend-notify` takes addresses in the same form as the
    # other notify options.  Tolerate argument objects that don't
    # carry the option at all.
    if hasattr(arguments, 'recommend_notify'):
        arguments.recommend_notify = _separate_email_addresses(
                arguments.recommend_notify)
799
800
def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` relative path, taken from the
    parent of the directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
814
815
def _parse_command(argv):
    """Parse the command line arguments.

    Build the argument parser describing this command's syntax,
    use it to parse the command line, and post-process the result
    with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args().

    """
    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: %d)' %
                      _DEFAULT_NUM_RECOMMEND)
    # The notify options differ only in name and help text.
    notify_options = [
        ('--board-notify',
         'Generate board inventory message, '
         'and send it to the given e-mail address(es)'),
        ('--pool-notify',
         'Generate pool inventory message, '
         'and send it to the given address(es)'),
        ('--recommend-notify',
         'Generate repair recommendations, '
         'and send it to the given address(es)'),
    ]

    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help=duration_help)
    for flag, help_text in notify_options:
        parser.add_argument(flag, action='append',
                            default=[], metavar='ADDRESS',
                            help=help_text)
    parser.add_argument('-r', '--recommend', type=int,
                        default=_DEFAULT_NUM_RECOMMEND,
                        help=recommend_help)
    parser.add_argument('--print', dest='print_', action='store_true',
                        help='Print e-mail messages on stdout '
                             'without sending them.')
    parser.add_argument('--logdir', default=_get_logdir(argv[0]),
                        help='Directory where logs will be written.')
    parser.add_argument('boardnames', nargs='*',
                        metavar='BOARD',
                        help='names of boards to report on '
                             '(default: all boards)')
    arguments = parser.parse_args(argv[1:])
    _verify_arguments(arguments)
    return arguments
864
865
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--print` option was
    provided on the command line.  Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option.  With the
    option, we write log messages to stdout; messages below INFO
    level are discarded.

    The log file is configured to rotate once a week (`when='W4'`,
    which `TimedRotatingFileHandler` defines as Friday), keeping 13
    backups, i.e. ~3 months worth of history.

    Any handlers already installed on the root logger are removed,
    so that our handler is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.print_:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the handler
    # list, and removing while iterating the list itself would skip
    # every other handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700904
905
def _populate_board_counts(inventory):
    """Force the per-board status queries, drawing a progress bar.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Ordinarily that cost is paid lazily, as results are needed.
    With the `--print` option, though, a person may be watching, so
    the expensive queries are triggered here, up front, and one
    character is written to stdout per board processed:  '+' for
    counts ending in 5, the tens digit for counts ending in 0, and
    '.' otherwise.  A total of broken DUTs is printed at the end.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    broken_duts = 0
    for position, board_counts in enumerate(inventory.values(), 1):
        tens, ones = divmod(position, 10)
        if ones == 5:
            marker = '+'
        elif ones == 0:
            marker = '%d' % (tens % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # The call to get_broken() is where all the time goes:  it
        # makes every HostJobHistory object for the board query the
        # database and cache its results.
        broken_duts += board_counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % broken_duts)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700940
941
def main(argv):
    """Gather the lab inventory and dispatch the requested reports.

    Parses the command line, configures logging, builds the
    inventory for the requested time window, and then, for each of
    the three reports (repair recommendations, board inventory,
    pool inventory), generates the message and hands it to
    `_send_email()` whenever `--print` was given or that report's
    notification list is not empty.

    A `KeyboardInterrupt` ends the run quietly.  Any other
    exception raised after logging is configured is logged with its
    traceback and is not re-raised.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        seconds_back = arguments.duration * 60 * 60
        start_time = end_time - seconds_back
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        planned = (
            (arguments.recommend_notify,
             'Will include repair recommendations'),
            (arguments.board_notify,
             'Will include board inventory'),
            (arguments.pool_notify,
             'Will include pool inventory'),
        )
        for recipients, announcement in planned:
            if recipients:
                logging.debug(announcement)

        afe = frontend.AFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.print_:
            _populate_board_counts(inventory)

        # Each entry is (log file name format, subject format,
        # recipients, message builder).  The builders are wrapped in
        # lambdas so that a report is generated only if it is going
        # to be used.
        reports = (
            ('recommend-%s.txt',
             'DUT repair recommendations %s',
             arguments.recommend_notify,
             lambda: _generate_repair_recommendation(
                     inventory, arguments.recommend)),
            ('boards-%s.txt',
             'DUT board inventory %s',
             arguments.board_notify,
             lambda: _generate_board_inventory_message(inventory)),
            ('pools-%s.txt',
             'DUT pool inventory %s',
             arguments.pool_notify,
             lambda: _generate_pool_inventory_message(inventory)),
        )
        for tag_fmt, subject_fmt, recipients, build_message in reports:
            if not (arguments.print_ or recipients):
                continue
            message = build_message()
            _send_email(arguments,
                        tag_fmt % timestamp,
                        subject_fmt % timestamp,
                        recipients,
                        message)
    except KeyboardInterrupt:
        # Deliberately silent:  an interactive interrupt is not an
        # error worth logging.
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
999
1000
# Run the report only when executed as a script, not when imported.
# The full argument vector is passed because main() expects
# `sys.argv[0]` to be included.
if __name__ == '__main__':
    main(sys.argv)