blob: 1fda391165b119c9698336c1e9974f1063b54151 [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Create e-mail reports of the Lab's DUT inventory.

Gathers a list of all DUTs of interest in the Lab, segregated by
board and pool, and determines whether each DUT is working or
broken.  Then, send one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.

usage:  lab_inventory.py [ options ] [ board ... ]

Options:
--duration / -d <hours>
    How far back in time to search job history to determine DUT
    status.

--board-notify <address>[,<address>]
    Send the "board status" e-mail to all the specified e-mail
    addresses.

--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend <number>
    When generating the "board status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.

--logdir <directory>
    Log progress and actions in a file under this directory.  Text
    of any e-mail sent will also be logged in a timestamped file in
    this directory.

--print
    Suppress all logging and sending e-mail.  Instead, write the
    output that would be generated onto stdout.

<board> arguments:
    With no arguments, gathers the status for all boards in the lab.
    With one or more named boards on the command line, restricts
    reporting to just those boards.

"""
47
48
import argparse
import logging
import logging.handlers
import os
import re
import sys
import time

import common
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.hosts import servo_host
from autotest_lib.site_utils import gmail_lib
from autotest_lib.site_utils import status_history
from autotest_lib.site_utils.suite_scheduler import constants
65
66
67# The pools in the Lab that are actually of interest.
68#
69# These are general purpose pools of DUTs that are considered
70# identical for purposes of testing. That is, a device in one of
71# these pools can be shifted to another pool at will for purposes
72# of supplying test demand.
73#
74# Devices in these pools are not allowed to have special-purpose
75# attachments, or to be part of in any kind of custom fixture.
76# Devices in these pools are also required to reside in areas
77# managed by the Platforms team (i.e. at the time of this writing,
78# only in "Atlantis" or "Destiny").
79#
80# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
81# to guarantee timely completion of tests from builders.
82# _SPARE_POOL - A low priority pool that is allowed to provide
83# spares to replace broken devices in the critical pools.
84# _MANAGED_POOLS - The set of all the general purpose pools
85# monitored by this script.
86
87_CRITICAL_POOLS = ['bvt', 'cq']
88_SPARE_POOL = 'suites'
89_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
90
J. Richard Barnette96db3492015-03-27 17:23:52 -070091# _DEFAULT_DURATION:
92# Default value used for the --duration command line option.
93# Specifies how far back in time to search in order to determine
94# DUT status.
95
96_DEFAULT_DURATION = 24
97
J. Richard Barnette96db3492015-03-27 17:23:52 -070098# _LOGDIR:
99# Relative path used in the calculation of the default setting
100# for the --logdir option. The full path path is relative to
101# the root of the autotest directory, as determined from
102# sys.argv[0].
103# _LOGFILE:
104# Basename of a file to which general log information will be
105# written.
106# _LOG_FORMAT:
107# Format string for log messages.
108
109_LOGDIR = os.path.join('logs', 'dut-data')
110_LOGFILE = 'lab-inventory.log'
111_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
112
J. Richard Barnettef6839282015-06-01 16:00:35 -0700113# Pattern describing location-based host names in the Chrome OS test
114# labs. Each DUT hostname designates the DUT's location:
115# * A lab (room) that's physically separated from other labs
116# (i.e. there's a door).
117# * A row (or aisle) of DUTs within the lab.
118# * A vertical rack of shelves on the row.
119# * A specific host on one shelf of the rack.
120
121_HOSTNAME_PATTERN = re.compile(
122 r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
123
J. Richard Barnette96db3492015-03-27 17:23:52 -0700124
125class _PoolCounts(object):
126 """Maintains a set of `HostJobHistory` objects for a pool.
127
128 The collected history objects are nominally all part of a single
J. Richard Barnettef6839282015-06-01 16:00:35 -0700129 scheduling pool of DUTs. The collection maintains a list of
130 working DUTs, a list of broken DUTs, and a list of all DUTs.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700131
J. Richard Barnettef6839282015-06-01 16:00:35 -0700132 Performance note: Certain methods in this class are potentially
133 expensive:
134 * `get_working()`
135 * `get_working_list()`
136 * `get_broken()`
137 * `get_broken_list()`
138 The first time any one of these methods is called, it causes
139 multiple RPC calls with a relatively expensive set of database
140 queries. However, the results of the queries are cached in the
141 individual `HostJobHistory` objects, so only the first call
142 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700143
J. Richard Barnettef6839282015-06-01 16:00:35 -0700144 Additionally, `get_working_list()` and `get_broken_list()` both
145 cache their return values to avoid recalculating lists at every
146 call; this caching is separate from the caching of RPC results
147 described above.
148
149 This class is deliberately constructed to delay the RPC cost
150 until the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700151 `record_host()`) so that it's possible to construct a complete
152 `_LabInventory` without making the expensive queries at creation
J. Richard Barnettef6839282015-06-01 16:00:35 -0700153 time. `_populate_board_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700154
155 """
156
157 def __init__(self):
158 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700159 self._working_list = None
160 self._broken_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700161
162
163 def record_host(self, host_history):
164 """Add one `HostJobHistory` object to the collection.
165
166 @param host_history The `HostJobHistory` object to be
167 remembered.
168
169 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700170 self._working_list = None
171 self._broken_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700172 self._histories.append(host_history)
173
174
J. Richard Barnettef6839282015-06-01 16:00:35 -0700175 def get_working_list(self):
176 """Return a list of all working DUTs in the pool.
177
178 Filter `self._histories` for histories where the last
179 diagnosis is `WORKING`.
180
181 Cache the result so that we only cacluate it once.
182
183 @return A list of HostJobHistory objects.
184
185 """
186 if self._working_list is None:
187 self._working_list = [h for h in self._histories
188 if h.last_diagnosis()[0] == status_history.WORKING]
189 return self._working_list
190
191
J. Richard Barnette96db3492015-03-27 17:23:52 -0700192 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700193 """Return the number of working DUTs in the pool."""
194 return len(self.get_working_list())
195
196
197 def get_broken_list(self):
198 """Return a list of all broken DUTs in the pool.
199
200 Filter `self._histories` for histories where the last
201 diagnosis is not `WORKING`.
202
203 Cache the result so that we only cacluate it once.
204
205 @return A list of HostJobHistory objects.
206
207 """
208 if self._broken_list is None:
209 self._broken_list = [h for h in self._histories
210 if h.last_diagnosis()[0] != status_history.WORKING]
211 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700212
213
214 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700215 """Return the number of broken DUTs in the pool."""
216 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700217
218
219 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700220 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700221 return len(self._histories)
222
223
class _BoardCounts(object):
    """Maintains a set of `HostJobHistory` objects for a board.

    The collected history objects are nominally all of the same
    board.  The collection maintains a count of working DUTs, a
    count of broken DUTs, and a total count.  The counts can be
    obtained either for a single pool, or as a total across all
    pools.

    DUTs in the collection must be assigned to one of the pools
    in `_MANAGED_POOLS`.

    The `get_working()` and `get_broken()` methods rely on the
    methods of the same name in _PoolCounts, so the performance
    note in _PoolCounts applies here as well.

    """

    def __init__(self):
        # One (initially empty) pool collection per managed pool.
        self._pools = {
            pool: _PoolCounts() for pool in _MANAGED_POOLS
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a pool that is
        not in `_MANAGED_POOLS` raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        pool = host_history.host_pool
        self._pools[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               _PoolCounts object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(counts)
                       for counts in self._pools.values())
        return get_pool_count(self._pools[pool])


    def get_working_list(self):
        """Return a list of all working DUTs for the board.

        Go through all HostJobHistory objects in the board's pools,
        selecting the ones where the last diagnosis is `WORKING`.

        @return A list of HostJobHistory objects.

        """
        working = []
        for pool_counts in self._pools.values():
            working.extend(pool_counts.get_working_list())
        return working


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_working, pool)


    def get_broken_list(self):
        """Return a list of all broken DUTs for the board.

        Go through all HostJobHistory objects in the board's pools,
        selecting the ones where the last diagnosis is not
        `WORKING`.

        @return A list of HostJobHistory objects.

        """
        broken = []
        for pool_counts in self._pools.values():
            broken.extend(pool_counts.get_broken_list())
        return broken


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_broken, pool)


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_total, pool)
336
337
class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board.  Indexing returns the
    _BoardCounts object associated with the board.

    The collection is also iterable.  The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `_MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        Each host is included at most once, even if `boardlist`
        names the same board more than once.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                          for l in _MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            board_labels = [constants.Labels.BOARD_PREFIX + board
                                for board in boardlist]
            # Filter host-by-host (rather than board-by-board) so
            # that a repeated board name can't record a host twice.
            afehosts = [h for h in afehosts
                            if any(label in h.labels
                                   for label in board_labels)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                        for host in afehosts])


    def __init__(self, histories):
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = set(h.host_board for h in histories)
        initval = { board: _BoardCounts() for board in boards }
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        # Cache for `get_board_counts()`; `None` until first use.
        self._board_counts = None
        for h in histories:
            self[h.host_board].record_host(h)


    def get_working_list(self):
        """Return a list of all working DUTs in the inventory.

        Go through all HostJobHistory objects in the inventory,
        selecting the ones where the last diagnosis is `WORKING`.

        @return A list of HostJobHistory objects.

        """
        working = []
        for counts in self.values():
            working.extend(counts.get_working_list())
        return working


    def get_broken_list(self):
        """Return a list of all broken DUTs in the inventory.

        Go through all HostJobHistory objects in the inventory,
        selecting the ones where the last diagnosis is not
        `WORKING`.

        @return A list of HostJobHistory objects.

        """
        broken = []
        for counts in self.values():
            broken.extend(counts.get_broken_list())
        return broken


    def get_board_counts(self):
        """Calculate a summary of board counts.

        The summary is a list of tuples.  The tuple elements, in
        order, are:
          * board - The name of the board associated with the
            counts.
          * buffer - The buffer of working spares (the total number
            of spares, less the number of broken DUTs).
          * broken - The number of broken DUTs.
          * working - The number of working DUTs.
          * spares - The number of DUTs in the spares pool.
          * total - The total number of DUTs.

        Boards with no DUTs in the spares pool or no DUTs in a
        critical pool will be excluded from the listed counts.

        The ordering of the boards is unspecified.

        The summary is calculated on the first call and cached; the
        same list object is returned by later calls.

        @return A list of tuples with board data.

        """
        if self._board_counts is None:
            self._board_counts = []
            for board, counts in self.items():
                logging.debug('Counting inventory for %s', board)
                spares = counts.get_total(_SPARE_POOL)
                total = counts.get_total()
                # Skip boards with no spares, or with nothing but
                # spares (i.e. nothing in a critical pool).
                if spares == 0 or spares == total:
                    continue
                working = counts.get_working()
                broken = counts.get_broken()
                spare_buffer = spares - broken
                element = (board, spare_buffer, broken, working,
                           spares, total)
                self._board_counts.append(element)
        return self._board_counts


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)
483
484
J. Richard Barnettef6839282015-06-01 16:00:35 -0700485def _sort_by_location(inventory_list):
486 """Return a list of DUTs, organized by location.
487
488 Take the given list of `HostJobHistory` objects, separate it
489 into a list per lab, and sort each lab's list by location. The
490 order of sorting within a lab is
491 * By row number within the lab,
492 * then by rack number within the row,
493 * then by host shelf number within the rack.
494
495 Return a list of the sorted lists.
496
497 Implementation note: host locations are sorted by converting
498 each location into a base 100 number. If row, rack or
499 host numbers exceed the range [0..99], then sorting will
500 break down.
501
502 @return A list of sorted lists of DUTs.
503
504 """
505 BASE = 100
506 lab_lists = {}
507 for history in inventory_list:
508 location = _HOSTNAME_PATTERN.match(history.host.hostname)
509 if location:
510 lab = location.group(1)
511 key = 0
512 for idx in location.group(2, 3, 4):
513 key = BASE * key + int(idx)
514 lab_lists.setdefault(lab, []).append((key, history))
515 return_list = []
516 for dut_list in lab_lists.values():
517 dut_list.sort(key=lambda t: t[0])
518 return_list.append([t[1] for t in dut_list])
519 return return_list
520
521
522def _score_repair_set(buffer_counts, repair_list):
523 """Return a numeric score rating a set of DUTs to be repaired.
524
525 `buffer_counts` is a dictionary mapping board names to the
526 size of the board's spares buffer.
527
528 `repair_list` is a list of DUTs to be repaired.
529
530 This function calculates the new set of buffer counts that would
531 result from the proposed repairs, and scores the new set using
532 two numbers:
533 * Worst case buffer count for any board (higher is better).
534 This is the more siginficant number for comparison.
535 * Number of boards at the worst case (lower is better). This
536 is the less significant number.
537
538 Implementation note: The score could fail to reflect the
539 intended criteria if there are more than 1000 boards in the
540 inventory.
541
542 @param spare_counts A dictionary mapping boards to buffer counts.
543 @param repair_list A list of boards to be repaired.
544 @return A numeric score.
545
546 """
547 # Go through `buffer_counts`, and create a list of new counts
548 # that records the buffer count for each board after repair.
549 # The new list of counts discards the board names, as they don't
550 # contribute to the final score.
551 _NBOARDS = 1000
552 repair_inventory = _LabInventory(repair_list)
553 new_counts = []
554 for b, c in buffer_counts.items():
555 if b in repair_inventory:
556 newcount = repair_inventory[b].get_total()
557 else:
558 newcount = 0
559 new_counts.append(c + newcount)
560 # Go through the new list of counts. Find the worst available
561 # spares count, and count how many times that worst case occurs.
562 worst_count = new_counts[0]
563 num_worst = 1
564 for c in new_counts[1:]:
565 if c == worst_count:
566 num_worst += 1
567 elif c < worst_count:
568 worst_count = c
569 num_worst = 1
570 # Return the calculated score
571 return _NBOARDS * worst_count - num_worst
572
573
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If the inventory has no broken DUTs with a recognizable
    location, the message contains only the header lines.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_counts = inventory.get_board_counts()
    # t[0] - board name
    # t[1] - size of spares buffer
    # t[2] - number of broken devices
    board_buffer_counts = {t[0]: t[1] for t in board_counts
                                    if t[2] != 0}
    recommendation = None
    best_score = None
    # N.B. The logic of this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    for lab_duts in _sort_by_location(inventory.get_broken_list()):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` DUTs along the sorted
        # list, keeping the best-scoring window for this lab.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    message = ['Repair recommendations:\n',
               '%-30s %-16s %s' % (
                       'Hostname', 'Board', 'Servo instructions')]
    # `recommendation` is still `None` if no lab had any broken
    # DUTs; report just the header in that case.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        if utils.host_is_in_lab_zone(servo_name):
            servo_message = 'Repair servo first'
        else:
            servo_message = 'No servo present'
        line = '%-30s %-16s %s' % (
                h.host.hostname, h.host_board, servo_message)
        message.append(line)
    return '\n'.join(message)
645
646
J. Richard Barnette96db3492015-03-27 17:23:52 -0700647def _generate_board_inventory_message(inventory):
648 """Generate the "board inventory" e-mail message.
649
650 The board inventory is a list by board summarizing the number
651 of working and broken DUTs, and the total shortfall or surplus
652 of working devices relative to the minimum critical pool
653 requirement.
654
655 The report omits boards with no DUTs in the spare pool or with
656 no DUTs in a critical pool.
657
658 N.B. For sample output text formattted as users can expect to
659 see it in e-mail and log files, refer to the unit tests.
660
661 @param inventory _LabInventory object with the inventory to
662 be reported on.
663 @return String with the inventory message to be sent.
664
665 """
666 logging.debug('Creating board inventory')
J. Richard Barnette1df6a562015-06-09 10:06:17 -0700667 message = ['Full board inventory:\n',
668 '%-22s %5s %5s %5s %5s %5s' % (
669 'Board', 'Avail', 'Bad', 'Good',
670 'Spare', 'Total')]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700671 data_list = inventory.get_board_counts()
J. Richard Barnette96db3492015-03-27 17:23:52 -0700672 data_list = sorted(sorted(data_list, key=lambda t: -t[2]),
673 key=lambda t: t[1])
674 message.extend(
J. Richard Barnette1df6a562015-06-09 10:06:17 -0700675 ['%-22s %5d %5d %5d %5d %5d' % t for t in data_list])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700676 return '\n'.join(message)
677
678
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700679_POOL_INVENTORY_HEADER = '''\
Aviv Keshet056d74c2015-07-14 09:18:43 -0700680Notice to Infrastructure deputies: All boards shown below are at
J. Richard Barnettec9a143c2015-06-04 11:11:19 -0700681less than full strength, please take action to resolve the issues.
682Once you're satisified that failures won't recur, failed DUTs can
683be replaced with spares by running `balance_pool`. Detailed
684instructions can be found here:
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700685 http://go/cros-manage-duts
686'''
687
688
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory is a list by pool and board summarizing the
    number of working and broken DUTs in the pool.  Only boards with
    at least one broken DUT are included in the list.  Within each
    pool, boards are listed with the most broken DUTs first.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    # Separator placed before each pool's section; empty for the
    # first pool so the header isn't followed by an extra blank line.
    newline = ''
    for pool in _CRITICAL_POOLS:
        message.append(
            '%sStatus for pool:%s, by board:' % (newline, pool))
        message.append(
            '%-20s %5s %5s %5s' % (
                'Board', 'Bad', 'Good', 'Total'))
        data_list = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken == 0:
                continue
            working = counts.get_working(pool)
            total = counts.get_total(pool)
            data_list.append((board, broken, working, total))
        if data_list:
            data_list = sorted(data_list, key=lambda d: -d[1])
            message.extend(
                ['%-20s %5d %5d %5d' % t for t in data_list])
        else:
            message.append('(All boards at full strength)')
        newline = '\n'
    return '\n'.join(message)
731
732
733def _send_email(arguments, tag, subject, recipients, body):
734 """Send an inventory e-mail message.
735
736 The message is logged in the selected log directory using `tag`
737 for the file name.
738
739 If the --print option was requested, the message is neither
740 logged nor sent, but merely printed on stdout.
741
742 @param arguments Parsed command-line options.
743 @param tag Tag identifying the inventory for logging
744 purposes.
745 @param subject E-mail Subject: header line.
746 @param recipients E-mail addresses for the To: header line.
747 @param body E-mail message body.
748
749 """
750 logging.debug('Generating email: "%s"', subject)
751 all_recipients = ', '.join(recipients)
752 report_body = '\n'.join([
753 'To: %s' % all_recipients,
754 'Subject: %s' % subject,
755 '', body, ''])
756 if arguments.print_:
757 print report_body
758 else:
759 filename = os.path.join(arguments.logdir, tag)
760 try:
761 report_file = open(filename, 'w')
762 report_file.write(report_body)
763 report_file.close()
764 except EnvironmentError as e:
765 logging.error('Failed to write %s: %s', filename, e)
766 try:
767 gmail_lib.send_email(all_recipients, subject, body)
768 except Exception as e:
769 logging.error('Failed to send e-mail to %s: %s',
770 all_recipients, e)
771
772
773def _separate_email_addresses(address_list):
774 """Parse a list of comma-separated lists of e-mail addresses.
775
776 @param address_list A list of strings containing comma
777 separate e-mail addresses.
778 @return A list of the individual e-mail addresses.
779
780 """
781 newlist = []
782 for arg in address_list:
783 newlist.extend([email.strip() for email in arg.split(',')])
784 return newlist
785
786
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--board-notify` and
    `--pool-notify` in separate option arguments into a single list.
    The `board_notify` and `pool_notify` attributes of `arguments`
    are replaced in place.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
801
802
def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default log directory is based on the parent directory
    containing this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    # Go up two levels:  first to the directory holding the script,
    # then to that directory's parent.
    basedir = os.path.dirname(os.path.abspath(script))
    basedir = os.path.dirname(basedir)
    return os.path.join(basedir, _LOGDIR)
816
817
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the ArgumentParser
    parse_args() method.  The `--board-notify` and `--pool-notify`
    values are post-processed by `_verify_arguments()` before the
    result is returned.

    @param argv Standard command line argument vector; argv[0] is
                assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args().

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help='number of hours back to search for status'
                             ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('--board-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate board inventory message, '
                        'and send it to the given e-mail address(es)')
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate pool inventory message, '
                             'and send it to the given address(es)')
    parser.add_argument('-r', '--recommend', type=int, default=None,
                        help=('Specify how many DUTs should be '
                              'recommended for repair (default: no '
                              'recommendation)'))
    # `print` is a keyword in Python 2, so the value is stored in
    # `print_` instead.
    parser.add_argument('--print', dest='print_', action='store_true',
                        help='Print e-mail messages on stdout '
                             'without sending them.')
    parser.add_argument('--logdir', default=_get_logdir(argv[0]),
                        help='Directory where logs will be written.')
    parser.add_argument('boardnames', nargs='*',
                        metavar='BOARD',
                        help='names of boards to report on '
                             '(default: all boards)')
    arguments = parser.parse_args(argv[1:])
    _verify_arguments(arguments)
    return arguments
861
862
863def _configure_logging(arguments):
864 """Configure the `logging` module for our needs.
865
866 How we log depends on whether the `--print` option was
867 provided on the command line. Without the option, we log all
868 messages at DEBUG level or above, and write them to a file in
869 the directory specified by the `--logdir` option. With the
870 option, we write log messages to stdout; messages below INFO
871 level are discarded.
872
873 The log file is configured to rotate once a week on Friday
874 evening, preserving ~3 months worth of history.
875
876 @param arguments Command-line arguments as returned by
877 `ArgumentParser`
878
879 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700880 root_logger = logging.getLogger()
J. Richard Barnette96db3492015-03-27 17:23:52 -0700881 if arguments.print_:
J. Richard Barnettef6839282015-06-01 16:00:35 -0700882 root_logger.setLevel(logging.INFO)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700883 handler = logging.StreamHandler(sys.stdout)
884 handler.setFormatter(logging.Formatter())
885 else:
J. Richard Barnettef6839282015-06-01 16:00:35 -0700886 root_logger.setLevel(logging.DEBUG)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700887 logfile = os.path.join(arguments.logdir, _LOGFILE)
888 handler = logging.handlers.TimedRotatingFileHandler(
889 logfile, when='W4', backupCount=13)
890 formatter = logging.Formatter(_LOG_FORMAT,
891 time_utils.TIME_FMT)
892 handler.setFormatter(formatter)
J. Richard Barnettef6839282015-06-01 16:00:35 -0700893 # TODO(jrbarnette) This is gross. Importing client.bin.utils
894 # implicitly imported logging_config, which calls
895 # logging.basicConfig() *at module level*. That gives us an
896 # extra logging handler that we don't want. So, clear out all
897 # the handlers here.
898 for h in root_logger.handlers:
899 root_logger.removeHandler(h)
900 root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700901
902
903def _populate_board_counts(inventory):
904 """Gather board counts while providing interactive feedback.
905
906 Gathering the status of all individual DUTs in the lab can take
907 considerable time (~30 minutes at the time of this writing).
908
909 Normally, we pay that cost by querying as we go. However, with
910 the `--print` option, a human being may be watching the
911 progress. So, we force the first (expensive) queries to happen
912 up front, and provide a small ASCII progress bar to give an
913 indicator of how many boards have been processed.
914
915 @param inventory _LabInventory object with the inventory to
916 be gathered.
917
918 """
919 n = 0
J. Richard Barnettef6839282015-06-01 16:00:35 -0700920 total_broken = 0
J. Richard Barnette96db3492015-03-27 17:23:52 -0700921 for counts in inventory.values():
922 n += 1
923 if n % 10 == 5:
924 c = '+'
925 elif n % 10 == 0:
926 c = '%d' % ((n / 10) % 10)
927 else:
928 c = '.'
929 sys.stdout.write(c)
930 sys.stdout.flush()
931 # This next call is where all the time goes - it forces all
932 # of a board's HostJobHistory objects to query the database
933 # and cache their results.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700934 total_broken += counts.get_broken()
J. Richard Barnette96db3492015-03-27 17:23:52 -0700935 sys.stdout.write('\n')
J. Richard Barnettef6839282015-06-01 16:00:35 -0700936 sys.stdout.write('Found %d broken DUTs\n' % total_broken)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700937
938
def main(argv):
    """Standard main routine.

    Parses the command line and configures logging, then builds the
    lab inventory for the time window that ends now and reaches back
    `--duration` hours.  A board inventory message (optionally
    preceded by repair recommendations) and a pool inventory message
    are each generated and passed to `_send_email()` when `--print`
    was given or when the corresponding notify list is non-empty.

    A KeyboardInterrupt during the run is silently ignored; any other
    exception is logged with its traceback instead of propagating.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        seconds_back = arguments.duration * 60 * 60
        start_time = end_time - seconds_back
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify and arguments.recommend:
            logging.debug('Will include repair recommendations')
        if arguments.board_notify:
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        num_duts = inventory.get_num_duts()
        num_boards = inventory.get_num_boards()
        logging.info('Found %d hosts across %d boards',
                     num_duts, num_boards)

        # With --print, do the slow per-board queries up front so
        # the user sees a progress bar before any message text.
        if arguments.print_:
            _populate_board_counts(inventory)

        if arguments.print_ or arguments.board_notify:
            recommend_message = (
                    _generate_repair_recommendation(
                            inventory, arguments.recommend) + '\n\n\n'
                    if arguments.recommend else '')
            board_message = _generate_board_inventory_message(inventory)
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        recommend_message + board_message)

        if arguments.print_ or arguments.pool_notify:
            pool_message = _generate_pool_inventory_message(inventory)
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        pool_message)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as err:
        logging.exception('Unexpected OS error: %s', err)
    except Exception as err:
        logging.exception('Unexpected exception: %s', err)
994
995
if __name__ == '__main__':
    # Pass the complete argument vector: main() expects argv[0] to be
    # the command name.
    main(argv=sys.argv)