blob: 182a62fda8e916898deac2932749a6c060ecbf37 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage: lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
20--board-notify <address>[,<address>]
21 Send the "board status" e-mail to all the specified e-mail
22 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
28--logdir <directory>
29 Log progress and actions in a file under this directory. Text
30 of any e-mail sent will also be logged in a timestamped file in
31 this directory.
32
33--print
34 Suppress all logging and sending e-mail. Instead, write the
35 output that would be generated onto stdout.
36
37<board> arguments:
38 With no arguments, gathers the status for all boards in the lab.
39 With one or more named boards on the command line, restricts
40 reporting to just those boards.
41
42"""
43
44
45import argparse
46import logging
47import logging.handlers
48import os
49import sys
50import time
51
52import common
53from autotest_lib.client.common_lib import time_utils
54from autotest_lib.server import frontend
55from autotest_lib.site_utils import gmail_lib
56from autotest_lib.site_utils import status_history
57from autotest_lib.site_utils.suite_scheduler import constants
58
59
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing. That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The set of all the general purpose pools
#     monitored by this script; the critical pools followed by the
#     spare pool.

_CRITICAL_POOLS = ['bvt', 'cq']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
83
84
# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time, in hours, to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24
91
92
# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option. The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
107
108
class _PoolCounts(object):
    """Collection of the `HostJobHistory` objects for one pool.

    The history objects gathered here nominally all belong to the
    same scheduling pool of DUTs. The collection can report how
    many of its DUTs are working, how many are broken, and how many
    there are in total.

    Performance note: `get_working()` and `get_broken()` (but not
    `get_total()`) are potentially expensive. On first use they
    must make a potentially expensive set of database queries. The
    query results are cached in the individual `HostJobHistory`
    objects, so only the first call actually pays the cost.

    The cost is deliberately deferred to the accessor methods
    (rather than being paid in `record_host()`) so that a complete
    `_LabInventory` can be built without making the expensive
    queries at creation time. `_populate_board_counts()`, below,
    relies on this behavior.

    """

    def __init__(self):
        self._histories = []


    def record_host(self, host_history):
        """Remember one more `HostJobHistory` object.

        @param host_history  The `HostJobHistory` object to be
                             added to the collection.

        """
        self._histories.append(host_history)


    def get_working(self):
        """Return the number of working DUTs in the collection."""
        working = 0
        for history in self._histories:
            if history.last_diagnosis()[0] == status_history.WORKING:
                working += 1
        return working


    def get_broken(self):
        """Return the number of broken DUTs in the collection.

        Any DUT whose last diagnosis is something other than
        `status_history.WORKING` is counted as broken.

        """
        broken = 0
        for history in self._histories:
            if history.last_diagnosis()[0] != status_history.WORKING:
                broken += 1
        return broken


    def get_total(self):
        """Return the total number of DUTs in the collection."""
        return len(self._histories)
161
162
class _BoardCounts(object):
    """Collection of the `HostJobHistory` objects for one board.

    The history objects gathered here nominally all describe DUTs
    of the same board. The collection reports a count of working
    DUTs, a count of broken DUTs, and a total count, either for one
    named pool or summed over every pool.

    Every DUT recorded must be assigned to one of the pools in
    `_MANAGED_POOLS`.

    `get_working()` and `get_broken()` delegate to the methods of
    the same name in `_PoolCounts`, so the performance note in
    `_PoolCounts` applies here as well.

    """

    def __init__(self):
        # One (initially empty) _PoolCounts per managed pool.
        self._pools = {}
        for pool_name in _MANAGED_POOLS:
            self._pools[pool_name] = _PoolCounts()

    def record_host(self, host_history):
        """Remember one more `HostJobHistory` object.

        The history is filed under the pool reported by its
        `get_host_pool()` method.

        @param host_history  The `HostJobHistory` object to be
                             added to the collection.

        """
        pool_counts = self._pools[host_history.get_host_pool()]
        pool_counts.record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        `get_pool_count` is the function that extracts the count of
        interest from a single pool's collection.

        @param get_pool_count  Function to return a count from a
                               _PoolCount object.
        @param pool            The pool to be counted. If `None`,
                               return the total across all pools.

        """
        if pool is not None:
            return get_pool_count(self._pools[pool])
        return sum(get_pool_count(pool_counts)
                   for pool_counts in self._pools.values())


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_working, pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_broken, pool)


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        """
        return self._count_pool(_PoolCounts.get_total, pool)
244
245
class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board. Indexing returns the
    _BoardCounts object associated with the board.

    The collection is also iterable. The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=()):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        Each host is included at most once, even if a board name is
        repeated in `boardlist` or a host carries the labels of more
        than one requested board.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   Sequence of boards to include. If empty,
                           include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        pool_labels = [constants.Labels.POOL_PREFIX + pool
                       for pool in _MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=pool_labels)
        if boardlist:
            # Collapse repeated board names, then filter in a single
            # pass so that no host can be selected twice.
            board_labels = set(constants.Labels.BOARD_PREFIX + board
                               for board in boardlist)
            afehosts = [host for host in afehosts
                        if any(label in host.labels
                               for label in board_labels)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        """Build the inventory from a list of history objects.

        @param histories  List of `HostJobHistory` objects; each is
                          recorded under the board reported by its
                          `get_host_board()` method.

        """
        super(_LabInventory, self).__init__()
        self._dut_count = len(histories)
        for history in histories:
            board = history.get_host_board()
            if board not in self:
                self[board] = _BoardCounts()
            self[board].record_host(history)


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)
311
312
def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory lists, for each board, the number of
    working and broken DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Boards with no DUTs in the spare pool, or with no DUTs in a
    critical pool, are left out of the report.

    Rows are ordered by ascending "Avail" value; rows with the same
    "Avail" value are ordered by descending "Bad" count.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    header = '%-20s %5s %5s %5s %5s %5s' % (
            'Board', 'Avail', 'Bad', 'Good', 'Spare', 'Total')
    rows = []
    for board, counts in inventory.items():
        logging.debug('Counting inventory for %s', board)
        spares = counts.get_total(_SPARE_POOL)
        total = counts.get_total()
        if spares != 0 and spares != total:
            working = counts.get_working()
            broken = counts.get_broken()
            available = spares - broken
            rows.append(
                    (board, available, broken, working, spares, total))
    # One stable sort on a compound key: fewest available first,
    # and among equals, most broken first.
    rows.sort(key=lambda row: (row[1], -row[2]))
    lines = [header]
    for row in rows:
        lines.append('%-20s %5d %5d %5d %5d %5d' % row)
    return '\n'.join(lines)
353
354
# Preamble placed at the start of the "pool inventory" e-mail
# message, ahead of the per-pool status tables.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputy: If there are shortages below,
please take action to resolve them. If it's safe, you should
balance shortages by running `balance_pool` or `freon_swap` as
necessary. Detailed instructions can be found here:
 http://go/cros-manage-duts
'''
362
363
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory lists, for each critical pool, the boards in
    that pool together with their counts of working and broken
    DUTs. Only boards with at least one broken DUT in the pool are
    listed, most broken first. A pool with no such boards gets a
    single "full strength" line instead.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every pool section after the first is set off from the
        # previous one by an empty line.
        separator = '\n' if index else ''
        lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
                '%-20s %5s %5s %5s' % ('Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken != 0:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((board, broken, working, total))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        for row in rows:
            lines.append('%-20s %5d %5d %5d' % row)
    return '\n'.join(lines)
406
407
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name, and then sent. Failure to write the log
    file is logged as an error but does not prevent the message
    from being sent; failure to send is likewise only logged.

    If the --print option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.print_:
        # The parenthesized form is valid both as a Python 2 print
        # statement and as a Python 3 print() call.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` closes the file even when write() raises, so a
        # failed write no longer leaks the open handle.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Deliberately broad: sending is best-effort, and any
        # failure should be logged rather than abort the run.
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
446
447
def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Surrounding whitespace is stripped from each address. Empty
    entries, such as those produced by a trailing or doubled comma
    or by an empty option value, are discarded so that they never
    reach the recipient list.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    addresses = []
    for arg in address_list:
        for piece in arg.split(','):
            address = piece.strip()
            if address:
                addresses.append(address)
    return addresses
460
461
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Replace the `board_notify` and `pool_notify` attributes, each a
    list of possibly comma separated option values, with a single
    flat list of individual e-mail addresses.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    for option in ('board_notify', 'pool_notify'):
        raw_values = getattr(arguments, option)
        setattr(arguments, option,
                _separate_email_addresses(raw_values))
476
477
def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` relative path, taken from the
    parent of the directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
491
492
def _parse_command(argv):
    """Parse the command line arguments.

    Build the argument parser for this command's syntax, parse the
    command line with it, normalize the e-mail address options via
    `_verify_arguments()`, and return the parsed result.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args().

    """
    command_name = argv[0]
    parser = argparse.ArgumentParser(
            prog=command_name,
            description='Gather and report lab inventory statistics')

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help=duration_help)

    board_help = ('Generate board inventory message, '
                  'and send it to the given e-mail address(es)')
    parser.add_argument('--board-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=board_help)

    pool_help = ('Generate pool inventory message, '
                 'and send it to the given address(es)')
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=pool_help)

    # `print` is a keyword in Python 2, hence the `print_`
    # destination name.
    print_help = ('Print e-mail messages on stdout '
                  'without sending them.')
    parser.add_argument('--print', dest='print_', action='store_true',
                        help=print_help)

    parser.add_argument('--logdir', default=_get_logdir(command_name),
                        help='Directory where logs will be written.')

    boards_help = ('names of boards to report on '
                   '(default: all boards)')
    parser.add_argument('boardnames', nargs='*',
                        metavar='BOARD',
                        help=boards_help)

    parsed = parser.parse_args(argv[1:])
    _verify_arguments(parsed)
    return parsed
532
533
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--print` option was
    provided on the command line. Without the option, the root
    logger accepts messages at DEBUG level or above, and writes
    them to a file in the directory given by the `--logdir` option.
    With the option, log messages go to stdout, and messages below
    INFO level are discarded.

    The log file is set up to rotate weekly (`when='W4'`, i.e.
    on Friday), keeping 13 backups for ~3 months worth of history.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if not arguments.print_:
        root_logger.setLevel(logging.DEBUG)
        log_path = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                log_path, when='W4', backupCount=13)
        handler.setFormatter(
                logging.Formatter(_LOG_FORMAT, time_utils.TIME_FMT))
    else:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    root_logger.addHandler(handler)
564
565
def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go. However, with
    the `--print` option, a human being may be watching the
    progress. So, we force the first (expensive) queries to happen
    up front, and write a small ASCII progress bar to stdout to
    show how many boards have been processed: one character per
    board, with `+` for every board number ending in 5, a digit
    (the tens digit) for every 10th board, and `.` otherwise.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    for position, counts in enumerate(inventory.values(), 1):
        last_digit = position % 10
        if last_digit == 5:
            marker = '+'
        elif last_digit == 0:
            marker = '%d' % ((position // 10) % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        counts.get_working()
    sys.stdout.write('\n')
598
599
def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, builds the lab
    inventory for the requested time window, and then generates
    and sends (or, with `--print`, prints) the board and pool
    inventory reports that were asked for.

    A keyboard interrupt ends the run quietly; any other exception
    raised while gathering or reporting is logged, not propagated.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        duration_seconds = arguments.duration * 60 * 60
        start_time = end_time - duration_seconds
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend.AFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.print_:
            _populate_board_counts(inventory)

        # Each report: (log file name format, subject format,
        # recipients, message generator). A report is produced
        # when printing, or when it has at least one recipient.
        reports = (
            ('boards-%s.txt', 'DUT board inventory %s',
             arguments.board_notify,
             _generate_board_inventory_message),
            ('pools-%s.txt', 'DUT pool inventory %s',
             arguments.pool_notify,
             _generate_pool_inventory_message),
        )
        for tag_format, subject_format, recipients, generate in reports:
            if not (arguments.print_ or recipients):
                continue
            _send_email(arguments,
                        tag_format % timestamp,
                        subject_format % timestamp,
                        recipients,
                        generate(inventory))
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
646
647
# Run the inventory only when executed as a script, passing the
# complete argument vector (including argv[0]) to main().
if __name__ == '__main__':
    main(sys.argv)