blob: 476f5419ed40d078f0a560a733980ccafab6f1e8 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage: lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
20--board-notify <address>[,<address>]
21 Send the "board status" e-mail to all the specified e-mail
22 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
28--logdir <directory>
29 Log progress and actions in a file under this directory. Text
30 of any e-mail sent will also be logged in a timestamped file in
31 this directory.
32
33--print
34 Suppress all logging and sending e-mail. Instead, write the
35 output that would be generated onto stdout.
36
37<board> arguments:
38 With no arguments, gathers the status for all boards in the lab.
39 With one or more named boards on the command line, restricts
40 reporting to just those boards.
41
42"""
43
44
45import argparse
46import logging
47import logging.handlers
48import os
49import sys
50import time
51
52import common
53from autotest_lib.client.common_lib import time_utils
54from autotest_lib.server import frontend
55from autotest_lib.site_utils import gmail_lib
56from autotest_lib.site_utils import status_history
57from autotest_lib.site_utils.suite_scheduler import constants
58
59
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing. That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The list of all the general purpose pools
#     monitored by this script (the critical pools followed by the
#     spare pool).

_CRITICAL_POOLS = ['bvt', 'cq']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
83
84
# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time, in hours, to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24
91
92
# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option. The full path is relative to the
#     root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
107
108
109class _PoolCounts(object):
110 """Maintains a set of `HostJobHistory` objects for a pool.
111
112 The collected history objects are nominally all part of a single
113 scheduling pool of DUTs. The collection maintains a count of
114 working DUTs, a count of broken DUTs, and a total count.
115
116 Performance note: The methods `get_working()` and
117 `get_broken()` (but not `get_total()`) are potentially
118 expensive. The first time they're called, they must make a
119 potentially expensive set of database queries. The results of
120 the queries are cached in the individual `HostJobHistory`
121 objects, so only the first call actually pays the cost.
122
123 This class is deliberately constructed to delay that cost until
124 the accessor methods are called (rather than to query in
125 `record_host()`) so that it's possible to construct a complete
126 `_LabInventory` without making the expensive queries at creation
127 time. `_populate_board_counts()`, below, relies on this
128 behavior.
129
130 """
131
132 def __init__(self):
133 self._histories = []
134
135
136 def record_host(self, host_history):
137 """Add one `HostJobHistory` object to the collection.
138
139 @param host_history The `HostJobHistory` object to be
140 remembered.
141
142 """
143 self._histories.append(host_history)
144
145
146 def get_working(self):
147 """Return the number of working DUTs in the collection."""
148 return len([h for h in self._histories
149 if h.last_diagnosis()[0] == status_history.WORKING])
150
151
152 def get_broken(self):
153 """Return the number of broken DUTs in the collection."""
154 return len([h for h in self._histories
155 if h.last_diagnosis()[0] != status_history.WORKING])
156
157
158 def get_total(self):
159 """Return the total number of DUTs in the collection."""
160 return len(self._histories)
161
162
163class _BoardCounts(object):
164 """Maintains a set of `HostJobHistory` objects for a board.
165
166 The collected history objects are nominally all of the same
167 board. The collection maintains a count of working DUTs, a
168 count of broken DUTs, and a total count. The counts can be
169 obtained either for a single pool, or as a total across all
170 pools.
171
172 DUTs in the collection must be assigned to one of the pools
173 in `_MANAGED_POOLS`.
174
175 The `get_working()` and `get_broken()` methods rely on the
176 methods of the same name in _PoolCounts, so the performance
177 note in _PoolCounts applies here as well.
178
179 """
180
181 def __init__(self):
182 self._pools = {
183 pool: _PoolCounts() for pool in _MANAGED_POOLS
184 }
185
186 def record_host(self, host_history):
187 """Add one `HostJobHistory` object to the collection.
188
189 @param host_history The `HostJobHistory` object to be
190 remembered.
191
192 """
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700193 pool = host_history.host_pool
J. Richard Barnette96db3492015-03-27 17:23:52 -0700194 self._pools[pool].record_host(host_history)
195
196
197 def _count_pool(self, get_pool_count, pool=None):
198 """Internal helper to count hosts in a given pool.
199
200 The `get_pool_count` parameter is a function to calculate
201 the exact count of interest for the pool.
202
203 @param get_pool_count Function to return a count from a
204 _PoolCount object.
205 @param pool The pool to be counted. If `None`,
206 return the total across all pools.
207
208 """
209 if pool is None:
210 return sum([get_pool_count(counts)
211 for counts in self._pools.values()])
212 else:
213 return get_pool_count(self._pools[pool])
214
215
216 def get_working(self, pool=None):
217 """Return the number of working DUTs in a pool.
218
219 @param pool The pool to be counted. If `None`, return the
220 total across all pools.
221
222 """
223 return self._count_pool(_PoolCounts.get_working, pool)
224
225
226 def get_broken(self, pool=None):
227 """Return the number of broken DUTs in a pool.
228
229 @param pool The pool to be counted. If `None`, return the
230 total across all pools.
231
232 """
233 return self._count_pool(_PoolCounts.get_broken, pool)
234
235
236 def get_total(self, pool=None):
237 """Return the total number of DUTs in a pool.
238
239 @param pool The pool to be counted. If `None`, return the
240 total across all pools.
241
242 """
243 return self._count_pool(_PoolCounts.get_total, pool)
244
245
246class _LabInventory(dict):
247 """Collection of `HostJobHistory` objects for the Lab's inventory.
248
249 The collection is indexed by board. Indexing returns the
250 _BoardCounts object associated with the board.
251
252 The collection is also iterable. The iterator returns all the
253 boards in the inventory, in unspecified order.
254
255 """
256
257 @classmethod
258 def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
259 """Return a Lab inventory with specified parameters.
260
261 By default, gathers inventory from `HostJobHistory` objects
262 for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
263 is supplied, the inventory will be restricted to only the
264 given boards.
265
266 @param afe AFE object for constructing the
267 `HostJobHistory` objects.
268 @param start_time Start time for the `HostJobHistory`
269 objects.
270 @param end_time End time for the `HostJobHistory`
271 objects.
272 @param boardlist List of boards to include. If empty,
273 include all available boards.
274 @return A `_LabInventory` object for the specified boards.
275
276 """
277 label_list = [constants.Labels.POOL_PREFIX + l
278 for l in _MANAGED_POOLS]
279 afehosts = afe.get_hosts(labels__name__in=label_list)
280 if boardlist:
281 boardhosts = []
282 for board in boardlist:
283 board_label = constants.Labels.BOARD_PREFIX + board
284 host_list = [h for h in afehosts
285 if board_label in h.labels]
286 boardhosts.extend(host_list)
287 afehosts = boardhosts
288 create = lambda host: (
289 status_history.HostJobHistory(afe, host,
290 start_time, end_time))
291 return cls([create(host) for host in afehosts])
292
293
294 def __init__(self, histories):
J. Richard Barnette6948ed32015-05-06 08:57:10 -0700295 # N.B. The query that finds our hosts is restricted to those
296 # with a valid pool: label, but doesn't check for a valid
297 # board: label. In some (insufficiently) rare cases, the
298 # AFE hosts table has been known to (incorrectly) have DUTs
299 # with a pool: but no board: label. We explicitly exclude
300 # those here.
301 histories = [h for h in histories
302 if h.host_board is not None]
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700303 boards = set([h.host_board for h in histories])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700304 initval = { board: _BoardCounts() for board in boards }
305 super(_LabInventory, self).__init__(initval)
306 self._dut_count = len(histories)
307 for h in histories:
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700308 self[h.host_board].record_host(h)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700309
310
311 def get_num_duts(self):
312 """Return the total number of DUTs in the inventory."""
313 return self._dut_count
314
315
316 def get_num_boards(self):
317 """Return the total number of boards in the inventory."""
318 return len(self)
319
320
321def _generate_board_inventory_message(inventory):
322 """Generate the "board inventory" e-mail message.
323
324 The board inventory is a list by board summarizing the number
325 of working and broken DUTs, and the total shortfall or surplus
326 of working devices relative to the minimum critical pool
327 requirement.
328
329 The report omits boards with no DUTs in the spare pool or with
330 no DUTs in a critical pool.
331
332 N.B. For sample output text formattted as users can expect to
333 see it in e-mail and log files, refer to the unit tests.
334
335 @param inventory _LabInventory object with the inventory to
336 be reported on.
337 @return String with the inventory message to be sent.
338
339 """
340 logging.debug('Creating board inventory')
341 message = []
342 message.append(
343 '%-20s %5s %5s %5s %5s %5s' % (
344 'Board', 'Avail', 'Bad', 'Good', 'Spare', 'Total'))
345 data_list = []
346 for board, counts in inventory.items():
347 logging.debug('Counting inventory for %s', board)
348 spares = counts.get_total(_SPARE_POOL)
349 total = counts.get_total()
350 if spares == 0 or spares == total:
351 continue
352 working = counts.get_working()
353 broken = counts.get_broken()
354 buffer = spares - broken
355 data_list.append((board, buffer, broken, working, spares, total))
356 data_list = sorted(sorted(data_list, key=lambda t: -t[2]),
357 key=lambda t: t[1])
358 message.extend(
359 ['%-20s %5d %5d %5d %5d %5d' % t for t in data_list])
360 return '\n'.join(message)
361
362
# _POOL_INVENTORY_HEADER:
#     Fixed preamble used as the first element of the "pool
#     inventory" message, ahead of the per-pool status tables.

_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputy: If there are shortages below,
please take action to resolve them. If it's safe, you should
balance shortages by running `balance_pool` or `freon_swap` as
necessary. Detailed instructions can be found here:
 http://go/cros-manage-duts
'''
370
371
def _generate_pool_inventory_message(inventory):
    """Build the text of the "pool inventory" e-mail.

    The message starts with `_POOL_INVENTORY_HEADER`, followed by one
    section per pool in `_CRITICAL_POOLS`. Each section lists, by
    board, the number of broken and working DUTs and the total for
    that pool, with the boards having the most broken DUTs first.
    Boards with no broken DUTs in the pool are left out; if that
    leaves a section empty, a one-line "full strength" note is
    printed instead.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index > 0 else ''
        lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
                '%-20s %5s %5s %5s' % ('Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken != 0:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((board, broken, working, total))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        lines.extend('%-20s %5d %5d %5d' % row for row in rows)
    return '\n'.join(lines)
414
415
416def _send_email(arguments, tag, subject, recipients, body):
417 """Send an inventory e-mail message.
418
419 The message is logged in the selected log directory using `tag`
420 for the file name.
421
422 If the --print option was requested, the message is neither
423 logged nor sent, but merely printed on stdout.
424
425 @param arguments Parsed command-line options.
426 @param tag Tag identifying the inventory for logging
427 purposes.
428 @param subject E-mail Subject: header line.
429 @param recipients E-mail addresses for the To: header line.
430 @param body E-mail message body.
431
432 """
433 logging.debug('Generating email: "%s"', subject)
434 all_recipients = ', '.join(recipients)
435 report_body = '\n'.join([
436 'To: %s' % all_recipients,
437 'Subject: %s' % subject,
438 '', body, ''])
439 if arguments.print_:
440 print report_body
441 else:
442 filename = os.path.join(arguments.logdir, tag)
443 try:
444 report_file = open(filename, 'w')
445 report_file.write(report_body)
446 report_file.close()
447 except EnvironmentError as e:
448 logging.error('Failed to write %s: %s', filename, e)
449 try:
450 gmail_lib.send_email(all_recipients, subject, body)
451 except Exception as e:
452 logging.error('Failed to send e-mail to %s: %s',
453 all_recipients, e)
454
455
456def _separate_email_addresses(address_list):
457 """Parse a list of comma-separated lists of e-mail addresses.
458
459 @param address_list A list of strings containing comma
460 separate e-mail addresses.
461 @return A list of the individual e-mail addresses.
462
463 """
464 newlist = []
465 for arg in address_list:
466 newlist.extend([email.strip() for email in arg.split(',')])
467 return newlist
468
469
def _verify_arguments(arguments):
    """Normalize the parsed command-line arguments in place.

    The `--board-notify` and `--pool-notify` options may each be
    given several times, and each occurrence may hold several
    comma-separated addresses. Both are flattened here into a single
    list of individual e-mail addresses.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    for option in ('board_notify', 'pool_notify'):
        raw_values = getattr(arguments, option)
        setattr(arguments, option,
                _separate_email_addresses(raw_values))
484
485
def _get_logdir(script):
    """Compute the default directory for the `--logdir` option.

    The result is `_LOGDIR` placed under the parent of the directory
    that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
499
500
def _parse_command(argv):
    """Parse the command line arguments.

    Builds the argument parser describing this command's options,
    runs it over the command line, and post-processes the result
    with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args().

    """
    command_name = argv[0]
    parser = argparse.ArgumentParser(
            prog=command_name,
            description='Gather and report lab inventory statistics')

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help=duration_help)

    board_notify_help = ('Generate board inventory message, '
                         'and send it to the given e-mail address(es)')
    parser.add_argument('--board-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=board_notify_help)

    pool_notify_help = ('Generate pool inventory message, '
                        'and send it to the given address(es)')
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=pool_notify_help)

    print_help = ('Print e-mail messages on stdout '
                  'without sending them.')
    parser.add_argument('--print', dest='print_', action='store_true',
                        help=print_help)

    parser.add_argument('--logdir', default=_get_logdir(command_name),
                        help='Directory where logs will be written.')

    boards_help = ('names of boards to report on '
                   '(default: all boards)')
    parser.add_argument('boardnames', nargs='*',
                        metavar='BOARD',
                        help=boards_help)

    parsed = parser.parse_args(argv[1:])
    _verify_arguments(parsed)
    return parsed
540
541
542def _configure_logging(arguments):
543 """Configure the `logging` module for our needs.
544
545 How we log depends on whether the `--print` option was
546 provided on the command line. Without the option, we log all
547 messages at DEBUG level or above, and write them to a file in
548 the directory specified by the `--logdir` option. With the
549 option, we write log messages to stdout; messages below INFO
550 level are discarded.
551
552 The log file is configured to rotate once a week on Friday
553 evening, preserving ~3 months worth of history.
554
555 @param arguments Command-line arguments as returned by
556 `ArgumentParser`
557
558 """
559 if arguments.print_:
560 logging.getLogger().setLevel(logging.INFO)
561 handler = logging.StreamHandler(sys.stdout)
562 handler.setFormatter(logging.Formatter())
563 else:
564 logging.getLogger().setLevel(logging.DEBUG)
565 logfile = os.path.join(arguments.logdir, _LOGFILE)
566 handler = logging.handlers.TimedRotatingFileHandler(
567 logfile, when='W4', backupCount=13)
568 formatter = logging.Formatter(_LOG_FORMAT,
569 time_utils.TIME_FMT)
570 handler.setFormatter(formatter)
571 logging.getLogger().addHandler(handler)
572
573
574def _populate_board_counts(inventory):
575 """Gather board counts while providing interactive feedback.
576
577 Gathering the status of all individual DUTs in the lab can take
578 considerable time (~30 minutes at the time of this writing).
579
580 Normally, we pay that cost by querying as we go. However, with
581 the `--print` option, a human being may be watching the
582 progress. So, we force the first (expensive) queries to happen
583 up front, and provide a small ASCII progress bar to give an
584 indicator of how many boards have been processed.
585
586 @param inventory _LabInventory object with the inventory to
587 be gathered.
588
589 """
590 n = 0
591 for counts in inventory.values():
592 n += 1
593 if n % 10 == 5:
594 c = '+'
595 elif n % 10 == 0:
596 c = '%d' % ((n / 10) % 10)
597 else:
598 c = '.'
599 sys.stdout.write(c)
600 sys.stdout.flush()
601 # This next call is where all the time goes - it forces all
602 # of a board's HostJobHistory objects to query the database
603 # and cache their results.
604 counts.get_working()
605 sys.stdout.write('\n')
606
607
def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, gathers the lab
    inventory for the requested time window, and then generates and
    sends (or, with `--print`, prints) the board and pool inventory
    reports that were requested.

    A keyboard interrupt ends the run quietly; any other exception
    is logged rather than propagated.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        seconds_back = arguments.duration * 60 * 60
        start_time = end_time - seconds_back
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend.AFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.print_:
            _populate_board_counts(inventory)

        # One entry per report: log file name format, subject format,
        # recipients, and the function generating the message body.
        reports = [
            ('boards-%s.txt', 'DUT board inventory %s',
             arguments.board_notify,
             _generate_board_inventory_message),
            ('pools-%s.txt', 'DUT pool inventory %s',
             arguments.pool_notify,
             _generate_pool_inventory_message),
        ]
        for tag_format, subject_format, recipients, generate in reports:
            # With --print every report is produced; otherwise only
            # those that have at least one recipient.
            if not (arguments.print_ or recipients):
                continue
            _send_email(arguments,
                        tag_format % timestamp,
                        subject_format % timestamp,
                        recipients,
                        generate(inventory))
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
654
655
# Run the report only when executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv)