Blame - site_utils/lab_inventory.py - platform/external/autotest

blob: 476f5419ed40d078f0a560a733980ccafab6f1e8 [file] [log] [blame]

J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1	#!/usr/bin/env python
				2	# Copyright 2015 The Chromium OS Authors. All rights reserved.
				3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Create e-mail reports of the Lab's DUT inventory.
				7
				8	Gathers a list of all DUTs of interest in the Lab, segregated by
				9	board and pool, and determines whether each DUT is working or
				10	broken. Then, send one or more e-mail reports summarizing the
				11	status to e-mail addresses provided on the command line.
				12
				13	usage: lab_inventory.py [ options ] [ board ... ]
				14
				15	Options:
				16	--duration / -d <hours>
				17	How far back in time to search job history to determine DUT
				18	status.
				19
				20	--board-notify <address>[,<address>]
				21	Send the "board status" e-mail to all the specified e-mail
				22	addresses.
				23
				24	--pool-notify <address>[,<address>]
				25	Send the "pool status" e-mail to all the specified e-mail
				26	addresses.
				27
				28	--logdir <directory>
				29	Log progress and actions in a file under this directory. Text
				30	of any e-mail sent will also be logged in a timestamped file in
				31	this directory.
				32
				33	--print
				34	Suppress all logging and sending e-mail. Instead, write the
				35	output that would be generated onto stdout.
				36
				37	<board> arguments:
				38	With no arguments, gathers the status for all boards in the lab.
				39	With one or more named boards on the command line, restricts
				40	reporting to just those boards.
				41
				42	"""
				43
				44
				45	import argparse
				46	import logging
				47	import logging.handlers
				48	import os
				49	import sys
				50	import time
				51
				52	import common
				53	from autotest_lib.client.common_lib import time_utils
				54	from autotest_lib.server import frontend
				55	from autotest_lib.site_utils import gmail_lib
				56	from autotest_lib.site_utils import status_history
				57	from autotest_lib.site_utils.suite_scheduler import constants
				58
				59
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing. That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The set of all the general purpose pools
#     monitored by this script.

_CRITICAL_POOLS = ['bvt', 'cq']
_SPARE_POOL = 'suites'
# Critical pools first, then the spare pool.
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
				83
				84
# _DEFAULT_DURATION:
#     Default value, in hours, used for the --duration command line
#     option. Specifies how far back in time to search in order to
#     determine DUT status.

_DEFAULT_DURATION = 24
				91
				92
				93	# _LOGDIR:
				94	# Relative path used in the calculation of the default setting
				95	# for the --logdir option. The full path path is relative to
				96	# the root of the autotest directory, as determined from
				97	# sys.argv[0].
				98	# _LOGFILE:
				99	# Basename of a file to which general log information will be
				100	# written.
				101	# _LOG_FORMAT:
				102	# Format string for log messages.
				103
				104	_LOGDIR = os.path.join('logs', 'dut-data')
				105	_LOGFILE = 'lab-inventory.log'
				106	_LOG_FORMAT = '%(asctime)s \| %(levelname)-10s \| %(message)s'
				107
				108
				109	class _PoolCounts(object):
				110	"""Maintains a set of `HostJobHistory` objects for a pool.
				111
				112	The collected history objects are nominally all part of a single
				113	scheduling pool of DUTs. The collection maintains a count of
				114	working DUTs, a count of broken DUTs, and a total count.
				115
				116	Performance note: The methods `get_working()` and
				117	`get_broken()` (but not `get_total()`) are potentially
				118	expensive. The first time they're called, they must make a
				119	potentially expensive set of database queries. The results of
				120	the queries are cached in the individual `HostJobHistory`
				121	objects, so only the first call actually pays the cost.
				122
				123	This class is deliberately constructed to delay that cost until
				124	the accessor methods are called (rather than to query in
				125	`record_host()`) so that it's possible to construct a complete
				126	`_LabInventory` without making the expensive queries at creation
				127	time. `_populate_board_counts()`, below, relies on this
				128	behavior.
				129
				130	"""
				131
				132	def __init__(self):
				133	self._histories = []
				134
				135
				136	def record_host(self, host_history):
				137	"""Add one `HostJobHistory` object to the collection.
				138
				139	@param host_history The `HostJobHistory` object to be
				140	remembered.
				141
				142	"""
				143	self._histories.append(host_history)
				144
				145
				146	def get_working(self):
				147	"""Return the number of working DUTs in the collection."""
				148	return len([h for h in self._histories
				149	if h.last_diagnosis()[0] == status_history.WORKING])
				150
				151
				152	def get_broken(self):
				153	"""Return the number of broken DUTs in the collection."""
				154	return len([h for h in self._histories
				155	if h.last_diagnosis()[0] != status_history.WORKING])
				156
				157
				158	def get_total(self):
				159	"""Return the total number of DUTs in the collection."""
				160	return len(self._histories)
				161
				162
				163	class _BoardCounts(object):
				164	"""Maintains a set of `HostJobHistory` objects for a board.
				165
				166	The collected history objects are nominally all of the same
				167	board. The collection maintains a count of working DUTs, a
				168	count of broken DUTs, and a total count. The counts can be
				169	obtained either for a single pool, or as a total across all
				170	pools.
				171
				172	DUTs in the collection must be assigned to one of the pools
				173	in `_MANAGED_POOLS`.
				174
				175	The `get_working()` and `get_broken()` methods rely on the
				176	methods of the same name in _PoolCounts, so the performance
				177	note in _PoolCounts applies here as well.
				178
				179	"""
				180
				181	def __init__(self):
				182	self._pools = {
				183	pool: _PoolCounts() for pool in _MANAGED_POOLS
				184	}
				185
				186	def record_host(self, host_history):
				187	"""Add one `HostJobHistory` object to the collection.
				188
				189	@param host_history The `HostJobHistory` object to be
				190	remembered.
				191
				192	"""
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	193	pool = host_history.host_pool
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	194	self._pools[pool].record_host(host_history)
				195
				196
				197	def _count_pool(self, get_pool_count, pool=None):
				198	"""Internal helper to count hosts in a given pool.
				199
				200	The `get_pool_count` parameter is a function to calculate
				201	the exact count of interest for the pool.
				202
				203	@param get_pool_count Function to return a count from a
				204	_PoolCount object.
				205	@param pool The pool to be counted. If `None`,
				206	return the total across all pools.
				207
				208	"""
				209	if pool is None:
				210	return sum([get_pool_count(counts)
				211	for counts in self._pools.values()])
				212	else:
				213	return get_pool_count(self._pools[pool])
				214
				215
				216	def get_working(self, pool=None):
				217	"""Return the number of working DUTs in a pool.
				218
				219	@param pool The pool to be counted. If `None`, return the
				220	total across all pools.
				221
				222	"""
				223	return self._count_pool(_PoolCounts.get_working, pool)
				224
				225
				226	def get_broken(self, pool=None):
				227	"""Return the number of broken DUTs in a pool.
				228
				229	@param pool The pool to be counted. If `None`, return the
				230	total across all pools.
				231
				232	"""
				233	return self._count_pool(_PoolCounts.get_broken, pool)
				234
				235
				236	def get_total(self, pool=None):
				237	"""Return the total number of DUTs in a pool.
				238
				239	@param pool The pool to be counted. If `None`, return the
				240	total across all pools.
				241
				242	"""
				243	return self._count_pool(_PoolCounts.get_total, pool)
				244
				245
				246	class _LabInventory(dict):
				247	"""Collection of `HostJobHistory` objects for the Lab's inventory.
				248
				249	The collection is indexed by board. Indexing returns the
				250	_BoardCounts object associated with the board.
				251
				252	The collection is also iterable. The iterator returns all the
				253	boards in the inventory, in unspecified order.
				254
				255	"""
				256
				257	@classmethod
				258	def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
				259	"""Return a Lab inventory with specified parameters.
				260
				261	By default, gathers inventory from `HostJobHistory` objects
				262	for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
				263	is supplied, the inventory will be restricted to only the
				264	given boards.
				265
				266	@param afe AFE object for constructing the
				267	`HostJobHistory` objects.
				268	@param start_time Start time for the `HostJobHistory`
				269	objects.
				270	@param end_time End time for the `HostJobHistory`
				271	objects.
				272	@param boardlist List of boards to include. If empty,
				273	include all available boards.
				274	@return A `_LabInventory` object for the specified boards.
				275
				276	"""
				277	label_list = [constants.Labels.POOL_PREFIX + l
				278	for l in _MANAGED_POOLS]
				279	afehosts = afe.get_hosts(labels__name__in=label_list)
				280	if boardlist:
				281	boardhosts = []
				282	for board in boardlist:
				283	board_label = constants.Labels.BOARD_PREFIX + board
				284	host_list = [h for h in afehosts
				285	if board_label in h.labels]
				286	boardhosts.extend(host_list)
				287	afehosts = boardhosts
				288	create = lambda host: (
				289	status_history.HostJobHistory(afe, host,
				290	start_time, end_time))
				291	return cls([create(host) for host in afehosts])
				292
				293
				294	def __init__(self, histories):
J. Richard Barnette	6948ed3	2015-05-06 08:57:10 -0700	[diff] [blame^]	295	# N.B. The query that finds our hosts is restricted to those
				296	# with a valid pool: label, but doesn't check for a valid
				297	# board: label. In some (insufficiently) rare cases, the
				298	# AFE hosts table has been known to (incorrectly) have DUTs
				299	# with a pool: but no board: label. We explicitly exclude
				300	# those here.
				301	histories = [h for h in histories
				302	if h.host_board is not None]
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	303	boards = set([h.host_board for h in histories])
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	304	initval = { board: _BoardCounts() for board in boards }
				305	super(_LabInventory, self).__init__(initval)
				306	self._dut_count = len(histories)
				307	for h in histories:
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	308	self[h.host_board].record_host(h)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	309
				310
				311	def get_num_duts(self):
				312	"""Return the total number of DUTs in the inventory."""
				313	return self._dut_count
				314
				315
				316	def get_num_boards(self):
				317	"""Return the total number of boards in the inventory."""
				318	return len(self)
				319
				320
				321	def _generate_board_inventory_message(inventory):
				322	"""Generate the "board inventory" e-mail message.
				323
				324	The board inventory is a list by board summarizing the number
				325	of working and broken DUTs, and the total shortfall or surplus
				326	of working devices relative to the minimum critical pool
				327	requirement.
				328
				329	The report omits boards with no DUTs in the spare pool or with
				330	no DUTs in a critical pool.
				331
				332	N.B. For sample output text formattted as users can expect to
				333	see it in e-mail and log files, refer to the unit tests.
				334
				335	@param inventory _LabInventory object with the inventory to
				336	be reported on.
				337	@return String with the inventory message to be sent.
				338
				339	"""
				340	logging.debug('Creating board inventory')
				341	message = []
				342	message.append(
				343	'%-20s %5s %5s %5s %5s %5s' % (
				344	'Board', 'Avail', 'Bad', 'Good', 'Spare', 'Total'))
				345	data_list = []
				346	for board, counts in inventory.items():
				347	logging.debug('Counting inventory for %s', board)
				348	spares = counts.get_total(_SPARE_POOL)
				349	total = counts.get_total()
				350	if spares == 0 or spares == total:
				351	continue
				352	working = counts.get_working()
				353	broken = counts.get_broken()
				354	buffer = spares - broken
				355	data_list.append((board, buffer, broken, working, spares, total))
				356	data_list = sorted(sorted(data_list, key=lambda t: -t[2]),
				357	key=lambda t: t[1])
				358	message.extend(
				359	['%-20s %5d %5d %5d %5d %5d' % t for t in data_list])
				360	return '\n'.join(message)
				361
				362
# _POOL_INVENTORY_HEADER:
#     Preamble placed at the start of the "pool inventory" e-mail
#     body, ahead of the per-pool status tables.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputy: If there are shortages below,
please take action to resolve them. If it's safe, you should
balance shortages by running `balance_pool` or `freon_swap` as
necessary. Detailed instructions can be found here:
http://go/cros-manage-duts
'''
				370
				371
def _generate_pool_inventory_message(inventory):
    """Build the text of the "pool inventory" e-mail.

    The message starts with `_POOL_INVENTORY_HEADER`, followed by
    one section for each pool in `_CRITICAL_POOLS`. A section is a
    table listing, for each board with at least one broken DUT in
    that pool, the broken, working and total counts, ordered by
    descending broken count. A pool with no broken DUTs gets a
    one-line "full strength" note in place of the table rows.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        lines.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s' % (
                'Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            num_broken = counts.get_broken(pool)
            if num_broken != 0:
                rows.append((board, num_broken,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        lines.extend('%-20s %5d %5d %5d' % row for row in rows)
    return '\n'.join(lines)
				414
				415
				416	def _send_email(arguments, tag, subject, recipients, body):
				417	"""Send an inventory e-mail message.
				418
				419	The message is logged in the selected log directory using `tag`
				420	for the file name.
				421
				422	If the --print option was requested, the message is neither
				423	logged nor sent, but merely printed on stdout.
				424
				425	@param arguments Parsed command-line options.
				426	@param tag Tag identifying the inventory for logging
				427	purposes.
				428	@param subject E-mail Subject: header line.
				429	@param recipients E-mail addresses for the To: header line.
				430	@param body E-mail message body.
				431
				432	"""
				433	logging.debug('Generating email: "%s"', subject)
				434	all_recipients = ', '.join(recipients)
				435	report_body = '\n'.join([
				436	'To: %s' % all_recipients,
				437	'Subject: %s' % subject,
				438	'', body, ''])
				439	if arguments.print_:
				440	print report_body
				441	else:
				442	filename = os.path.join(arguments.logdir, tag)
				443	try:
				444	report_file = open(filename, 'w')
				445	report_file.write(report_body)
				446	report_file.close()
				447	except EnvironmentError as e:
				448	logging.error('Failed to write %s: %s', filename, e)
				449	try:
				450	gmail_lib.send_email(all_recipients, subject, body)
				451	except Exception as e:
				452	logging.error('Failed to send e-mail to %s: %s',
				453	all_recipients, e)
				454
				455
				456	def _separate_email_addresses(address_list):
				457	"""Parse a list of comma-separated lists of e-mail addresses.
				458
				459	@param address_list A list of strings containing comma
				460	separate e-mail addresses.
				461	@return A list of the individual e-mail addresses.
				462
				463	"""
				464	newlist = []
				465	for arg in address_list:
				466	newlist.extend([email.strip() for email in arg.split(',')])
				467	return newlist
				468
				469
				470	def _verify_arguments(arguments):
				471	"""Validate command-line arguments.
				472
				473	Join comma separated e-mail addresses for `--board-notify` and
				474	`--pool-notify` in separate option arguments into a single list.
				475
				476	@param arguments Command-line arguments as returned by
				477	`ArgumentParser`
				478
				479	"""
				480	arguments.board_notify = _separate_email_addresses(
				481	arguments.board_notify)
				482	arguments.pool_notify = _separate_email_addresses(
				483	arguments.pool_notify)
				484
				485
				486	def _get_logdir(script):
				487	"""Get the default directory for the `--logdir` option.
				488
				489	The default log directory is based on the parent directory
				490	containing this script.
				491
				492	@param script Path to this script file.
				493	@return A path to a directory.
				494
				495	"""
				496	basedir = os.path.dirname(os.path.abspath(script))
				497	basedir = os.path.dirname(basedir)
				498	return os.path.join(basedir, _LOGDIR)
				499
				500
				501	def _parse_command(argv):
				502	"""Parse the command line arguments.
				503
				504	Create an argument parser for this command's syntax, parse the
				505	command line, and return the result of the ArgumentParser
				506	parse_args() method.
				507
				508	@param argv Standard command line argument vector; argv[0] is
				509	assumed to be the command name.
				510	@return Result returned by ArgumentParser.parse_args().
				511
				512	"""
				513	parser = argparse.ArgumentParser(
				514	prog=argv[0],
				515	description='Gather and report lab inventory statistics')
				516	parser.add_argument('-d', '--duration', type=int,
				517	default=_DEFAULT_DURATION, metavar='HOURS',
				518	help='number of hours back to search for status'
				519	' (default: %d)' % _DEFAULT_DURATION)
				520	parser.add_argument('--board-notify', action='append',
				521	default=[], metavar='ADDRESS',
				522	help='Generate board inventory message, '
				523	'and send it to the given e-mail address(es)')
				524	parser.add_argument('--pool-notify', action='append',
				525	default=[], metavar='ADDRESS',
				526	help='Generate pool inventory message, '
				527	'and send it to the given address(es)')
				528	parser.add_argument('--print', dest='print_', action='store_true',
				529	help='Print e-mail messages on stdout '
				530	'without sending them.')
				531	parser.add_argument('--logdir', default=_get_logdir(argv[0]),
				532	help='Directory where logs will be written.')
				533	parser.add_argument('boardnames', nargs='*',
				534	metavar='BOARD',
				535	help='names of boards to report on '
				536	'(default: all boards)')
				537	arguments = parser.parse_args(argv[1:])
				538	_verify_arguments(arguments)
				539	return arguments
				540
				541
				542	def _configure_logging(arguments):
				543	"""Configure the `logging` module for our needs.
				544
				545	How we log depends on whether the `--print` option was
				546	provided on the command line. Without the option, we log all
				547	messages at DEBUG level or above, and write them to a file in
				548	the directory specified by the `--logdir` option. With the
				549	option, we write log messages to stdout; messages below INFO
				550	level are discarded.
				551
				552	The log file is configured to rotate once a week on Friday
				553	evening, preserving ~3 months worth of history.
				554
				555	@param arguments Command-line arguments as returned by
				556	`ArgumentParser`
				557
				558	"""
				559	if arguments.print_:
				560	logging.getLogger().setLevel(logging.INFO)
				561	handler = logging.StreamHandler(sys.stdout)
				562	handler.setFormatter(logging.Formatter())
				563	else:
				564	logging.getLogger().setLevel(logging.DEBUG)
				565	logfile = os.path.join(arguments.logdir, _LOGFILE)
				566	handler = logging.handlers.TimedRotatingFileHandler(
				567	logfile, when='W4', backupCount=13)
				568	formatter = logging.Formatter(_LOG_FORMAT,
				569	time_utils.TIME_FMT)
				570	handler.setFormatter(formatter)
				571	logging.getLogger().addHandler(handler)
				572
				573
				574	def _populate_board_counts(inventory):
				575	"""Gather board counts while providing interactive feedback.
				576
				577	Gathering the status of all individual DUTs in the lab can take
				578	considerable time (~30 minutes at the time of this writing).
				579
				580	Normally, we pay that cost by querying as we go. However, with
				581	the `--print` option, a human being may be watching the
				582	progress. So, we force the first (expensive) queries to happen
				583	up front, and provide a small ASCII progress bar to give an
				584	indicator of how many boards have been processed.
				585
				586	@param inventory _LabInventory object with the inventory to
				587	be gathered.
				588
				589	"""
				590	n = 0
				591	for counts in inventory.values():
				592	n += 1
				593	if n % 10 == 5:
				594	c = '+'
				595	elif n % 10 == 0:
				596	c = '%d' % ((n / 10) % 10)
				597	else:
				598	c = '.'
				599	sys.stdout.write(c)
				600	sys.stdout.flush()
				601	# This next call is where all the time goes - it forces all
				602	# of a board's HostJobHistory objects to query the database
				603	# and cache their results.
				604	counts.get_working()
				605	sys.stdout.write('\n')
				606
				607
				608	def main(argv):
				609	"""Standard main routine.
				610	@param argv Command line arguments including `sys.argv[0]`.
				611	"""
				612	arguments = _parse_command(argv)
				613	_configure_logging(arguments)
				614	try:
				615	end_time = int(time.time())
				616	start_time = end_time - arguments.duration * 60 * 60
				617	timestamp = time.strftime('%Y-%m-%d.%H',
				618	time.localtime(end_time))
				619	logging.debug('Starting lab inventory for %s', timestamp)
				620	if arguments.board_notify:
				621	logging.debug('Will include board inventory')
				622	if arguments.pool_notify:
				623	logging.debug('Will include pool inventory')
				624
				625	afe = frontend.AFE(server=None)
				626	inventory = _LabInventory.create_inventory(
				627	afe, start_time, end_time, arguments.boardnames)
				628	logging.info('Found %d hosts across %d boards',
				629	inventory.get_num_duts(),
				630	inventory.get_num_boards())
				631
				632	if arguments.print_:
				633	_populate_board_counts(inventory)
				634
				635	if arguments.print_ or arguments.board_notify:
				636	_send_email(arguments,
				637	'boards-%s.txt' % timestamp,
				638	'DUT board inventory %s' % timestamp,
				639	arguments.board_notify,
				640	_generate_board_inventory_message(inventory))
				641
				642	if arguments.print_ or arguments.pool_notify:
				643	_send_email(arguments,
				644	'pools-%s.txt' % timestamp,
				645	'DUT pool inventory %s' % timestamp,
				646	arguments.pool_notify,
				647	_generate_pool_inventory_message(inventory))
				648	except KeyboardInterrupt:
				649	pass
				650	except EnvironmentError as e:
				651	logging.exception('Unexpected OS error: %s', e)
				652	except Exception as e:
				653	logging.exception('Unexpected exception: %s', e)
				654
				655
# Run the inventory only when this file is executed as a script.
# main() expects the full argument vector, including sys.argv[0].
if __name__ == '__main__':
    main(sys.argv)