Blame - site_utils/lab_inventory.py - platform/external/autotest

blob: 182a62fda8e916898deac2932749a6c060ecbf37 [file] [log] [blame]

J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1	#!/usr/bin/env python
				2	# Copyright 2015 The Chromium OS Authors. All rights reserved.
				3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Create e-mail reports of the Lab's DUT inventory.
				7
				8	Gathers a list of all DUTs of interest in the Lab, segregated by
				9	board and pool, and determines whether each DUT is working or
				10	broken. Then, send one or more e-mail reports summarizing the
				11	status to e-mail addresses provided on the command line.
				12
				13	usage: lab_inventory.py [ options ] [ board ... ]
				14
				15	Options:
				16	--duration / -d <hours>
				17	How far back in time to search job history to determine DUT
				18	status.
				19
				20	--board-notify <address>[,<address>]
				21	Send the "board status" e-mail to all the specified e-mail
				22	addresses.
				23
				24	--pool-notify <address>[,<address>]
				25	Send the "pool status" e-mail to all the specified e-mail
				26	addresses.
				27
				28	--logdir <directory>
				29	Log progress and actions in a file under this directory. Text
				30	of any e-mail sent will also be logged in a timestamped file in
				31	this directory.
				32
				33	--print
				34	Suppress all logging and sending e-mail. Instead, write the
				35	output that would be generated onto stdout.
				36
				37	<board> arguments:
				38	With no arguments, gathers the status for all boards in the lab.
				39	With one or more named boards on the command line, restricts
				40	reporting to just those boards.
				41
				42	"""
				43
				44
				45	import argparse
				46	import logging
				47	import logging.handlers
				48	import os
				49	import sys
				50	import time
				51
				52	import common
				53	from autotest_lib.client.common_lib import time_utils
				54	from autotest_lib.server import frontend
				55	from autotest_lib.site_utils import gmail_lib
				56	from autotest_lib.site_utils import status_history
				57	from autotest_lib.site_utils.suite_scheduler import constants
				58
				59
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing.  That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The set of all the general purpose pools
#     monitored by this script; the critical pools followed by the
#     spare pool.

_CRITICAL_POOLS = ['bvt', 'cq']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
				83
				84
# _DEFAULT_DURATION:
#     Default value, in hours, used for the --duration command line
#     option.  Specifies how far back in time to search in order to
#     determine DUT status.

_DEFAULT_DURATION = 24
				91
				92
				93	# _LOGDIR:
				94	# Relative path used in the calculation of the default setting
				95	# for the --logdir option. The full path path is relative to
				96	# the root of the autotest directory, as determined from
				97	# sys.argv[0].
				98	# _LOGFILE:
				99	# Basename of a file to which general log information will be
				100	# written.
				101	# _LOG_FORMAT:
				102	# Format string for log messages.
				103
				104	_LOGDIR = os.path.join('logs', 'dut-data')
				105	_LOGFILE = 'lab-inventory.log'
				106	_LOG_FORMAT = '%(asctime)s \| %(levelname)-10s \| %(message)s'
				107
				108
				109	class _PoolCounts(object):
				110	"""Maintains a set of `HostJobHistory` objects for a pool.
				111
				112	The collected history objects are nominally all part of a single
				113	scheduling pool of DUTs. The collection maintains a count of
				114	working DUTs, a count of broken DUTs, and a total count.
				115
				116	Performance note: The methods `get_working()` and
				117	`get_broken()` (but not `get_total()`) are potentially
				118	expensive. The first time they're called, they must make a
				119	potentially expensive set of database queries. The results of
				120	the queries are cached in the individual `HostJobHistory`
				121	objects, so only the first call actually pays the cost.
				122
				123	This class is deliberately constructed to delay that cost until
				124	the accessor methods are called (rather than to query in
				125	`record_host()`) so that it's possible to construct a complete
				126	`_LabInventory` without making the expensive queries at creation
				127	time. `_populate_board_counts()`, below, relies on this
				128	behavior.
				129
				130	"""
				131
				132	def __init__(self):
				133	self._histories = []
				134
				135
				136	def record_host(self, host_history):
				137	"""Add one `HostJobHistory` object to the collection.
				138
				139	@param host_history The `HostJobHistory` object to be
				140	remembered.
				141
				142	"""
				143	self._histories.append(host_history)
				144
				145
				146	def get_working(self):
				147	"""Return the number of working DUTs in the collection."""
				148	return len([h for h in self._histories
				149	if h.last_diagnosis()[0] == status_history.WORKING])
				150
				151
				152	def get_broken(self):
				153	"""Return the number of broken DUTs in the collection."""
				154	return len([h for h in self._histories
				155	if h.last_diagnosis()[0] != status_history.WORKING])
				156
				157
				158	def get_total(self):
				159	"""Return the total number of DUTs in the collection."""
				160	return len(self._histories)
				161
				162
				163	class _BoardCounts(object):
				164	"""Maintains a set of `HostJobHistory` objects for a board.
				165
				166	The collected history objects are nominally all of the same
				167	board. The collection maintains a count of working DUTs, a
				168	count of broken DUTs, and a total count. The counts can be
				169	obtained either for a single pool, or as a total across all
				170	pools.
				171
				172	DUTs in the collection must be assigned to one of the pools
				173	in `_MANAGED_POOLS`.
				174
				175	The `get_working()` and `get_broken()` methods rely on the
				176	methods of the same name in _PoolCounts, so the performance
				177	note in _PoolCounts applies here as well.
				178
				179	"""
				180
				181	def __init__(self):
				182	self._pools = {
				183	pool: _PoolCounts() for pool in _MANAGED_POOLS
				184	}
				185
				186	def record_host(self, host_history):
				187	"""Add one `HostJobHistory` object to the collection.
				188
				189	@param host_history The `HostJobHistory` object to be
				190	remembered.
				191
				192	"""
				193	pool = host_history.get_host_pool()
				194	self._pools[pool].record_host(host_history)
				195
				196
				197	def _count_pool(self, get_pool_count, pool=None):
				198	"""Internal helper to count hosts in a given pool.
				199
				200	The `get_pool_count` parameter is a function to calculate
				201	the exact count of interest for the pool.
				202
				203	@param get_pool_count Function to return a count from a
				204	_PoolCount object.
				205	@param pool The pool to be counted. If `None`,
				206	return the total across all pools.
				207
				208	"""
				209	if pool is None:
				210	return sum([get_pool_count(counts)
				211	for counts in self._pools.values()])
				212	else:
				213	return get_pool_count(self._pools[pool])
				214
				215
				216	def get_working(self, pool=None):
				217	"""Return the number of working DUTs in a pool.
				218
				219	@param pool The pool to be counted. If `None`, return the
				220	total across all pools.
				221
				222	"""
				223	return self._count_pool(_PoolCounts.get_working, pool)
				224
				225
				226	def get_broken(self, pool=None):
				227	"""Return the number of broken DUTs in a pool.
				228
				229	@param pool The pool to be counted. If `None`, return the
				230	total across all pools.
				231
				232	"""
				233	return self._count_pool(_PoolCounts.get_broken, pool)
				234
				235
				236	def get_total(self, pool=None):
				237	"""Return the total number of DUTs in a pool.
				238
				239	@param pool The pool to be counted. If `None`, return the
				240	total across all pools.
				241
				242	"""
				243	return self._count_pool(_PoolCounts.get_total, pool)
				244
				245
				246	class _LabInventory(dict):
				247	"""Collection of `HostJobHistory` objects for the Lab's inventory.
				248
				249	The collection is indexed by board. Indexing returns the
				250	_BoardCounts object associated with the board.
				251
				252	The collection is also iterable. The iterator returns all the
				253	boards in the inventory, in unspecified order.
				254
				255	"""
				256
				257	@classmethod
				258	def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
				259	"""Return a Lab inventory with specified parameters.
				260
				261	By default, gathers inventory from `HostJobHistory` objects
				262	for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
				263	is supplied, the inventory will be restricted to only the
				264	given boards.
				265
				266	@param afe AFE object for constructing the
				267	`HostJobHistory` objects.
				268	@param start_time Start time for the `HostJobHistory`
				269	objects.
				270	@param end_time End time for the `HostJobHistory`
				271	objects.
				272	@param boardlist List of boards to include. If empty,
				273	include all available boards.
				274	@return A `_LabInventory` object for the specified boards.
				275
				276	"""
				277	label_list = [constants.Labels.POOL_PREFIX + l
				278	for l in _MANAGED_POOLS]
				279	afehosts = afe.get_hosts(labels__name__in=label_list)
				280	if boardlist:
				281	boardhosts = []
				282	for board in boardlist:
				283	board_label = constants.Labels.BOARD_PREFIX + board
				284	host_list = [h for h in afehosts
				285	if board_label in h.labels]
				286	boardhosts.extend(host_list)
				287	afehosts = boardhosts
				288	create = lambda host: (
				289	status_history.HostJobHistory(afe, host,
				290	start_time, end_time))
				291	return cls([create(host) for host in afehosts])
				292
				293
				294	def __init__(self, histories):
				295	boards = set([h.get_host_board() for h in histories])
				296	initval = { board: _BoardCounts() for board in boards }
				297	super(_LabInventory, self).__init__(initval)
				298	self._dut_count = len(histories)
				299	for h in histories:
				300	self[h.get_host_board()].record_host(h)
				301
				302
				303	def get_num_duts(self):
				304	"""Return the total number of DUTs in the inventory."""
				305	return self._dut_count
				306
				307
				308	def get_num_boards(self):
				309	"""Return the total number of boards in the inventory."""
				310	return len(self)
				311
				312
				313	def _generate_board_inventory_message(inventory):
				314	"""Generate the "board inventory" e-mail message.
				315
				316	The board inventory is a list by board summarizing the number
				317	of working and broken DUTs, and the total shortfall or surplus
				318	of working devices relative to the minimum critical pool
				319	requirement.
				320
				321	The report omits boards with no DUTs in the spare pool or with
				322	no DUTs in a critical pool.
				323
				324	N.B. For sample output text formattted as users can expect to
				325	see it in e-mail and log files, refer to the unit tests.
				326
				327	@param inventory _LabInventory object with the inventory to
				328	be reported on.
				329	@return String with the inventory message to be sent.
				330
				331	"""
				332	logging.debug('Creating board inventory')
				333	message = []
				334	message.append(
				335	'%-20s %5s %5s %5s %5s %5s' % (
				336	'Board', 'Avail', 'Bad', 'Good', 'Spare', 'Total'))
				337	data_list = []
				338	for board, counts in inventory.items():
				339	logging.debug('Counting inventory for %s', board)
				340	spares = counts.get_total(_SPARE_POOL)
				341	total = counts.get_total()
				342	if spares == 0 or spares == total:
				343	continue
				344	working = counts.get_working()
				345	broken = counts.get_broken()
				346	buffer = spares - broken
				347	data_list.append((board, buffer, broken, working, spares, total))
				348	data_list = sorted(sorted(data_list, key=lambda t: -t[2]),
				349	key=lambda t: t[1])
				350	message.extend(
				351	['%-20s %5d %5d %5d %5d %5d' % t for t in data_list])
				352	return '\n'.join(message)
				353
				354
# _POOL_INVENTORY_HEADER:
#     Fixed text placed at the start of the "pool inventory" e-mail
#     message, ahead of the per-pool status listings.

_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputy: If there are shortages below,
please take action to resolve them. If it's safe, you should
balance shortages by running `balance_pool` or `freon_swap` as
necessary. Detailed instructions can be found here:
http://go/cros-manage-duts
'''
				362
				363
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory has one section per critical pool.  Each
    section lists, by board, the number of working and broken DUTs
    in that pool.  Only boards with at least one broken DUT in the
    pool are listed, ordered from most to fewest broken DUTs.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        lines.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s' % (
                'Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            num_broken = counts.get_broken(pool)
            if num_broken != 0:
                rows.append((board, num_broken,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: row[1], reverse=True)
        for row in rows:
            lines.append('%-20s %5d %5d %5d' % row)
    return '\n'.join(lines)
				406
				407
				408	def _send_email(arguments, tag, subject, recipients, body):
				409	"""Send an inventory e-mail message.
				410
				411	The message is logged in the selected log directory using `tag`
				412	for the file name.
				413
				414	If the --print option was requested, the message is neither
				415	logged nor sent, but merely printed on stdout.
				416
				417	@param arguments Parsed command-line options.
				418	@param tag Tag identifying the inventory for logging
				419	purposes.
				420	@param subject E-mail Subject: header line.
				421	@param recipients E-mail addresses for the To: header line.
				422	@param body E-mail message body.
				423
				424	"""
				425	logging.debug('Generating email: "%s"', subject)
				426	all_recipients = ', '.join(recipients)
				427	report_body = '\n'.join([
				428	'To: %s' % all_recipients,
				429	'Subject: %s' % subject,
				430	'', body, ''])
				431	if arguments.print_:
				432	print report_body
				433	else:
				434	filename = os.path.join(arguments.logdir, tag)
				435	try:
				436	report_file = open(filename, 'w')
				437	report_file.write(report_body)
				438	report_file.close()
				439	except EnvironmentError as e:
				440	logging.error('Failed to write %s: %s', filename, e)
				441	try:
				442	gmail_lib.send_email(all_recipients, subject, body)
				443	except Exception as e:
				444	logging.error('Failed to send e-mail to %s: %s',
				445	all_recipients, e)
				446
				447
				448	def _separate_email_addresses(address_list):
				449	"""Parse a list of comma-separated lists of e-mail addresses.
				450
				451	@param address_list A list of strings containing comma
				452	separate e-mail addresses.
				453	@return A list of the individual e-mail addresses.
				454
				455	"""
				456	newlist = []
				457	for arg in address_list:
				458	newlist.extend([email.strip() for email in arg.split(',')])
				459	return newlist
				460
				461
				462	def _verify_arguments(arguments):
				463	"""Validate command-line arguments.
				464
				465	Join comma separated e-mail addresses for `--board-notify` and
				466	`--pool-notify` in separate option arguments into a single list.
				467
				468	@param arguments Command-line arguments as returned by
				469	`ArgumentParser`
				470
				471	"""
				472	arguments.board_notify = _separate_email_addresses(
				473	arguments.board_notify)
				474	arguments.pool_notify = _separate_email_addresses(
				475	arguments.pool_notify)
				476
				477
				478	def _get_logdir(script):
				479	"""Get the default directory for the `--logdir` option.
				480
				481	The default log directory is based on the parent directory
				482	containing this script.
				483
				484	@param script Path to this script file.
				485	@return A path to a directory.
				486
				487	"""
				488	basedir = os.path.dirname(os.path.abspath(script))
				489	basedir = os.path.dirname(basedir)
				490	return os.path.join(basedir, _LOGDIR)
				491
				492
				493	def _parse_command(argv):
				494	"""Parse the command line arguments.
				495
				496	Create an argument parser for this command's syntax, parse the
				497	command line, and return the result of the ArgumentParser
				498	parse_args() method.
				499
				500	@param argv Standard command line argument vector; argv[0] is
				501	assumed to be the command name.
				502	@return Result returned by ArgumentParser.parse_args().
				503
				504	"""
				505	parser = argparse.ArgumentParser(
				506	prog=argv[0],
				507	description='Gather and report lab inventory statistics')
				508	parser.add_argument('-d', '--duration', type=int,
				509	default=_DEFAULT_DURATION, metavar='HOURS',
				510	help='number of hours back to search for status'
				511	' (default: %d)' % _DEFAULT_DURATION)
				512	parser.add_argument('--board-notify', action='append',
				513	default=[], metavar='ADDRESS',
				514	help='Generate board inventory message, '
				515	'and send it to the given e-mail address(es)')
				516	parser.add_argument('--pool-notify', action='append',
				517	default=[], metavar='ADDRESS',
				518	help='Generate pool inventory message, '
				519	'and send it to the given address(es)')
				520	parser.add_argument('--print', dest='print_', action='store_true',
				521	help='Print e-mail messages on stdout '
				522	'without sending them.')
				523	parser.add_argument('--logdir', default=_get_logdir(argv[0]),
				524	help='Directory where logs will be written.')
				525	parser.add_argument('boardnames', nargs='*',
				526	metavar='BOARD',
				527	help='names of boards to report on '
				528	'(default: all boards)')
				529	arguments = parser.parse_args(argv[1:])
				530	_verify_arguments(arguments)
				531	return arguments
				532
				533
				534	def _configure_logging(arguments):
				535	"""Configure the `logging` module for our needs.
				536
				537	How we log depends on whether the `--print` option was
				538	provided on the command line. Without the option, we log all
				539	messages at DEBUG level or above, and write them to a file in
				540	the directory specified by the `--logdir` option. With the
				541	option, we write log messages to stdout; messages below INFO
				542	level are discarded.
				543
				544	The log file is configured to rotate once a week on Friday
				545	evening, preserving ~3 months worth of history.
				546
				547	@param arguments Command-line arguments as returned by
				548	`ArgumentParser`
				549
				550	"""
				551	if arguments.print_:
				552	logging.getLogger().setLevel(logging.INFO)
				553	handler = logging.StreamHandler(sys.stdout)
				554	handler.setFormatter(logging.Formatter())
				555	else:
				556	logging.getLogger().setLevel(logging.DEBUG)
				557	logfile = os.path.join(arguments.logdir, _LOGFILE)
				558	handler = logging.handlers.TimedRotatingFileHandler(
				559	logfile, when='W4', backupCount=13)
				560	formatter = logging.Formatter(_LOG_FORMAT,
				561	time_utils.TIME_FMT)
				562	handler.setFormatter(formatter)
				563	logging.getLogger().addHandler(handler)
				564
				565
				566	def _populate_board_counts(inventory):
				567	"""Gather board counts while providing interactive feedback.
				568
				569	Gathering the status of all individual DUTs in the lab can take
				570	considerable time (~30 minutes at the time of this writing).
				571
				572	Normally, we pay that cost by querying as we go. However, with
				573	the `--print` option, a human being may be watching the
				574	progress. So, we force the first (expensive) queries to happen
				575	up front, and provide a small ASCII progress bar to give an
				576	indicator of how many boards have been processed.
				577
				578	@param inventory _LabInventory object with the inventory to
				579	be gathered.
				580
				581	"""
				582	n = 0
				583	for counts in inventory.values():
				584	n += 1
				585	if n % 10 == 5:
				586	c = '+'
				587	elif n % 10 == 0:
				588	c = '%d' % ((n / 10) % 10)
				589	else:
				590	c = '.'
				591	sys.stdout.write(c)
				592	sys.stdout.flush()
				593	# This next call is where all the time goes - it forces all
				594	# of a board's HostJobHistory objects to query the database
				595	# and cache their results.
				596	counts.get_working()
				597	sys.stdout.write('\n')
				598
				599
				600	def main(argv):
				601	"""Standard main routine.
				602	@param argv Command line arguments including `sys.argv[0]`.
				603	"""
				604	arguments = _parse_command(argv)
				605	_configure_logging(arguments)
				606	try:
				607	end_time = int(time.time())
				608	start_time = end_time - arguments.duration * 60 * 60
				609	timestamp = time.strftime('%Y-%m-%d.%H',
				610	time.localtime(end_time))
				611	logging.debug('Starting lab inventory for %s', timestamp)
				612	if arguments.board_notify:
				613	logging.debug('Will include board inventory')
				614	if arguments.pool_notify:
				615	logging.debug('Will include pool inventory')
				616
				617	afe = frontend.AFE(server=None)
				618	inventory = _LabInventory.create_inventory(
				619	afe, start_time, end_time, arguments.boardnames)
				620	logging.info('Found %d hosts across %d boards',
				621	inventory.get_num_duts(),
				622	inventory.get_num_boards())
				623
				624	if arguments.print_:
				625	_populate_board_counts(inventory)
				626
				627	if arguments.print_ or arguments.board_notify:
				628	_send_email(arguments,
				629	'boards-%s.txt' % timestamp,
				630	'DUT board inventory %s' % timestamp,
				631	arguments.board_notify,
				632	_generate_board_inventory_message(inventory))
				633
				634	if arguments.print_ or arguments.pool_notify:
				635	_send_email(arguments,
				636	'pools-%s.txt' % timestamp,
				637	'DUT pool inventory %s' % timestamp,
				638	arguments.pool_notify,
				639	_generate_pool_inventory_message(inventory))
				640	except KeyboardInterrupt:
				641	pass
				642	except EnvironmentError as e:
				643	logging.exception('Unexpected OS error: %s', e)
				644	except Exception as e:
				645	logging.exception('Unexpected exception: %s', e)
				646
				647
# When run as a script, hand the full argument vector (including
# the command name in sys.argv[0]) to main().
if __name__ == '__main__':
    main(sys.argv)