Blame - site_utils/lab_inventory.py - platform/external/autotest

blob: 1fda391165b119c9698336c1e9974f1063b54151 [file] [log] [blame]

J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1	#!/usr/bin/env python
				2	# Copyright 2015 The Chromium OS Authors. All rights reserved.
				3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Create e-mail reports of the Lab's DUT inventory.
				7
				8	Gathers a list of all DUTs of interest in the Lab, segregated by
				9	board and pool, and determines whether each DUT is working or
				10	broken. Then, send one or more e-mail reports summarizing the
				11	status to e-mail addresses provided on the command line.
				12
				13	usage: lab_inventory.py [ options ] [ board ... ]
				14
				15	Options:
				16	--duration / -d <hours>
				17	How far back in time to search job history to determine DUT
				18	status.
				19
				20	--board-notify <address>[,<address>]
				21	Send the "board status" e-mail to all the specified e-mail
				22	addresses.
				23
				24	--pool-notify <address>[,<address>]
				25	Send the "pool status" e-mail to all the specified e-mail
				26	addresses.
				27
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	28	--recommend <number>
				29	When generating the "board status" e-mail, include a list of
				30	<number> specific DUTs to be recommended for repair.
				31
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	32	--logdir <directory>
				33	Log progress and actions in a file under this directory. Text
				34	of any e-mail sent will also be logged in a timestamped file in
				35	this directory.
				36
				37	--print
				38	Suppress all logging and sending e-mail. Instead, write the
				39	output that would be generated onto stdout.
				40
				41	<board> arguments:
				42	With no arguments, gathers the status for all boards in the lab.
				43	With one or more named boards on the command line, restricts
				44	reporting to just those boards.
				45
				46	"""
				47
				48
				49	import argparse
				50	import logging
				51	import logging.handlers
				52	import os
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	53	import re
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	54	import sys
				55	import time
				56
				57	import common
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	58	from autotest_lib.client.bin import utils
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	59	from autotest_lib.client.common_lib import time_utils
J. Richard Barnette	a7c514e	2015-09-15 11:13:23 -0700	[diff] [blame^]	60	from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	61	from autotest_lib.server.hosts import servo_host
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	62	from autotest_lib.site_utils import gmail_lib
				63	from autotest_lib.site_utils import status_history
				64	from autotest_lib.site_utils.suite_scheduler import constants
				65
				66
				67	# The pools in the Lab that are actually of interest.
				68	#
				69	# These are general purpose pools of DUTs that are considered
				70	# identical for purposes of testing. That is, a device in one of
				71	# these pools can be shifted to another pool at will for purposes
				72	# of supplying test demand.
				73	#
				74	# Devices in these pools are not allowed to have special-purpose
				75	# attachments, or to be part of in any kind of custom fixture.
				76	# Devices in these pools are also required to reside in areas
				77	# managed by the Platforms team (i.e. at the time of this writing,
				78	# only in "Atlantis" or "Destiny").
				79	#
				80	# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
				81	# to guarantee timely completion of tests from builders.
				82	# _SPARE_POOL - A low priority pool that is allowed to provide
				83	# spares to replace broken devices in the critical pools.
				84	# _MANAGED_POOLS - The set of all the general purpose pools
				85	# monitored by this script.
				86
				87	_CRITICAL_POOLS = ['bvt', 'cq']
				88	_SPARE_POOL = 'suites'
				89	_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
				90
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	91	# _DEFAULT_DURATION:
				92	# Default value used for the --duration command line option.
				93	# Specifies how far back in time to search in order to determine
				94	# DUT status.
				95
				96	_DEFAULT_DURATION = 24
				97
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	98	# _LOGDIR:
				99	# Relative path used in the calculation of the default setting
				100	# for the --logdir option. The full path path is relative to
				101	# the root of the autotest directory, as determined from
				102	# sys.argv[0].
				103	# _LOGFILE:
				104	# Basename of a file to which general log information will be
				105	# written.
				106	# _LOG_FORMAT:
				107	# Format string for log messages.
				108
				109	_LOGDIR = os.path.join('logs', 'dut-data')
				110	_LOGFILE = 'lab-inventory.log'
				111	_LOG_FORMAT = '%(asctime)s \| %(levelname)-10s \| %(message)s'
				112
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	113	# Pattern describing location-based host names in the Chrome OS test
				114	# labs. Each DUT hostname designates the DUT's location:
				115	# * A lab (room) that's physically separated from other labs
				116	# (i.e. there's a door).
				117	# * A row (or aisle) of DUTs within the lab.
				118	# * A vertical rack of shelves on the row.
				119	# * A specific host on one shelf of the rack.
				120
				121	_HOSTNAME_PATTERN = re.compile(
				122	r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
				123
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	124
				125	class _PoolCounts(object):
				126	"""Maintains a set of `HostJobHistory` objects for a pool.
				127
				128	The collected history objects are nominally all part of a single
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	129	scheduling pool of DUTs. The collection maintains a list of
				130	working DUTs, a list of broken DUTs, and a list of all DUTs.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	131
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	132	Performance note: Certain methods in this class are potentially
				133	expensive:
				134	* `get_working()`
				135	* `get_working_list()`
				136	* `get_broken()`
				137	* `get_broken_list()`
				138	The first time any one of these methods is called, it causes
				139	multiple RPC calls with a relatively expensive set of database
				140	queries. However, the results of the queries are cached in the
				141	individual `HostJobHistory` objects, so only the first call
				142	actually pays the full cost.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	143
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	144	Additionally, `get_working_list()` and `get_broken_list()` both
				145	cache their return values to avoid recalculating lists at every
				146	call; this caching is separate from the caching of RPC results
				147	described above.
				148
				149	This class is deliberately constructed to delay the RPC cost
				150	until the accessor methods are called (rather than to query in
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	151	`record_host()`) so that it's possible to construct a complete
				152	`_LabInventory` without making the expensive queries at creation
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	153	time. `_populate_board_counts()`, below, assumes this behavior.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	154
				155	"""
				156
				157	def __init__(self):
				158	self._histories = []
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	159	self._working_list = None
				160	self._broken_list = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	161
				162
				163	def record_host(self, host_history):
				164	"""Add one `HostJobHistory` object to the collection.
				165
				166	@param host_history The `HostJobHistory` object to be
				167	remembered.
				168
				169	"""
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	170	self._working_list = None
				171	self._broken_list = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	172	self._histories.append(host_history)
				173
				174
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	175	def get_working_list(self):
				176	"""Return a list of all working DUTs in the pool.
				177
				178	Filter `self._histories` for histories where the last
				179	diagnosis is `WORKING`.
				180
				181	Cache the result so that we only cacluate it once.
				182
				183	@return A list of HostJobHistory objects.
				184
				185	"""
				186	if self._working_list is None:
				187	self._working_list = [h for h in self._histories
				188	if h.last_diagnosis()[0] == status_history.WORKING]
				189	return self._working_list
				190
				191
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	192	def get_working(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	193	"""Return the number of working DUTs in the pool."""
				194	return len(self.get_working_list())
				195
				196
				197	def get_broken_list(self):
				198	"""Return a list of all broken DUTs in the pool.
				199
				200	Filter `self._histories` for histories where the last
				201	diagnosis is not `WORKING`.
				202
				203	Cache the result so that we only cacluate it once.
				204
				205	@return A list of HostJobHistory objects.
				206
				207	"""
				208	if self._broken_list is None:
				209	self._broken_list = [h for h in self._histories
				210	if h.last_diagnosis()[0] != status_history.WORKING]
				211	return self._broken_list
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	212
				213
				214	def get_broken(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	215	"""Return the number of broken DUTs in the pool."""
				216	return len(self.get_broken_list())
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	217
				218
				219	def get_total(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	220	"""Return the total number of DUTs in the pool."""
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	221	return len(self._histories)
				222
				223
				224	class _BoardCounts(object):
				225	"""Maintains a set of `HostJobHistory` objects for a board.
				226
				227	The collected history objects are nominally all of the same
				228	board. The collection maintains a count of working DUTs, a
				229	count of broken DUTs, and a total count. The counts can be
				230	obtained either for a single pool, or as a total across all
				231	pools.
				232
				233	DUTs in the collection must be assigned to one of the pools
				234	in `_MANAGED_POOLS`.
				235
				236	The `get_working()` and `get_broken()` methods rely on the
				237	methods of the same name in _PoolCounts, so the performance
				238	note in _PoolCounts applies here as well.
				239
				240	"""
				241
				242	def __init__(self):
				243	self._pools = {
				244	pool: _PoolCounts() for pool in _MANAGED_POOLS
				245	}
				246
				247	def record_host(self, host_history):
				248	"""Add one `HostJobHistory` object to the collection.
				249
				250	@param host_history The `HostJobHistory` object to be
				251	remembered.
				252
				253	"""
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	254	pool = host_history.host_pool
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	255	self._pools[pool].record_host(host_history)
				256
				257
				258	def _count_pool(self, get_pool_count, pool=None):
				259	"""Internal helper to count hosts in a given pool.
				260
				261	The `get_pool_count` parameter is a function to calculate
				262	the exact count of interest for the pool.
				263
				264	@param get_pool_count Function to return a count from a
				265	_PoolCount object.
				266	@param pool The pool to be counted. If `None`,
				267	return the total across all pools.
				268
				269	"""
				270	if pool is None:
				271	return sum([get_pool_count(counts)
				272	for counts in self._pools.values()])
				273	else:
				274	return get_pool_count(self._pools[pool])
				275
				276
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	277	def get_working_list(self):
				278	"""Return a list of all working DUTs for the board.
				279
				280	Go through all HostJobHistory objects in the board's pools,
				281	selecting the ones where the last diagnosis is `WORKING`.
				282
				283	@return A list of HostJobHistory objects.
				284
				285	"""
				286	l = []
				287	for p in self._pools.values():
				288	l.extend(p.get_working_list())
				289	return l
				290
				291
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	292	def get_working(self, pool=None):
				293	"""Return the number of working DUTs in a pool.
				294
				295	@param pool The pool to be counted. If `None`, return the
				296	total across all pools.
				297
				298	"""
				299	return self._count_pool(_PoolCounts.get_working, pool)
				300
				301
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	302	def get_broken_list(self):
				303	"""Return a list of all broken DUTs for the board.
				304
				305	Go through all HostJobHistory objects in the board's pools,
				306	selecting the ones where the last diagnosis is not
				307	`WORKING`.
				308
				309	@return A list of HostJobHistory objects.
				310
				311	"""
				312	l = []
				313	for p in self._pools.values():
				314	l.extend(p.get_broken_list())
				315	return l
				316
				317
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	318	def get_broken(self, pool=None):
				319	"""Return the number of broken DUTs in a pool.
				320
				321	@param pool The pool to be counted. If `None`, return the
				322	total across all pools.
				323
				324	"""
				325	return self._count_pool(_PoolCounts.get_broken, pool)
				326
				327
				328	def get_total(self, pool=None):
				329	"""Return the total number of DUTs in a pool.
				330
				331	@param pool The pool to be counted. If `None`, return the
				332	total across all pools.
				333
				334	"""
				335	return self._count_pool(_PoolCounts.get_total, pool)
				336
				337
				338	class _LabInventory(dict):
				339	"""Collection of `HostJobHistory` objects for the Lab's inventory.
				340
				341	The collection is indexed by board. Indexing returns the
				342	_BoardCounts object associated with the board.
				343
				344	The collection is also iterable. The iterator returns all the
				345	boards in the inventory, in unspecified order.
				346
				347	"""
				348
				349	@classmethod
				350	def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
				351	"""Return a Lab inventory with specified parameters.
				352
				353	By default, gathers inventory from `HostJobHistory` objects
				354	for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
				355	is supplied, the inventory will be restricted to only the
				356	given boards.
				357
				358	@param afe AFE object for constructing the
				359	`HostJobHistory` objects.
				360	@param start_time Start time for the `HostJobHistory`
				361	objects.
				362	@param end_time End time for the `HostJobHistory`
				363	objects.
				364	@param boardlist List of boards to include. If empty,
				365	include all available boards.
				366	@return A `_LabInventory` object for the specified boards.
				367
				368	"""
				369	label_list = [constants.Labels.POOL_PREFIX + l
				370	for l in _MANAGED_POOLS]
				371	afehosts = afe.get_hosts(labels__name__in=label_list)
				372	if boardlist:
				373	boardhosts = []
				374	for board in boardlist:
				375	board_label = constants.Labels.BOARD_PREFIX + board
				376	host_list = [h for h in afehosts
				377	if board_label in h.labels]
				378	boardhosts.extend(host_list)
				379	afehosts = boardhosts
				380	create = lambda host: (
				381	status_history.HostJobHistory(afe, host,
				382	start_time, end_time))
				383	return cls([create(host) for host in afehosts])
				384
				385
				386	def __init__(self, histories):
J. Richard Barnette	6948ed3	2015-05-06 08:57:10 -0700	[diff] [blame]	387	# N.B. The query that finds our hosts is restricted to those
				388	# with a valid pool: label, but doesn't check for a valid
				389	# board: label. In some (insufficiently) rare cases, the
				390	# AFE hosts table has been known to (incorrectly) have DUTs
				391	# with a pool: but no board: label. We explicitly exclude
				392	# those here.
				393	histories = [h for h in histories
				394	if h.host_board is not None]
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	395	boards = set([h.host_board for h in histories])
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	396	initval = { board: _BoardCounts() for board in boards }
				397	super(_LabInventory, self).__init__(initval)
				398	self._dut_count = len(histories)
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	399	self._board_counts = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	400	for h in histories:
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	401	self[h.host_board].record_host(h)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	402
				403
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	404	def get_working_list(self):
				405	"""Return a list of all working DUTs in the inventory.
				406
				407	Go through all HostJobHistory objects in the inventory,
				408	selecting the ones where the last diagnosis is `WORKING`.
				409
				410	@return A list of HostJobHistory objects.
				411
				412	"""
				413	l = []
				414	for counts in self.values():
				415	l.extend(counts.get_working_list())
				416	return l
				417
				418
				419	def get_broken_list(self):
				420	"""Return a list of all broken DUTs in the inventory.
				421
				422	Go through all HostJobHistory objects in the inventory,
				423	selecting the ones where the last diagnosis is not
				424	`WORKING`.
				425
				426	@return A list of HostJobHistory objects.
				427
				428	"""
				429	l = []
				430	for counts in self.values():
				431	l.extend(counts.get_broken_list())
				432	return l
				433
				434
				435	def get_board_counts(self):
				436	"""Calculate a summary of board counts.
				437
				438	The summary is a list of tuples. The tuple elements, in
				439	order, are:
				440	* board - The name of the board associated with the
				441	counts.
				442	* buffer - The buffer of working spares (the total number
				443	of spares, less the number of broken DUTs).
				444	* broken - The number of broken DUTs.
				445	* working - The number of working DUTs.
				446	* spares - The number of DUTs in the spares pool.
				447	* total - The the total number of DUTs.
				448
				449	Boards with no DUTs in the spares pool or no DUTs in a
				450	critical pool will be excluded from the listed counts.
				451
				452	The ordering of the boards is unspecified.
				453
				454	@param inventory The inventory to be summarized.
				455	@return A list of tuples with board data.
				456
				457	"""
				458	if self._board_counts is None:
				459	self._board_counts = []
				460	for board, counts in self.items():
				461	logging.debug('Counting inventory for %s', board)
				462	spares = counts.get_total(_SPARE_POOL)
				463	total = counts.get_total()
				464	if spares == 0 or spares == total:
				465	continue
				466	working = counts.get_working()
				467	broken = counts.get_broken()
				468	spare_buffer = spares - broken
				469	element = (board, spare_buffer, broken, working,
				470	spares, total)
				471	self._board_counts.append(element)
				472	return self._board_counts
				473
				474
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	475	def get_num_duts(self):
				476	"""Return the total number of DUTs in the inventory."""
				477	return self._dut_count
				478
				479
				480	def get_num_boards(self):
				481	"""Return the total number of boards in the inventory."""
				482	return len(self)
				483
				484
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.  The order of the labs in
    the returned list is unspecified.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` carry no
    location information, and are omitted from the result.

    Implementation note: locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct for numbers of any size.

    @param inventory_list  A list of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; the history objects
        # themselves are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list
				520
				521
				522	def _score_repair_set(buffer_counts, repair_list):
				523	"""Return a numeric score rating a set of DUTs to be repaired.
				524
				525	`buffer_counts` is a dictionary mapping board names to the
				526	size of the board's spares buffer.
				527
				528	`repair_list` is a list of DUTs to be repaired.
				529
				530	This function calculates the new set of buffer counts that would
				531	result from the proposed repairs, and scores the new set using
				532	two numbers:
				533	* Worst case buffer count for any board (higher is better).
				534	This is the more siginficant number for comparison.
				535	* Number of boards at the worst case (lower is better). This
				536	is the less significant number.
				537
				538	Implementation note: The score could fail to reflect the
				539	intended criteria if there are more than 1000 boards in the
				540	inventory.
				541
				542	@param spare_counts A dictionary mapping boards to buffer counts.
				543	@param repair_list A list of boards to be repaired.
				544	@return A numeric score.
				545
				546	"""
				547	# Go through `buffer_counts`, and create a list of new counts
				548	# that records the buffer count for each board after repair.
				549	# The new list of counts discards the board names, as they don't
				550	# contribute to the final score.
				551	_NBOARDS = 1000
				552	repair_inventory = _LabInventory(repair_list)
				553	new_counts = []
				554	for b, c in buffer_counts.items():
				555	if b in repair_inventory:
				556	newcount = repair_inventory[b].get_total()
				557	else:
				558	newcount = 0
				559	new_counts.append(c + newcount)
				560	# Go through the new list of counts. Find the worst available
				561	# spares count, and count how many times that worst case occurs.
				562	worst_count = new_counts[0]
				563	num_worst = 1
				564	for c in new_counts[1:]:
				565	if c == worst_count:
				566	num_worst += 1
				567	elif c < worst_count:
				568	worst_count = c
				569	num_worst = 1
				570	# Return the calculated score
				571	return _NBOARDS * worst_count - num_worst
				572
				573
				574	def _generate_repair_recommendation(inventory, num_recommend):
				575	"""Return a summary of selected DUTs needing repair.
				576
				577	Returns a message recommending a list of broken DUTs to be
				578	repaired. The list of DUTs is selected based on these
				579	criteria:
				580	* No more than `num_recommend` DUTs will be listed.
				581	* All DUTs must be in the same lab.
				582	* DUTs should be selected for some degree of physical
				583	proximity.
				584	* DUTs for boards with a low spares buffer are more important
				585	than DUTs with larger buffers.
				586
				587	The algorithm used will guarantee that at least one DUT from a
				588	board with the smallest spares buffer will be recommended. If
				589	the worst spares buffer number is shared by more than one board,
				590	the algorithm will tend to prefer repair sets that include more
				591	of those boards over sets that cover fewer boards.
				592
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	593	@param inventory Inventory for generating recommendations.
				594	@param num_recommend Number of DUTs to recommend for repair.
				595
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	596	"""
				597	logging.debug('Creating DUT repair recommendations')
				598	board_counts = inventory.get_board_counts()
				599	# t[0] - board name
				600	# t[1] - size of spares buffer
				601	# t[2] - number of broken devices
				602	board_buffer_counts = {t[0]: t[1] for t in board_counts
				603	if t[2] != 0}
				604	recommendation = None
				605	best_score = None
				606	# N.B. The logic of this loop may seem complicated, but
				607	# simplification is hard:
				608	# * Calculating an initial recommendation outside of
				609	# the loop likely would make things more complicated,
				610	# not less.
				611	# * It's necessary to calculate an initial lab slice once per
				612	# lab _before_ the while loop, in case the number of broken
				613	# DUTs in a lab is less than `num_recommend`.
				614	for lab_duts in _sort_by_location(inventory.get_broken_list()):
				615	start = 0
				616	end = num_recommend
				617	lab_slice = lab_duts[start : end]
				618	lab_score = _score_repair_set(board_buffer_counts,
				619	lab_slice)
				620	while end < len(lab_duts):
				621	start += 1
				622	end += 1
				623	new_slice = lab_duts[start : end]
				624	new_score = _score_repair_set(board_buffer_counts,
				625	new_slice)
				626	if new_score > lab_score:
				627	lab_slice = new_slice
				628	lab_score = new_score
				629	if recommendation is None or lab_score > best_score:
				630	recommendation = lab_slice
				631	best_score = lab_score
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	632	message = ['Repair recommendations:\n',
				633	'%-30s %-16s %s' % (
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	634	'Hostname', 'Board', 'Servo instructions')]
				635	for h in recommendation:
				636	servo_name = servo_host.make_servo_hostname(h.host.hostname)
				637	if utils.host_is_in_lab_zone(servo_name):
				638	servo_message = 'Repair servo first'
				639	else:
				640	servo_message = 'No servo present'
				641	line = '%-30s %-16s %s' % (
				642	h.host.hostname, h.host_board, servo_message)
				643	message.append(line)
				644	return '\n'.join(message)
				645
				646
def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory is a table with one row per board, giving
    the spares buffer and the broken, working, spare and total DUT
    counts reported by `inventory.get_board_counts()`.

    Rows are ordered by spares buffer, smallest first; boards with
    equal buffers are ordered by number of broken DUTs, largest
    first.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    heading = '%-22s %5s %5s %5s %5s %5s' % (
            'Board', 'Avail', 'Bad', 'Good', 'Spare', 'Total')
    lines = ['Full board inventory:\n', heading]
    # t[1] - size of spares buffer (ascending)
    # t[2] - number of broken devices (descending)
    rows = sorted(inventory.get_board_counts(),
                  key=lambda t: (t[1], -t[2]))
    for row in rows:
        lines.append('%-22s %5d %5d %5d %5d %5d' % row)
    return '\n'.join(lines)
				677
				678
# _POOL_INVENTORY_HEADER:
#     Preamble text used as the first element of the "pool inventory"
#     e-mail message built by `_generate_pool_inventory_message()`.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
http://go/cros-manage-duts
'''
				687
				688
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The message starts with `_POOL_INVENTORY_HEADER`, followed by
    one section per pool in `_CRITICAL_POOLS`.  Each section is a
    table of the broken, working and total DUT counts for every
    board that has at least one broken DUT in that pool, ordered
    with the largest number of broken DUTs first.  A pool with no
    such board gets a one line "full strength" note instead.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
                '%-20s %5s %5s %5s' % (
                        'Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken != 0:
                rows.append((board, broken,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda r: -r[1])
        lines.extend('%-20s %5d %5d %5d' % r for r in rows)
    return '\n'.join(lines)
				731
				732
				733	def _send_email(arguments, tag, subject, recipients, body):
				734	"""Send an inventory e-mail message.
				735
				736	The message is logged in the selected log directory using `tag`
				737	for the file name.
				738
				739	If the --print option was requested, the message is neither
				740	logged nor sent, but merely printed on stdout.
				741
				742	@param arguments Parsed command-line options.
				743	@param tag Tag identifying the inventory for logging
				744	purposes.
				745	@param subject E-mail Subject: header line.
				746	@param recipients E-mail addresses for the To: header line.
				747	@param body E-mail message body.
				748
				749	"""
				750	logging.debug('Generating email: "%s"', subject)
				751	all_recipients = ', '.join(recipients)
				752	report_body = '\n'.join([
				753	'To: %s' % all_recipients,
				754	'Subject: %s' % subject,
				755	'', body, ''])
				756	if arguments.print_:
				757	print report_body
				758	else:
				759	filename = os.path.join(arguments.logdir, tag)
				760	try:
				761	report_file = open(filename, 'w')
				762	report_file.write(report_body)
				763	report_file.close()
				764	except EnvironmentError as e:
				765	logging.error('Failed to write %s: %s', filename, e)
				766	try:
				767	gmail_lib.send_email(all_recipients, subject, body)
				768	except Exception as e:
				769	logging.error('Failed to send e-mail to %s: %s',
				770	all_recipients, e)
				771
				772
				773	def _separate_email_addresses(address_list):
				774	"""Parse a list of comma-separated lists of e-mail addresses.
				775
				776	@param address_list A list of strings containing comma
				777	separate e-mail addresses.
				778	@return A list of the individual e-mail addresses.
				779
				780	"""
				781	newlist = []
				782	for arg in address_list:
				783	newlist.extend([email.strip() for email in arg.split(',')])
				784	return newlist
				785
				786
				787	def _verify_arguments(arguments):
				788	"""Validate command-line arguments.
				789
				790	Join comma separated e-mail addresses for `--board-notify` and
				791	`--pool-notify` in separate option arguments into a single list.
				792
				793	@param arguments Command-line arguments as returned by
				794	`ArgumentParser`
				795
				796	"""
				797	arguments.board_notify = _separate_email_addresses(
				798	arguments.board_notify)
				799	arguments.pool_notify = _separate_email_addresses(
				800	arguments.pool_notify)
				801
				802
				803	def _get_logdir(script):
				804	"""Get the default directory for the `--logdir` option.
				805
				806	The default log directory is based on the parent directory
				807	containing this script.
				808
				809	@param script Path to this script file.
				810	@return A path to a directory.
				811
				812	"""
				813	basedir = os.path.dirname(os.path.abspath(script))
				814	basedir = os.path.dirname(basedir)
				815	return os.path.join(basedir, _LOGDIR)
				816
				817
				818	def _parse_command(argv):
				819	"""Parse the command line arguments.
				820
				821	Create an argument parser for this command's syntax, parse the
				822	command line, and return the result of the ArgumentParser
				823	parse_args() method.
				824
				825	@param argv Standard command line argument vector; argv[0] is
				826	assumed to be the command name.
				827	@return Result returned by ArgumentParser.parse_args().
				828
				829	"""
				830	parser = argparse.ArgumentParser(
				831	prog=argv[0],
				832	description='Gather and report lab inventory statistics')
				833	parser.add_argument('-d', '--duration', type=int,
				834	default=_DEFAULT_DURATION, metavar='HOURS',
				835	help='number of hours back to search for status'
				836	' (default: %d)' % _DEFAULT_DURATION)
				837	parser.add_argument('--board-notify', action='append',
				838	default=[], metavar='ADDRESS',
				839	help='Generate board inventory message, '
				840	'and send it to the given e-mail address(es)')
				841	parser.add_argument('--pool-notify', action='append',
				842	default=[], metavar='ADDRESS',
				843	help='Generate pool inventory message, '
				844	'and send it to the given address(es)')
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	845	parser.add_argument('-r', '--recommend', type=int, default=None,
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	846	help=('Specify how many DUTs should be '
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	847	'recommended for repair (default: no '
				848	'recommendation)'))
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	849	parser.add_argument('--print', dest='print_', action='store_true',
				850	help='Print e-mail messages on stdout '
				851	'without sending them.')
				852	parser.add_argument('--logdir', default=_get_logdir(argv[0]),
				853	help='Directory where logs will be written.')
				854	parser.add_argument('boardnames', nargs='*',
				855	metavar='BOARD',
				856	help='names of boards to report on '
				857	'(default: all boards)')
				858	arguments = parser.parse_args(argv[1:])
				859	_verify_arguments(arguments)
				860	return arguments
				861
				862
				863	def _configure_logging(arguments):
				864	"""Configure the `logging` module for our needs.
				865
				866	How we log depends on whether the `--print` option was
				867	provided on the command line. Without the option, we log all
				868	messages at DEBUG level or above, and write them to a file in
				869	the directory specified by the `--logdir` option. With the
				870	option, we write log messages to stdout; messages below INFO
				871	level are discarded.
				872
				873	The log file is configured to rotate once a week on Friday
				874	evening, preserving ~3 months worth of history.
				875
				876	@param arguments Command-line arguments as returned by
				877	`ArgumentParser`
				878
				879	"""
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	880	root_logger = logging.getLogger()
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	881	if arguments.print_:
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	882	root_logger.setLevel(logging.INFO)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	883	handler = logging.StreamHandler(sys.stdout)
				884	handler.setFormatter(logging.Formatter())
				885	else:
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	886	root_logger.setLevel(logging.DEBUG)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	887	logfile = os.path.join(arguments.logdir, _LOGFILE)
				888	handler = logging.handlers.TimedRotatingFileHandler(
				889	logfile, when='W4', backupCount=13)
				890	formatter = logging.Formatter(_LOG_FORMAT,
				891	time_utils.TIME_FMT)
				892	handler.setFormatter(formatter)
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	893	# TODO(jrbarnette) This is gross. Importing client.bin.utils
				894	# implicitly imported logging_config, which calls
				895	# logging.basicConfig() at module level. That gives us an
				896	# extra logging handler that we don't want. So, clear out all
				897	# the handlers here.
				898	for h in root_logger.handlers:
				899	root_logger.removeHandler(h)
				900	root_logger.addHandler(handler)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	901
				902
				903	def _populate_board_counts(inventory):
				904	"""Gather board counts while providing interactive feedback.
				905
				906	Gathering the status of all individual DUTs in the lab can take
				907	considerable time (~30 minutes at the time of this writing).
				908
				909	Normally, we pay that cost by querying as we go. However, with
				910	the `--print` option, a human being may be watching the
				911	progress. So, we force the first (expensive) queries to happen
				912	up front, and provide a small ASCII progress bar to give an
				913	indicator of how many boards have been processed.
				914
				915	@param inventory _LabInventory object with the inventory to
				916	be gathered.
				917
				918	"""
				919	n = 0
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	920	total_broken = 0
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	921	for counts in inventory.values():
				922	n += 1
				923	if n % 10 == 5:
				924	c = '+'
				925	elif n % 10 == 0:
				926	c = '%d' % ((n / 10) % 10)
				927	else:
				928	c = '.'
				929	sys.stdout.write(c)
				930	sys.stdout.flush()
				931	# This next call is where all the time goes - it forces all
				932	# of a board's HostJobHistory objects to query the database
				933	# and cache their results.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	934	total_broken += counts.get_broken()
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	935	sys.stdout.write('\n')
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	936	sys.stdout.write('Found %d broken DUTs\n' % total_broken)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	937
				938
				939	def main(argv):
				940	"""Standard main routine.
				941	@param argv Command line arguments including `sys.argv[0]`.
				942	"""
				943	arguments = _parse_command(argv)
				944	_configure_logging(arguments)
				945	try:
				946	end_time = int(time.time())
				947	start_time = end_time - arguments.duration * 60 * 60
				948	timestamp = time.strftime('%Y-%m-%d.%H',
				949	time.localtime(end_time))
				950	logging.debug('Starting lab inventory for %s', timestamp)
				951	if arguments.board_notify:
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	952	if arguments.recommend:
				953	logging.debug('Will include repair recommendations')
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	954	logging.debug('Will include board inventory')
				955	if arguments.pool_notify:
				956	logging.debug('Will include pool inventory')
				957
J. Richard Barnette	a7c514e	2015-09-15 11:13:23 -0700	[diff] [blame^]	958	afe = frontend_wrappers.RetryingAFE(server=None)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	959	inventory = _LabInventory.create_inventory(
				960	afe, start_time, end_time, arguments.boardnames)
				961	logging.info('Found %d hosts across %d boards',
				962	inventory.get_num_duts(),
				963	inventory.get_num_boards())
				964
				965	if arguments.print_:
				966	_populate_board_counts(inventory)
				967
Richard Barnette	1f87ee1	2015-06-09 20:54:14 +0000	[diff] [blame]	968	if arguments.print_ or arguments.board_notify:
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	969	if arguments.recommend:
				970	recommend_message = _generate_repair_recommendation(
				971	inventory, arguments.recommend) + '\n\n\n'
				972	else:
				973	recommend_message = ''
				974	board_message = _generate_board_inventory_message(inventory)
				975	full_message = recommend_message + board_message
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	976	_send_email(arguments,
				977	'boards-%s.txt' % timestamp,
				978	'DUT board inventory %s' % timestamp,
				979	arguments.board_notify,
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	980	full_message)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	981
				982	if arguments.print_ or arguments.pool_notify:
				983	_send_email(arguments,
				984	'pools-%s.txt' % timestamp,
				985	'DUT pool inventory %s' % timestamp,
				986	arguments.pool_notify,
				987	_generate_pool_inventory_message(inventory))
				988	except KeyboardInterrupt:
				989	pass
				990	except EnvironmentError as e:
				991	logging.exception('Unexpected OS error: %s', e)
				992	except Exception as e:
				993	logging.exception('Unexpected exception: %s', e)
				994
				995
				996	if __name__ == '__main__':
				997	main(sys.argv)