Blame - site_utils/lab_inventory.py - platform/external/autotest

blob: 97944a78c22923f9f556d6ba3771db8a66ccb107 [file] [log] [blame]

J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1	#!/usr/bin/env python
				2	# Copyright 2015 The Chromium OS Authors. All rights reserved.
				3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Create e-mail reports of the Lab's DUT inventory.
				7
				8	Gathers a list of all DUTs of interest in the Lab, segregated by
				9	board and pool, and determines whether each DUT is working or
				10	broken. Then, send one or more e-mail reports summarizing the
				11	status to e-mail addresses provided on the command line.
				12
				13	usage: lab_inventory.py [ options ] [ board ... ]
				14
				15	Options:
				16	--duration / -d <hours>
				17	How far back in time to search job history to determine DUT
				18	status.
				19
				20	--board-notify <address>[,<address>]
				21	Send the "board status" e-mail to all the specified e-mail
				22	addresses.
				23
				24	--pool-notify <address>[,<address>]
				25	Send the "pool status" e-mail to all the specified e-mail
				26	addresses.
				27
				28	--logdir <directory>
				29	Log progress and actions in a file under this directory. Text
				30	of any e-mail sent will also be logged in a timestamped file in
				31	this directory.
				32
				33	--print
				34	Suppress all logging and sending e-mail. Instead, write the
				35	output that would be generated onto stdout.
				36
				37	<board> arguments:
				38	With no arguments, gathers the status for all boards in the lab.
				39	With one or more named boards on the command line, restricts
				40	reporting to just those boards.
				41
				42	"""
				43
				44
				45	import argparse
				46	import logging
				47	import logging.handlers
				48	import os
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	49	import re
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	50	import sys
				51	import time
				52
				53	import common
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	54	from autotest_lib.client.bin import utils
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	55	from autotest_lib.client.common_lib import time_utils
				56	from autotest_lib.server import frontend
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	57	from autotest_lib.server.hosts import servo_host
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	58	from autotest_lib.site_utils import gmail_lib
				59	from autotest_lib.site_utils import status_history
				60	from autotest_lib.site_utils.suite_scheduler import constants
				61
				62
				63	# The pools in the Lab that are actually of interest.
				64	#
				65	# These are general purpose pools of DUTs that are considered
				66	# identical for purposes of testing. That is, a device in one of
				67	# these pools can be shifted to another pool at will for purposes
				68	# of supplying test demand.
				69	#
				70	# Devices in these pools are not allowed to have special-purpose
				71	# attachments, or to be part of in any kind of custom fixture.
				72	# Devices in these pools are also required to reside in areas
				73	# managed by the Platforms team (i.e. at the time of this writing,
				74	# only in "Atlantis" or "Destiny").
				75	#
				76	# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
				77	# to guarantee timely completion of tests from builders.
				78	# _SPARE_POOL - A low priority pool that is allowed to provide
				79	# spares to replace broken devices in the critical pools.
				80	# _MANAGED_POOLS - The set of all the general purpose pools
				81	# monitored by this script.
				82
				83	_CRITICAL_POOLS = ['bvt', 'cq']
				84	_SPARE_POOL = 'suites'
				85	_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
				86
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	87	# _DEFAULT_DURATION:
				88	# Default value used for the --duration command line option.
				89	# Specifies how far back in time to search in order to determine
				90	# DUT status.
				91
				92	_DEFAULT_DURATION = 24
				93
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	94	# _LOGDIR:
				95	# Relative path used in the calculation of the default setting
				96	# for the --logdir option. The full path is relative to
				97	# the root of the autotest directory, as determined from
				98	# sys.argv[0].
				99	# _LOGFILE:
				100	# Basename of a file to which general log information will be
				101	# written.
				102	# _LOG_FORMAT:
				103	# Format string for log messages.
				104
				105	_LOGDIR = os.path.join('logs', 'dut-data')
				106	_LOGFILE = 'lab-inventory.log'
				107	_LOG_FORMAT = '%(asctime)s \| %(levelname)-10s \| %(message)s'
				108
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	109	# _DEFAULT_NUM_RECOMMEND:
				110	# The default setting for the --recommend option. That option
				111	# determines how many DUTs will be listed in the output produced
				112	# by `_generate_repair_recommendation()`.
				113	_DEFAULT_NUM_RECOMMEND = 10
				114
				115	# Pattern describing location-based host names in the Chrome OS test
				116	# labs. Each DUT hostname designates the DUT's location:
				117	# * A lab (room) that's physically separated from other labs
				118	# (i.e. there's a door).
				119	# * A row (or aisle) of DUTs within the lab.
				120	# * A vertical rack of shelves on the row.
				121	# * A specific host on one shelf of the rack.
				122
				123	_HOSTNAME_PATTERN = re.compile(
				124	r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
				125
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	126
				127	class _PoolCounts(object):
				128	"""Maintains a set of `HostJobHistory` objects for a pool.
				129
				130	The collected history objects are nominally all part of a single
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	131	scheduling pool of DUTs. The collection maintains a list of
				132	working DUTs, a list of broken DUTs, and a list of all DUTs.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	133
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	134	Performance note: Certain methods in this class are potentially
				135	expensive:
				136	* `get_working()`
				137	* `get_working_list()`
				138	* `get_broken()`
				139	* `get_broken_list()`
				140	The first time any one of these methods is called, it causes
				141	multiple RPC calls with a relatively expensive set of database
				142	queries. However, the results of the queries are cached in the
				143	individual `HostJobHistory` objects, so only the first call
				144	actually pays the full cost.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	145
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	146	Additionally, `get_working_list()` and `get_broken_list()` both
				147	cache their return values to avoid recalculating lists at every
				148	call; this caching is separate from the caching of RPC results
				149	described above.
				150
				151	This class is deliberately constructed to delay the RPC cost
				152	until the accessor methods are called (rather than to query in
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	153	`record_host()`) so that it's possible to construct a complete
				154	`_LabInventory` without making the expensive queries at creation
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	155	time. `_populate_board_counts()`, below, assumes this behavior.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	156
				157	"""
				158
				159	def __init__(self):
				160	self._histories = []
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	161	self._working_list = None
				162	self._broken_list = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	163
				164
				165	def record_host(self, host_history):
				166	"""Add one `HostJobHistory` object to the collection.
				167
				168	@param host_history The `HostJobHistory` object to be
				169	remembered.
				170
				171	"""
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	172	self._working_list = None
				173	self._broken_list = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	174	self._histories.append(host_history)
				175
				176
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	177	def get_working_list(self):
				178	"""Return a list of all working DUTs in the pool.
				179
				180	Filter `self._histories` for histories where the last
				181	diagnosis is `WORKING`.
				182
				183	Cache the result so that we only cacluate it once.
				184
				185	@return A list of HostJobHistory objects.
				186
				187	"""
				188	if self._working_list is None:
				189	self._working_list = [h for h in self._histories
				190	if h.last_diagnosis()[0] == status_history.WORKING]
				191	return self._working_list
				192
				193
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	194	def get_working(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	195	"""Return the number of working DUTs in the pool."""
				196	return len(self.get_working_list())
				197
				198
				199	def get_broken_list(self):
				200	"""Return a list of all broken DUTs in the pool.
				201
				202	Filter `self._histories` for histories where the last
				203	diagnosis is not `WORKING`.
				204
				205	Cache the result so that we only cacluate it once.
				206
				207	@return A list of HostJobHistory objects.
				208
				209	"""
				210	if self._broken_list is None:
				211	self._broken_list = [h for h in self._histories
				212	if h.last_diagnosis()[0] != status_history.WORKING]
				213	return self._broken_list
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	214
				215
				216	def get_broken(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	217	"""Return the number of broken DUTs in the pool."""
				218	return len(self.get_broken_list())
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	219
				220
				221	def get_total(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	222	"""Return the total number of DUTs in the pool."""
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	223	return len(self._histories)
				224
				225
				226	class _BoardCounts(object):
				227	"""Maintains a set of `HostJobHistory` objects for a board.
				228
				229	The collected history objects are nominally all of the same
				230	board. The collection maintains a count of working DUTs, a
				231	count of broken DUTs, and a total count. The counts can be
				232	obtained either for a single pool, or as a total across all
				233	pools.
				234
				235	DUTs in the collection must be assigned to one of the pools
				236	in `_MANAGED_POOLS`.
				237
				238	The `get_working()` and `get_broken()` methods rely on the
				239	methods of the same name in _PoolCounts, so the performance
				240	note in _PoolCounts applies here as well.
				241
				242	"""
				243
				244	def __init__(self):
				245	self._pools = {
				246	pool: _PoolCounts() for pool in _MANAGED_POOLS
				247	}
				248
				249	def record_host(self, host_history):
				250	"""Add one `HostJobHistory` object to the collection.
				251
				252	@param host_history The `HostJobHistory` object to be
				253	remembered.
				254
				255	"""
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	256	pool = host_history.host_pool
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	257	self._pools[pool].record_host(host_history)
				258
				259
				260	def _count_pool(self, get_pool_count, pool=None):
				261	"""Internal helper to count hosts in a given pool.
				262
				263	The `get_pool_count` parameter is a function to calculate
				264	the exact count of interest for the pool.
				265
				266	@param get_pool_count Function to return a count from a
				267	_PoolCount object.
				268	@param pool The pool to be counted. If `None`,
				269	return the total across all pools.
				270
				271	"""
				272	if pool is None:
				273	return sum([get_pool_count(counts)
				274	for counts in self._pools.values()])
				275	else:
				276	return get_pool_count(self._pools[pool])
				277
				278
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	279	def get_working_list(self):
				280	"""Return a list of all working DUTs for the board.
				281
				282	Go through all HostJobHistory objects in the board's pools,
				283	selecting the ones where the last diagnosis is `WORKING`.
				284
				285	@return A list of HostJobHistory objects.
				286
				287	"""
				288	l = []
				289	for p in self._pools.values():
				290	l.extend(p.get_working_list())
				291	return l
				292
				293
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	294	def get_working(self, pool=None):
				295	"""Return the number of working DUTs in a pool.
				296
				297	@param pool The pool to be counted. If `None`, return the
				298	total across all pools.
				299
				300	"""
				301	return self._count_pool(_PoolCounts.get_working, pool)
				302
				303
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	304	def get_broken_list(self):
				305	"""Return a list of all broken DUTs for the board.
				306
				307	Go through all HostJobHistory objects in the board's pools,
				308	selecting the ones where the last diagnosis is not
				309	`WORKING`.
				310
				311	@return A list of HostJobHistory objects.
				312
				313	"""
				314	l = []
				315	for p in self._pools.values():
				316	l.extend(p.get_broken_list())
				317	return l
				318
				319
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	320	def get_broken(self, pool=None):
				321	"""Return the number of broken DUTs in a pool.
				322
				323	@param pool The pool to be counted. If `None`, return the
				324	total across all pools.
				325
				326	"""
				327	return self._count_pool(_PoolCounts.get_broken, pool)
				328
				329
				330	def get_total(self, pool=None):
				331	"""Return the total number of DUTs in a pool.
				332
				333	@param pool The pool to be counted. If `None`, return the
				334	total across all pools.
				335
				336	"""
				337	return self._count_pool(_PoolCounts.get_total, pool)
				338
				339
				340	class _LabInventory(dict):
				341	"""Collection of `HostJobHistory` objects for the Lab's inventory.
				342
				343	The collection is indexed by board. Indexing returns the
				344	_BoardCounts object associated with the board.
				345
				346	The collection is also iterable. The iterator returns all the
				347	boards in the inventory, in unspecified order.
				348
				349	"""
				350
				351	@classmethod
				352	def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
				353	"""Return a Lab inventory with specified parameters.
				354
				355	By default, gathers inventory from `HostJobHistory` objects
				356	for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
				357	is supplied, the inventory will be restricted to only the
				358	given boards.
				359
				360	@param afe AFE object for constructing the
				361	`HostJobHistory` objects.
				362	@param start_time Start time for the `HostJobHistory`
				363	objects.
				364	@param end_time End time for the `HostJobHistory`
				365	objects.
				366	@param boardlist List of boards to include. If empty,
				367	include all available boards.
				368	@return A `_LabInventory` object for the specified boards.
				369
				370	"""
				371	label_list = [constants.Labels.POOL_PREFIX + l
				372	for l in _MANAGED_POOLS]
				373	afehosts = afe.get_hosts(labels__name__in=label_list)
				374	if boardlist:
				375	boardhosts = []
				376	for board in boardlist:
				377	board_label = constants.Labels.BOARD_PREFIX + board
				378	host_list = [h for h in afehosts
				379	if board_label in h.labels]
				380	boardhosts.extend(host_list)
				381	afehosts = boardhosts
				382	create = lambda host: (
				383	status_history.HostJobHistory(afe, host,
				384	start_time, end_time))
				385	return cls([create(host) for host in afehosts])
				386
				387
				388	def __init__(self, histories):
J. Richard Barnette	6948ed3	2015-05-06 08:57:10 -0700	[diff] [blame]	389	# N.B. The query that finds our hosts is restricted to those
				390	# with a valid pool: label, but doesn't check for a valid
				391	# board: label. In some (insufficiently) rare cases, the
				392	# AFE hosts table has been known to (incorrectly) have DUTs
				393	# with a pool: but no board: label. We explicitly exclude
				394	# those here.
				395	histories = [h for h in histories
				396	if h.host_board is not None]
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	397	boards = set([h.host_board for h in histories])
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	398	initval = { board: _BoardCounts() for board in boards }
				399	super(_LabInventory, self).__init__(initval)
				400	self._dut_count = len(histories)
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	401	self._board_counts = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	402	for h in histories:
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	403	self[h.host_board].record_host(h)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	404
				405
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	406	def get_working_list(self):
				407	"""Return a list of all working DUTs in the inventory.
				408
				409	Go through all HostJobHistory objects in the inventory,
				410	selecting the ones where the last diagnosis is `WORKING`.
				411
				412	@return A list of HostJobHistory objects.
				413
				414	"""
				415	l = []
				416	for counts in self.values():
				417	l.extend(counts.get_working_list())
				418	return l
				419
				420
				421	def get_broken_list(self):
				422	"""Return a list of all broken DUTs in the inventory.
				423
				424	Go through all HostJobHistory objects in the inventory,
				425	selecting the ones where the last diagnosis is not
				426	`WORKING`.
				427
				428	@return A list of HostJobHistory objects.
				429
				430	"""
				431	l = []
				432	for counts in self.values():
				433	l.extend(counts.get_broken_list())
				434	return l
				435
				436
				437	def get_board_counts(self):
				438	"""Calculate a summary of board counts.
				439
				440	The summary is a list of tuples. The tuple elements, in
				441	order, are:
				442	* board - The name of the board associated with the
				443	counts.
				444	* buffer - The buffer of working spares (the total number
				445	of spares, less the number of broken DUTs).
				446	* broken - The number of broken DUTs.
				447	* working - The number of working DUTs.
				448	* spares - The number of DUTs in the spares pool.
				449	* total - The the total number of DUTs.
				450
				451	Boards with no DUTs in the spares pool or no DUTs in a
				452	critical pool will be excluded from the listed counts.
				453
				454	The ordering of the boards is unspecified.
				455
				456	@param inventory The inventory to be summarized.
				457	@return A list of tuples with board data.
				458
				459	"""
				460	if self._board_counts is None:
				461	self._board_counts = []
				462	for board, counts in self.items():
				463	logging.debug('Counting inventory for %s', board)
				464	spares = counts.get_total(_SPARE_POOL)
				465	total = counts.get_total()
				466	if spares == 0 or spares == total:
				467	continue
				468	working = counts.get_working()
				469	broken = counts.get_broken()
				470	spare_buffer = spares - broken
				471	element = (board, spare_buffer, broken, working,
				472	spares, total)
				473	self._board_counts.append(element)
				474	return self._board_counts
				475
				476
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	477	def get_num_duts(self):
				478	"""Return the total number of DUTs in the inventory."""
				479	return self._dut_count
				480
				481
				482	def get_num_boards(self):
				483	"""Return the total number of boards in the inventory."""
				484	return len(self)
				485
				486
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	487	def _sort_by_location(inventory_list):
				488	"""Return a list of DUTs, organized by location.
				489
				490	Take the given list of `HostJobHistory` objects, separate it
				491	into a list per lab, and sort each lab's list by location. The
				492	order of sorting within a lab is
				493	* By row number within the lab,
				494	* then by rack number within the row,
				495	* then by host shelf number within the rack.
				496
				497	Return a list of the sorted lists.
				498
				499	Implementation note: host locations are sorted by converting
				500	each location into a base 100 number. If row, rack or
				501	host numbers exceed the range [0..99], then sorting will
				502	break down.
				503
				504	@return A list of sorted lists of DUTs.
				505
				506	"""
				507	BASE = 100
				508	lab_lists = {}
				509	for history in inventory_list:
				510	location = _HOSTNAME_PATTERN.match(history.host.hostname)
				511	if location:
				512	lab = location.group(1)
				513	key = 0
				514	for idx in location.group(2, 3, 4):
				515	key = BASE * key + int(idx)
				516	lab_lists.setdefault(lab, []).append((key, history))
				517	return_list = []
				518	for dut_list in lab_lists.values():
				519	dut_list.sort(key=lambda t: t[0])
				520	return_list.append([t[1] for t in dut_list])
				521	return return_list
				522
				523
				524	def _score_repair_set(buffer_counts, repair_list):
				525	"""Return a numeric score rating a set of DUTs to be repaired.
				526
				527	`buffer_counts` is a dictionary mapping board names to the
				528	size of the board's spares buffer.
				529
				530	`repair_list` is a list of DUTs to be repaired.
				531
				532	This function calculates the new set of buffer counts that would
				533	result from the proposed repairs, and scores the new set using
				534	two numbers:
				535	* Worst case buffer count for any board (higher is better).
				536	This is the more siginficant number for comparison.
				537	* Number of boards at the worst case (lower is better). This
				538	is the less significant number.
				539
				540	Implementation note: The score could fail to reflect the
				541	intended criteria if there are more than 1000 boards in the
				542	inventory.
				543
				544	@param spare_counts A dictionary mapping boards to buffer counts.
				545	@param repair_list A list of boards to be repaired.
				546	@return A numeric score.
				547
				548	"""
				549	# Go through `buffer_counts`, and create a list of new counts
				550	# that records the buffer count for each board after repair.
				551	# The new list of counts discards the board names, as they don't
				552	# contribute to the final score.
				553	_NBOARDS = 1000
				554	repair_inventory = _LabInventory(repair_list)
				555	new_counts = []
				556	for b, c in buffer_counts.items():
				557	if b in repair_inventory:
				558	newcount = repair_inventory[b].get_total()
				559	else:
				560	newcount = 0
				561	new_counts.append(c + newcount)
				562	# Go through the new list of counts. Find the worst available
				563	# spares count, and count how many times that worst case occurs.
				564	worst_count = new_counts[0]
				565	num_worst = 1
				566	for c in new_counts[1:]:
				567	if c == worst_count:
				568	num_worst += 1
				569	elif c < worst_count:
				570	worst_count = c
				571	num_worst = 1
				572	# Return the calculated score
				573	return _NBOARDS * worst_count - num_worst
				574
				575
				576	def _generate_repair_recommendation(inventory, num_recommend):
				577	"""Return a summary of selected DUTs needing repair.
				578
				579	Returns a message recommending a list of broken DUTs to be
				580	repaired. The list of DUTs is selected based on these
				581	criteria:
				582	* No more than `num_recommend` DUTs will be listed.
				583	* All DUTs must be in the same lab.
				584	* DUTs should be selected for some degree of physical
				585	proximity.
				586	* DUTs for boards with a low spares buffer are more important
				587	than DUTs with larger buffers.
				588
				589	The algorithm used will guarantee that at least one DUT from a
				590	board with the smallest spares buffer will be recommended. If
				591	the worst spares buffer number is shared by more than one board,
				592	the algorithm will tend to prefer repair sets that include more
				593	of those boards over sets that cover fewer boards.
				594
				595	"""
				596	logging.debug('Creating DUT repair recommendations')
				597	board_counts = inventory.get_board_counts()
				598	# t[0] - board name
				599	# t[1] - size of spares buffer
				600	# t[2] - number of broken devices
				601	board_buffer_counts = {t[0]: t[1] for t in board_counts
				602	if t[2] != 0}
				603	recommendation = None
				604	best_score = None
				605	# N.B. The logic of this loop may seem complicated, but
				606	# simplification is hard:
				607	# * Calculating an initial recommendation outside of
				608	# the loop likely would make things more complicated,
				609	# not less.
				610	# * It's necessary to calculate an initial lab slice once per
				611	# lab _before_ the while loop, in case the number of broken
				612	# DUTs in a lab is less than `num_recommend`.
				613	for lab_duts in _sort_by_location(inventory.get_broken_list()):
				614	start = 0
				615	end = num_recommend
				616	lab_slice = lab_duts[start : end]
				617	lab_score = _score_repair_set(board_buffer_counts,
				618	lab_slice)
				619	while end < len(lab_duts):
				620	start += 1
				621	end += 1
				622	new_slice = lab_duts[start : end]
				623	new_score = _score_repair_set(board_buffer_counts,
				624	new_slice)
				625	if new_score > lab_score:
				626	lab_slice = new_slice
				627	lab_score = new_score
				628	if recommendation is None or lab_score > best_score:
				629	recommendation = lab_slice
				630	best_score = lab_score
				631	message = ['%-30s %-16s %s' % (
				632	'Hostname', 'Board', 'Servo instructions')]
				633	for h in recommendation:
				634	servo_name = servo_host.make_servo_hostname(h.host.hostname)
				635	if utils.host_is_in_lab_zone(servo_name):
				636	servo_message = 'Repair servo first'
				637	else:
				638	servo_message = 'No servo present'
				639	line = '%-30s %-16s %s' % (
				640	h.host.hostname, h.host_board, servo_message)
				641	message.append(line)
				642	return '\n'.join(message)
				643
				644
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	645	def _generate_board_inventory_message(inventory):
				646	"""Generate the "board inventory" e-mail message.
				647
				648	The board inventory is a list by board summarizing the number
				649	of working and broken DUTs, and the total shortfall or surplus
				650	of working devices relative to the minimum critical pool
				651	requirement.
				652
				653	The report omits boards with no DUTs in the spare pool or with
				654	no DUTs in a critical pool.
				655
				656	N.B. For sample output text formattted as users can expect to
				657	see it in e-mail and log files, refer to the unit tests.
				658
				659	@param inventory _LabInventory object with the inventory to
				660	be reported on.
				661	@return String with the inventory message to be sent.
				662
				663	"""
				664	logging.debug('Creating board inventory')
				665	message = []
				666	message.append(
				667	'%-20s %5s %5s %5s %5s %5s' % (
				668	'Board', 'Avail', 'Bad', 'Good', 'Spare', 'Total'))
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	669	data_list = inventory.get_board_counts()
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	670	data_list = sorted(sorted(data_list, key=lambda t: -t[2]),
				671	key=lambda t: t[1])
				672	message.extend(
				673	['%-20s %5d %5d %5d %5d %5d' % t for t in data_list])
				674	return '\n'.join(message)
				675
				676
J. Richard Barnette	4845fcf	2015-04-20 14:26:25 -0700	[diff] [blame]	677	_POOL_INVENTORY_HEADER = '''\
J. Richard Barnette	c9a143c	2015-06-04 11:11:19 -0700	[diff] [blame^]	678	Notice to Infrastructure deputies: If any boards are shown at
				679	less than full strength, please take action to resolve the issues.
				680	Once you're satisified that failures won't recur, failed DUTs can
				681	be replaced with spares by running `balance_pool`. Detailed
				682	instructions can be found here:
J. Richard Barnette	4845fcf	2015-04-20 14:26:25 -0700	[diff] [blame]	683	http://go/cros-manage-duts
				684	'''
				685
				686
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    After the `_POOL_INVENTORY_HEADER` text, the message has one
    section for each pool in `_CRITICAL_POOLS`.  A section lists,
    for every board with at least one broken DUT in that pool, the
    numbers of broken and working DUTs and the pool total, ordered
    by decreasing number of broken DUTs.  A pool with no broken
    DUTs gets a one line note instead of a table.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        lines.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s' % (
                'Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken:
                rows.append((board, broken,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        lines.extend('%-20s %5d %5d %5d' % row for row in rows)
    return '\n'.join(lines)
				729
				730
				731	def _send_email(arguments, tag, subject, recipients, body):
				732	"""Send an inventory e-mail message.
				733
				734	The message is logged in the selected log directory using `tag`
				735	for the file name.
				736
				737	If the --print option was requested, the message is neither
				738	logged nor sent, but merely printed on stdout.
				739
				740	@param arguments Parsed command-line options.
				741	@param tag Tag identifying the inventory for logging
				742	purposes.
				743	@param subject E-mail Subject: header line.
				744	@param recipients E-mail addresses for the To: header line.
				745	@param body E-mail message body.
				746
				747	"""
				748	logging.debug('Generating email: "%s"', subject)
				749	all_recipients = ', '.join(recipients)
				750	report_body = '\n'.join([
				751	'To: %s' % all_recipients,
				752	'Subject: %s' % subject,
				753	'', body, ''])
				754	if arguments.print_:
				755	print report_body
				756	else:
				757	filename = os.path.join(arguments.logdir, tag)
				758	try:
				759	report_file = open(filename, 'w')
				760	report_file.write(report_body)
				761	report_file.close()
				762	except EnvironmentError as e:
				763	logging.error('Failed to write %s: %s', filename, e)
				764	try:
				765	gmail_lib.send_email(all_recipients, subject, body)
				766	except Exception as e:
				767	logging.error('Failed to send e-mail to %s: %s',
				768	all_recipients, e)
				769
				770
				771	def _separate_email_addresses(address_list):
				772	"""Parse a list of comma-separated lists of e-mail addresses.
				773
				774	@param address_list A list of strings containing comma
				775	separate e-mail addresses.
				776	@return A list of the individual e-mail addresses.
				777
				778	"""
				779	newlist = []
				780	for arg in address_list:
				781	newlist.extend([email.strip() for email in arg.split(',')])
				782	return newlist
				783
				784
				785	def _verify_arguments(arguments):
				786	"""Validate command-line arguments.
				787
				788	Join comma separated e-mail addresses for `--board-notify` and
				789	`--pool-notify` in separate option arguments into a single list.
				790
				791	@param arguments Command-line arguments as returned by
				792	`ArgumentParser`
				793
				794	"""
				795	arguments.board_notify = _separate_email_addresses(
				796	arguments.board_notify)
				797	arguments.pool_notify = _separate_email_addresses(
				798	arguments.pool_notify)
				799
				800
				801	def _get_logdir(script):
				802	"""Get the default directory for the `--logdir` option.
				803
				804	The default log directory is based on the parent directory
				805	containing this script.
				806
				807	@param script Path to this script file.
				808	@return A path to a directory.
				809
				810	"""
				811	basedir = os.path.dirname(os.path.abspath(script))
				812	basedir = os.path.dirname(basedir)
				813	return os.path.join(basedir, _LOGDIR)
				814
				815
				816	def _parse_command(argv):
				817	"""Parse the command line arguments.
				818
				819	Create an argument parser for this command's syntax, parse the
				820	command line, and return the result of the ArgumentParser
				821	parse_args() method.
				822
				823	@param argv Standard command line argument vector; argv[0] is
				824	assumed to be the command name.
				825	@return Result returned by ArgumentParser.parse_args().
				826
				827	"""
				828	parser = argparse.ArgumentParser(
				829	prog=argv[0],
				830	description='Gather and report lab inventory statistics')
				831	parser.add_argument('-d', '--duration', type=int,
				832	default=_DEFAULT_DURATION, metavar='HOURS',
				833	help='number of hours back to search for status'
				834	' (default: %d)' % _DEFAULT_DURATION)
				835	parser.add_argument('--board-notify', action='append',
				836	default=[], metavar='ADDRESS',
				837	help='Generate board inventory message, '
				838	'and send it to the given e-mail address(es)')
				839	parser.add_argument('--pool-notify', action='append',
				840	default=[], metavar='ADDRESS',
				841	help='Generate pool inventory message, '
				842	'and send it to the given address(es)')
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	843	parser.add_argument('--recommend-notify', action='append',
				844	default=[], metavar='ADDRESS',
				845	help='Generate repair recommendations, '
				846	'and send it to the given address(es)')
				847	parser.add_argument('-r', '--recommend', type=int,
				848	default=_DEFAULT_NUM_RECOMMEND,
				849	help=('Specify how many DUTs should be '
				850	'recommended for repair (default: %d)' %
				851	_DEFAULT_NUM_RECOMMEND))
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	852	parser.add_argument('--print', dest='print_', action='store_true',
				853	help='Print e-mail messages on stdout '
				854	'without sending them.')
				855	parser.add_argument('--logdir', default=_get_logdir(argv[0]),
				856	help='Directory where logs will be written.')
				857	parser.add_argument('boardnames', nargs='*',
				858	metavar='BOARD',
				859	help='names of boards to report on '
				860	'(default: all boards)')
				861	arguments = parser.parse_args(argv[1:])
				862	_verify_arguments(arguments)
				863	return arguments
				864
				865
				866	def _configure_logging(arguments):
				867	"""Configure the `logging` module for our needs.
				868
				869	How we log depends on whether the `--print` option was
				870	provided on the command line. Without the option, we log all
				871	messages at DEBUG level or above, and write them to a file in
				872	the directory specified by the `--logdir` option. With the
				873	option, we write log messages to stdout; messages below INFO
				874	level are discarded.
				875
				876	The log file is configured to rotate once a week on Friday
				877	evening, preserving ~3 months worth of history.
				878
				879	@param arguments Command-line arguments as returned by
				880	`ArgumentParser`
				881
				882	"""
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	883	root_logger = logging.getLogger()
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	884	if arguments.print_:
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	885	root_logger.setLevel(logging.INFO)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	886	handler = logging.StreamHandler(sys.stdout)
				887	handler.setFormatter(logging.Formatter())
				888	else:
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	889	root_logger.setLevel(logging.DEBUG)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	890	logfile = os.path.join(arguments.logdir, _LOGFILE)
				891	handler = logging.handlers.TimedRotatingFileHandler(
				892	logfile, when='W4', backupCount=13)
				893	formatter = logging.Formatter(_LOG_FORMAT,
				894	time_utils.TIME_FMT)
				895	handler.setFormatter(formatter)
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	896	# TODO(jrbarnette) This is gross. Importing client.bin.utils
				897	# implicitly imported logging_config, which calls
				898	# logging.basicConfig() at module level. That gives us an
				899	# extra logging handler that we don't want. So, clear out all
				900	# the handlers here.
				901	for h in root_logger.handlers:
				902	root_logger.removeHandler(h)
				903	root_logger.addHandler(handler)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	904
				905
				906	def _populate_board_counts(inventory):
				907	"""Gather board counts while providing interactive feedback.
				908
				909	Gathering the status of all individual DUTs in the lab can take
				910	considerable time (~30 minutes at the time of this writing).
				911
				912	Normally, we pay that cost by querying as we go. However, with
				913	the `--print` option, a human being may be watching the
				914	progress. So, we force the first (expensive) queries to happen
				915	up front, and provide a small ASCII progress bar to give an
				916	indicator of how many boards have been processed.
				917
				918	@param inventory _LabInventory object with the inventory to
				919	be gathered.
				920
				921	"""
				922	n = 0
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	923	total_broken = 0
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	924	for counts in inventory.values():
				925	n += 1
				926	if n % 10 == 5:
				927	c = '+'
				928	elif n % 10 == 0:
				929	c = '%d' % ((n / 10) % 10)
				930	else:
				931	c = '.'
				932	sys.stdout.write(c)
				933	sys.stdout.flush()
				934	# This next call is where all the time goes - it forces all
				935	# of a board's HostJobHistory objects to query the database
				936	# and cache their results.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	937	total_broken += counts.get_broken()
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	938	sys.stdout.write('\n')
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	939	sys.stdout.write('Found %d broken DUTs\n' % total_broken)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	940
				941
				942	def main(argv):
				943	"""Standard main routine.
				944	@param argv Command line arguments including `sys.argv[0]`.
				945	"""
				946	arguments = _parse_command(argv)
				947	_configure_logging(arguments)
				948	try:
				949	end_time = int(time.time())
				950	start_time = end_time - arguments.duration * 60 * 60
				951	timestamp = time.strftime('%Y-%m-%d.%H',
				952	time.localtime(end_time))
				953	logging.debug('Starting lab inventory for %s', timestamp)
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	954	if arguments.recommend_notify:
				955	logging.debug('Will include repair recommendations')
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	956	if arguments.board_notify:
				957	logging.debug('Will include board inventory')
				958	if arguments.pool_notify:
				959	logging.debug('Will include pool inventory')
				960
				961	afe = frontend.AFE(server=None)
				962	inventory = _LabInventory.create_inventory(
				963	afe, start_time, end_time, arguments.boardnames)
				964	logging.info('Found %d hosts across %d boards',
				965	inventory.get_num_duts(),
				966	inventory.get_num_boards())
				967
				968	if arguments.print_:
				969	_populate_board_counts(inventory)
				970
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	971	if arguments.print_ or arguments.recommend_notify:
				972	recommend_message = _generate_repair_recommendation(
				973	inventory, arguments.recommend)
				974	_send_email(arguments,
				975	'recommend-%s.txt' % timestamp,
				976	'DUT repair recommendations %s' % timestamp,
				977	arguments.recommend_notify,
				978	recommend_message)
				979
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	980	if arguments.print_ or arguments.board_notify:
				981	_send_email(arguments,
				982	'boards-%s.txt' % timestamp,
				983	'DUT board inventory %s' % timestamp,
				984	arguments.board_notify,
				985	_generate_board_inventory_message(inventory))
				986
				987	if arguments.print_ or arguments.pool_notify:
				988	_send_email(arguments,
				989	'pools-%s.txt' % timestamp,
				990	'DUT pool inventory %s' % timestamp,
				991	arguments.pool_notify,
				992	_generate_pool_inventory_message(inventory))
				993	except KeyboardInterrupt:
				994	pass
				995	except EnvironmentError as e:
				996	logging.exception('Unexpected OS error: %s', e)
				997	except Exception as e:
				998	logging.exception('Unexpected exception: %s', e)
				999
				1000
# Run the inventory only when this file is executed as a script;
# importing it as a module has no side effects.
if __name__ == '__main__':
    main(sys.argv)