Blame - site_utils/lab_inventory.py - platform/external/autotest

blob: cddb4895f81e73688d176147a849517bbf0c9254 [file] [log] [blame]

J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1	#!/usr/bin/env python
				2	# Copyright 2015 The Chromium OS Authors. All rights reserved.
				3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Create e-mail reports of the Lab's DUT inventory.
				7
				8	Gathers a list of all DUTs of interest in the Lab, segregated by
				9	board and pool, and determines whether each DUT is working or
				10	broken. Then, send one or more e-mail reports summarizing the
				11	status to e-mail addresses provided on the command line.
				12
				13	usage: lab_inventory.py [ options ] [ board ... ]
				14
				15	Options:
				16	--duration / -d <hours>
				17	How far back in time to search job history to determine DUT
				18	status.
				19
				20	--board-notify <address>[,<address>]
				21	Send the "board status" e-mail to all the specified e-mail
				22	addresses.
				23
				24	--pool-notify <address>[,<address>]
				25	Send the "pool status" e-mail to all the specified e-mail
				26	addresses.
				27
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	28	--recommend <number>
				29	When generating the "board status" e-mail, include a list of
				30	<number> specific DUTs to be recommended for repair.
				31
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	32	--logdir <directory>
				33	Log progress and actions in a file under this directory. Text
				34	of any e-mail sent will also be logged in a timestamped file in
				35	this directory.
				36
J. Richard Barnette	02e8243	2015-10-13 16:02:47 -0700	[diff] [blame]	37	--debug
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	38	Suppress all logging and sending e-mail. Instead, write the
				39	output that would be generated onto stdout.
				40
				41	<board> arguments:
				42	With no arguments, gathers the status for all boards in the lab.
				43	With one or more named boards on the command line, restricts
				44	reporting to just those boards.
				45
				46	"""
				47
				48
				49	import argparse
				50	import logging
				51	import logging.handlers
				52	import os
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	53	import re
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	54	import sys
				55	import time
				56
				57	import common
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	58	from autotest_lib.client.bin import utils
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	59	from autotest_lib.client.common_lib import time_utils
J. Richard Barnette	a7c514e	2015-09-15 11:13:23 -0700	[diff] [blame]	60	from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	61	from autotest_lib.server.hosts import servo_host
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	62	from autotest_lib.site_utils import gmail_lib
				63	from autotest_lib.site_utils import status_history
				64	from autotest_lib.site_utils.suite_scheduler import constants
				65
				66
				67	# The pools in the Lab that are actually of interest.
				68	#
				69	# These are general purpose pools of DUTs that are considered
				70	# identical for purposes of testing. That is, a device in one of
				71	# these pools can be shifted to another pool at will for purposes
				72	# of supplying test demand.
				73	#
				74	# Devices in these pools are not allowed to have special-purpose
				75	# attachments, or to be part of any kind of custom fixture.
				76	# Devices in these pools are also required to reside in areas
				77	# managed by the Platforms team (i.e. at the time of this writing,
				78	# only in "Atlantis" or "Destiny").
				79	#
				80	# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
				81	# to guarantee timely completion of tests from builders.
				82	# _SPARE_POOL - A low priority pool that is allowed to provide
				83	# spares to replace broken devices in the critical pools.
				84	# _MANAGED_POOLS - The set of all the general purpose pools
				85	# monitored by this script.
				86
J. Richard Barnette	222d7f4	2015-12-14 17:22:51 -0800	[diff] [blame]	87	_CRITICAL_POOLS = ['bvt', 'cq', 'continuous']
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	88	_SPARE_POOL = 'suites'
				89	_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
				90
J. Richard Barnette	b8bc570c	2016-03-17 17:03:57 -0700	[diff] [blame]	91	# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
				92	# monitoring by this script. Currently, we're excluding any
				93	# 'adb' host, because we're not ready to monitor Android or
				94	# Brillo hosts.
				95	_EXCLUDED_LABELS = set(['adb'])
				96
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	97	# _DEFAULT_DURATION:
				98	# Default value used for the --duration command line option.
				99	# Specifies how far back in time to search in order to determine
				100	# DUT status.
				101
				102	_DEFAULT_DURATION = 24
				103
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	104	# _LOGDIR:
				105	# Relative path used in the calculation of the default setting
				106	# for the --logdir option. The full path path is relative to
				107	# the root of the autotest directory, as determined from
				108	# sys.argv[0].
				109	# _LOGFILE:
				110	# Basename of a file to which general log information will be
				111	# written.
				112	# _LOG_FORMAT:
				113	# Format string for log messages.
				114
				115	_LOGDIR = os.path.join('logs', 'dut-data')
				116	_LOGFILE = 'lab-inventory.log'
				117	_LOG_FORMAT = '%(asctime)s \| %(levelname)-10s \| %(message)s'
				118
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	119	# Pattern describing location-based host names in the Chrome OS test
				120	# labs. Each DUT hostname designates the DUT's location:
				121	# * A lab (room) that's physically separated from other labs
				122	# (i.e. there's a door).
				123	# * A row (or aisle) of DUTs within the lab.
				124	# * A vertical rack of shelves on the row.
				125	# * A specific host on one shelf of the rack.
				126
				127	_HOSTNAME_PATTERN = re.compile(
				128	r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
				129
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	130
				131	class _PoolCounts(object):
				132	"""Maintains a set of `HostJobHistory` objects for a pool.
				133
				134	The collected history objects are nominally all part of a single
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	135	scheduling pool of DUTs. The collection maintains a list of
				136	working DUTs, a list of broken DUTs, and a list of all DUTs.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	137
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	138	Performance note: Certain methods in this class are potentially
				139	expensive:
				140	* `get_working()`
				141	* `get_working_list()`
				142	* `get_broken()`
				143	* `get_broken_list()`
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	144	* `get_idle()`
				145	* `get_idle_list()`
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	146	The first time any one of these methods is called, it causes
				147	multiple RPC calls with a relatively expensive set of database
				148	queries. However, the results of the queries are cached in the
				149	individual `HostJobHistory` objects, so only the first call
				150	actually pays the full cost.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	151
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	152	Additionally, `get_working_list()`, `get_broken_list()` and
				153	`get_idle_list()` cache their return values to avoid recalculating
				154	lists at every call; this caching is separate from the caching of RPC
				155	results described above.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	156
				157	This class is deliberately constructed to delay the RPC cost
				158	until the accessor methods are called (rather than to query in
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	159	`record_host()`) so that it's possible to construct a complete
				160	`_LabInventory` without making the expensive queries at creation
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	161	time. `_populate_board_counts()`, below, assumes this behavior.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	162
				163	"""
				164
				165	def __init__(self):
				166	self._histories = []
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	167	self._working_list = None
				168	self._broken_list = None
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	169	self._idle_list = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	170
				171
				172	def record_host(self, host_history):
				173	"""Add one `HostJobHistory` object to the collection.
				174
				175	@param host_history The `HostJobHistory` object to be
				176	remembered.
				177
				178	"""
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	179	self._working_list = None
				180	self._broken_list = None
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	181	self._idle_list = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	182	self._histories.append(host_history)
				183
				184
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	185	def get_working_list(self):
				186	"""Return a list of all working DUTs in the pool.
				187
				188	Filter `self._histories` for histories where the last
				189	diagnosis is `WORKING`.
				190
				191	Cache the result so that we only cacluate it once.
				192
				193	@return A list of HostJobHistory objects.
				194
				195	"""
				196	if self._working_list is None:
				197	self._working_list = [h for h in self._histories
				198	if h.last_diagnosis()[0] == status_history.WORKING]
				199	return self._working_list
				200
				201
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	202	def get_working(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	203	"""Return the number of working DUTs in the pool."""
				204	return len(self.get_working_list())
				205
				206
				207	def get_broken_list(self):
				208	"""Return a list of all broken DUTs in the pool.
				209
				210	Filter `self._histories` for histories where the last
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	211	diagnosis is `BROKEN`.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	212
				213	Cache the result so that we only cacluate it once.
				214
				215	@return A list of HostJobHistory objects.
				216
				217	"""
				218	if self._broken_list is None:
				219	self._broken_list = [h for h in self._histories
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	220	if h.last_diagnosis()[0] == status_history.BROKEN]
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	221	return self._broken_list
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	222
				223
				224	def get_broken(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	225	"""Return the number of broken DUTs in the pool."""
				226	return len(self.get_broken_list())
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	227
				228
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	229	def get_idle_list(self):
				230	"""Return a list of all idle DUTs in the pool.
				231
				232	Filter `self._histories` for histories where the last
				233	diagnosis is `UNUSED` or `UNKNOWN`.
				234
				235	Cache the result so that we only cacluate it once.
				236
				237	@return A list of HostJobHistory objects.
				238
				239	"""
				240	idle_list = [status_history.UNUSED, status_history.UNKNOWN]
				241	if self._idle_list is None:
				242	self._idle_list = [h for h in self._histories
				243	if h.last_diagnosis()[0] in idle_list]
				244	return self._idle_list
				245
				246
				247	def get_idle(self):
				248	"""Return the number of idle DUTs in the pool."""
				249	return len(self.get_idle_list())
				250
				251
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	252	def get_total(self):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	253	"""Return the total number of DUTs in the pool."""
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	254	return len(self._histories)
				255
				256
				257	class _BoardCounts(object):
				258	"""Maintains a set of `HostJobHistory` objects for a board.
				259
				260	The collected history objects are nominally all of the same
				261	board. The collection maintains a count of working DUTs, a
				262	count of broken DUTs, and a total count. The counts can be
				263	obtained either for a single pool, or as a total across all
				264	pools.
				265
				266	DUTs in the collection must be assigned to one of the pools
				267	in `_MANAGED_POOLS`.
				268
				269	The `get_working()` and `get_broken()` methods rely on the
				270	methods of the same name in _PoolCounts, so the performance
				271	note in _PoolCounts applies here as well.
				272
				273	"""
				274
				275	def __init__(self):
				276	self._pools = {
				277	pool: _PoolCounts() for pool in _MANAGED_POOLS
				278	}
				279
				280	def record_host(self, host_history):
				281	"""Add one `HostJobHistory` object to the collection.
				282
				283	@param host_history The `HostJobHistory` object to be
				284	remembered.
				285
				286	"""
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	287	pool = host_history.host_pool
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	288	self._pools[pool].record_host(host_history)
				289
				290
				291	def _count_pool(self, get_pool_count, pool=None):
				292	"""Internal helper to count hosts in a given pool.
				293
				294	The `get_pool_count` parameter is a function to calculate
				295	the exact count of interest for the pool.
				296
				297	@param get_pool_count Function to return a count from a
				298	_PoolCount object.
				299	@param pool The pool to be counted. If `None`,
				300	return the total across all pools.
				301
				302	"""
				303	if pool is None:
				304	return sum([get_pool_count(counts)
				305	for counts in self._pools.values()])
				306	else:
				307	return get_pool_count(self._pools[pool])
				308
				309
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	310	def get_working_list(self):
				311	"""Return a list of all working DUTs for the board.
				312
				313	Go through all HostJobHistory objects in the board's pools,
				314	selecting the ones where the last diagnosis is `WORKING`.
				315
				316	@return A list of HostJobHistory objects.
				317
				318	"""
				319	l = []
				320	for p in self._pools.values():
				321	l.extend(p.get_working_list())
				322	return l
				323
				324
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	325	def get_working(self, pool=None):
				326	"""Return the number of working DUTs in a pool.
				327
				328	@param pool The pool to be counted. If `None`, return the
				329	total across all pools.
				330
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	331	@return The total number of working DUTs in the selected
				332	pool(s).
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	333	"""
				334	return self._count_pool(_PoolCounts.get_working, pool)
				335
				336
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	337	def get_broken_list(self):
				338	"""Return a list of all broken DUTs for the board.
				339
				340	Go through all HostJobHistory objects in the board's pools,
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	341	selecting the ones where the last diagnosis is `BROKEN`.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	342
				343	@return A list of HostJobHistory objects.
				344
				345	"""
				346	l = []
				347	for p in self._pools.values():
				348	l.extend(p.get_broken_list())
				349	return l
				350
				351
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	352	def get_broken(self, pool=None):
				353	"""Return the number of broken DUTs in a pool.
				354
				355	@param pool The pool to be counted. If `None`, return the
				356	total across all pools.
				357
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	358	@return The total number of broken DUTs in the selected pool(s).
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	359	"""
				360	return self._count_pool(_PoolCounts.get_broken, pool)
				361
				362
xixuan	12ce04f	2016-03-10 13:16:30 -0800	[diff] [blame^]	363	def get_idle_list(self, pool=None):
				364	"""Return a list of all idle DUTs for the board.
				365
				366	Go through all HostJobHistory objects in the board's pools,
				367	selecting the ones where the last diagnosis is `UNUSED` or `UNKNOWN`.
				368
				369	@param pool: The pool to be counted. If `None`, return the total list
				370	across all pools.
				371
				372	@return A list of HostJobHistory objects.
				373
				374	"""
				375	if pool is None:
				376	l = []
				377	for p in self._pools.values():
				378	l.extend(p.get_idle_list())
				379	return l
				380	else:
				381	return _PoolCounts.get_idle_list(self._pools[pool])
				382
				383
				384	def get_idle(self, pool=None):
				385	"""Return the number of idle DUTs in a pool.
				386
				387	@param pool: The pool to be counted. If `None`, return the total
				388	across all pools.
				389
				390	@return The total number of idle DUTs in the selected pool(s).
				391	"""
				392	return self._count_pool(_PoolCounts.get_idle, pool)
				393
				394
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	395	def get_spares_buffer(self):
				396	"""Return the the nominal number of working spares.
				397
				398	Calculates and returns how many working spares there would
				399	be in the spares pool if all broken DUTs were in the spares
				400	pool. This number may be negative, indicating a shortfall
				401	in the critical pools.
				402
				403	@return The total number DUTs in the spares pool, less the total
				404	number of broken DUTs in all pools.
				405	"""
				406	return self.get_total(_SPARE_POOL) - self.get_broken()
				407
				408
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	409	def get_total(self, pool=None):
				410	"""Return the total number of DUTs in a pool.
				411
				412	@param pool The pool to be counted. If `None`, return the
				413	total across all pools.
				414
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	415	@return The total number of DUTs in the selected pool(s).
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	416	"""
				417	return self._count_pool(_PoolCounts.get_total, pool)
				418
				419
				420	class _LabInventory(dict):
				421	"""Collection of `HostJobHistory` objects for the Lab's inventory.
				422
				423	The collection is indexed by board. Indexing returns the
				424	_BoardCounts object associated with the board.
				425
				426	The collection is also iterable. The iterator returns all the
				427	boards in the inventory, in unspecified order.
				428
				429	"""
				430
J. Richard Barnette	b8bc570c	2016-03-17 17:03:57 -0700	[diff] [blame]	431	@staticmethod
				432	def _eligible_host(afehost):
				433	"""Return whether this host is eligible for monitoring.
				434
				435	Hosts with any label that's in `_EXCLUDED_LABELS` aren't
				436	eligible.
				437
				438	@param afehost The host to be tested for eligibility.
				439	"""
				440	return not len(_EXCLUDED_LABELS.intersection(afehost.labels))
				441
				442
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	443	@classmethod
				444	def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
				445	"""Return a Lab inventory with specified parameters.
				446
				447	By default, gathers inventory from `HostJobHistory` objects
				448	for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
				449	is supplied, the inventory will be restricted to only the
				450	given boards.
				451
				452	@param afe AFE object for constructing the
				453	`HostJobHistory` objects.
				454	@param start_time Start time for the `HostJobHistory`
				455	objects.
				456	@param end_time End time for the `HostJobHistory`
				457	objects.
				458	@param boardlist List of boards to include. If empty,
				459	include all available boards.
				460	@return A `_LabInventory` object for the specified boards.
				461
				462	"""
				463	label_list = [constants.Labels.POOL_PREFIX + l
				464	for l in _MANAGED_POOLS]
				465	afehosts = afe.get_hosts(labels__name__in=label_list)
				466	if boardlist:
J. Richard Barnette	b8bc570c	2016-03-17 17:03:57 -0700	[diff] [blame]	467	# We're deliberately not checking host eligibility in this
				468	# code path. This is a debug path, not used in production;
				469	# it may be useful to include ineligible hosts here.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	470	boardhosts = []
				471	for board in boardlist:
				472	board_label = constants.Labels.BOARD_PREFIX + board
				473	host_list = [h for h in afehosts
				474	if board_label in h.labels]
				475	boardhosts.extend(host_list)
				476	afehosts = boardhosts
J. Richard Barnette	b8bc570c	2016-03-17 17:03:57 -0700	[diff] [blame]	477	else:
				478	afehosts = [h for h in afehosts if cls._eligible_host(h)]
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	479	create = lambda host: (
				480	status_history.HostJobHistory(afe, host,
				481	start_time, end_time))
				482	return cls([create(host) for host in afehosts])
				483
				484
				485	def __init__(self, histories):
J. Richard Barnette	6948ed3	2015-05-06 08:57:10 -0700	[diff] [blame]	486	# N.B. The query that finds our hosts is restricted to those
				487	# with a valid pool: label, but doesn't check for a valid
				488	# board: label. In some (insufficiently) rare cases, the
				489	# AFE hosts table has been known to (incorrectly) have DUTs
				490	# with a pool: but no board: label. We explicitly exclude
				491	# those here.
				492	histories = [h for h in histories
				493	if h.host_board is not None]
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	494	boards = set([h.host_board for h in histories])
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	495	initval = { board: _BoardCounts() for board in boards }
				496	super(_LabInventory, self).__init__(initval)
				497	self._dut_count = len(histories)
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	498	self._managed_boards = None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	499	for h in histories:
J. Richard Barnette	3d0590a	2015-04-29 12:56:12 -0700	[diff] [blame]	500	self[h.host_board].record_host(h)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	501
				502
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	503	def get_managed_boards(self):
				504	"""Return the set of "managed" boards.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	505
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	506	Operationally, saying a board is "managed" means that the
				507	board will be included in the "board" and "repair
				508	recommendations" reports. That is, if there are failures in
				509	the board's inventory then lab techs will be asked to fix
				510	them without a separate ticket.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	511
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	512	For purposes of implementation, a board is "managed" if it
				513	has DUTs in both the spare and a non-spare (i.e. critical)
				514	pool.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	515
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	516	@return A set of all the boards that have both spare and
				517	non-spare pools.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	518	"""
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	519	if self._managed_boards is None:
				520	self._managed_boards = set()
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	521	for board, counts in self.items():
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	522	spares = counts.get_total(_SPARE_POOL)
				523	total = counts.get_total()
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	524	if spares != 0 and spares != total:
				525	self._managed_boards.add(board)
				526	return self._managed_boards
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	527
				528
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	529	def get_num_duts(self):
				530	"""Return the total number of DUTs in the inventory."""
				531	return self._dut_count
				532
				533
				534	def get_num_boards(self):
				535	"""Return the total number of boards in the inventory."""
				536	return len(self)
				537
				538
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	539	def _sort_by_location(inventory_list):
				540	"""Return a list of DUTs, organized by location.
				541
				542	Take the given list of `HostJobHistory` objects, separate it
				543	into a list per lab, and sort each lab's list by location. The
				544	order of sorting within a lab is
				545	* By row number within the lab,
				546	* then by rack number within the row,
				547	* then by host shelf number within the rack.
				548
				549	Return a list of the sorted lists.
				550
				551	Implementation note: host locations are sorted by converting
				552	each location into a base 100 number. If row, rack or
				553	host numbers exceed the range [0..99], then sorting will
				554	break down.
				555
				556	@return A list of sorted lists of DUTs.
				557
				558	"""
				559	BASE = 100
				560	lab_lists = {}
				561	for history in inventory_list:
				562	location = _HOSTNAME_PATTERN.match(history.host.hostname)
				563	if location:
				564	lab = location.group(1)
				565	key = 0
				566	for idx in location.group(2, 3, 4):
				567	key = BASE * key + int(idx)
				568	lab_lists.setdefault(lab, []).append((key, history))
				569	return_list = []
				570	for dut_list in lab_lists.values():
				571	dut_list.sort(key=lambda t: t[0])
				572	return_list.append([t[1] for t in dut_list])
				573	return return_list
				574
				575
				576	def _score_repair_set(buffer_counts, repair_list):
				577	"""Return a numeric score rating a set of DUTs to be repaired.
				578
				579	`buffer_counts` is a dictionary mapping board names to the
				580	size of the board's spares buffer.
				581
				582	`repair_list` is a list of DUTs to be repaired.
				583
				584	This function calculates the new set of buffer counts that would
				585	result from the proposed repairs, and scores the new set using
				586	two numbers:
				587	* Worst case buffer count for any board (higher is better).
				588	This is the more siginficant number for comparison.
				589	* Number of boards at the worst case (lower is better). This
				590	is the less significant number.
				591
				592	Implementation note: The score could fail to reflect the
				593	intended criteria if there are more than 1000 boards in the
				594	inventory.
				595
				596	@param spare_counts A dictionary mapping boards to buffer counts.
				597	@param repair_list A list of boards to be repaired.
				598	@return A numeric score.
				599
				600	"""
				601	# Go through `buffer_counts`, and create a list of new counts
				602	# that records the buffer count for each board after repair.
				603	# The new list of counts discards the board names, as they don't
				604	# contribute to the final score.
				605	_NBOARDS = 1000
				606	repair_inventory = _LabInventory(repair_list)
				607	new_counts = []
				608	for b, c in buffer_counts.items():
				609	if b in repair_inventory:
				610	newcount = repair_inventory[b].get_total()
				611	else:
				612	newcount = 0
				613	new_counts.append(c + newcount)
				614	# Go through the new list of counts. Find the worst available
				615	# spares count, and count how many times that worst case occurs.
				616	worst_count = new_counts[0]
				617	num_worst = 1
				618	for c in new_counts[1:]:
				619	if c == worst_count:
				620	num_worst += 1
				621	elif c < worst_count:
				622	worst_count = c
				623	num_worst = 1
				624	# Return the calculated score
				625	return _NBOARDS * worst_count - num_worst
				626
				627
				628	def _generate_repair_recommendation(inventory, num_recommend):
				629	"""Return a summary of selected DUTs needing repair.
				630
				631	Returns a message recommending a list of broken DUTs to be
				632	repaired. The list of DUTs is selected based on these
				633	criteria:
				634	* No more than `num_recommend` DUTs will be listed.
				635	* All DUTs must be in the same lab.
				636	* DUTs should be selected for some degree of physical
				637	proximity.
				638	* DUTs for boards with a low spares buffer are more important
				639	than DUTs with larger buffers.
				640
				641	The algorithm used will guarantee that at least one DUT from a
				642	board with the smallest spares buffer will be recommended. If
				643	the worst spares buffer number is shared by more than one board,
				644	the algorithm will tend to prefer repair sets that include more
				645	of those boards over sets that cover fewer boards.
				646
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	647	@param inventory Inventory for generating recommendations.
				648	@param num_recommend Number of DUTs to recommend for repair.
				649
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	650	"""
				651	logging.debug('Creating DUT repair recommendations')
J. Richard Barnette	e39c827	2015-10-20 17:58:30 -0700	[diff] [blame]	652	board_buffer_counts = {}
				653	broken_list = []
				654	for board in inventory.get_managed_boards():
				655	logging.debug('Listing failed DUTs for %s', board)
				656	counts = inventory[board]
				657	if counts.get_broken() != 0:
				658	board_buffer_counts[board] = counts.get_spares_buffer()
				659	broken_list.extend(counts.get_broken_list())
J. Richard Barnette	5512743	2015-10-13 17:01:56 -0700	[diff] [blame]	660	# N.B. The logic inside this loop may seem complicated, but
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	661	# simplification is hard:
				662	# * Calculating an initial recommendation outside of
				663	# the loop likely would make things more complicated,
				664	# not less.
				665	# * It's necessary to calculate an initial lab slice once per
				666	# lab _before_ the while loop, in case the number of broken
				667	# DUTs in a lab is less than `num_recommend`.
J. Richard Barnette	5512743	2015-10-13 17:01:56 -0700	[diff] [blame]	668	recommendation = None
				669	best_score = None
				670	for lab_duts in _sort_by_location(broken_list):
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	671	start = 0
				672	end = num_recommend
				673	lab_slice = lab_duts[start : end]
				674	lab_score = _score_repair_set(board_buffer_counts,
				675	lab_slice)
				676	while end < len(lab_duts):
				677	start += 1
				678	end += 1
				679	new_slice = lab_duts[start : end]
				680	new_score = _score_repair_set(board_buffer_counts,
				681	new_slice)
				682	if new_score > lab_score:
				683	lab_slice = new_slice
				684	lab_score = new_score
				685	if recommendation is None or lab_score > best_score:
				686	recommendation = lab_slice
				687	best_score = lab_score
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	688	message = ['Repair recommendations:\n',
				689	'%-30s %-16s %s' % (
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	690	'Hostname', 'Board', 'Servo instructions')]
				691	for h in recommendation:
				692	servo_name = servo_host.make_servo_hostname(h.host.hostname)
				693	if utils.host_is_in_lab_zone(servo_name):
				694	servo_message = 'Repair servo first'
				695	else:
				696	servo_message = 'No servo present'
				697	line = '%-30s %-16s %s' % (
				698	h.host.hostname, h.host_board, servo_message)
				699	message.append(line)
				700	return '\n'.join(message)
				701
				702
def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The message starts with lab-wide totals of broken, idle and
    working DUTs (with percentages), followed by the number of
    boards with failures and the number of boards considered, and
    ends with a per-board table.

    Only boards returned by `inventory.get_managed_boards()` are
    counted.  Every such board contributes to the totals, but only
    boards with at least one broken DUT get a row in the per-board
    table.  Rows are sorted by ascending spares buffer, and then by
    descending number of broken DUTs.

    If the inventory contains no DUTs at all, every percentage is
    reported as 0 instead of failing on a division by zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_boards = 0
    ntotal_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        logging.debug('Counting board inventory for %s', board)
        counts = inventory[board]
        # Summary elements laid out in the same order as the text
        # headers:
        #     Board  Avail  Bad   Idle  Good  Spare  Total
        #     e[0]   e[1]   e[2]  e[3]  e[4]  e[5]   e[6]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(_SPARE_POOL),
                   counts.get_total())
        # Only boards with broken DUTs are listed individually ...
        if element[2]:
            summaries.append(element)
            nbroken_boards += 1
        # ... but every managed board is included in the totals.
        ntotal_boards += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derive the last percentage by subtraction so that the
        # three values always add up to exactly 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Empty inventory:  there is nothing to take a share of.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % ntotal_boards,
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
				773
				774
# Preamble placed at the top of the "pool inventory" e-mail body; see
# _generate_pool_inventory_message().
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisfied that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
http://go/cros-manage-duts
'''
				783
				784
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The message opens with `_POOL_INVENTORY_HEADER`, followed by one
    section for each pool in `_CRITICAL_POOLS`.  A section is a
    table with one row per board giving the number of broken, idle
    and working DUTs and the total for that pool.  A board is listed
    only if it has at least one broken or idle DUT in the pool; rows
    are ordered by descending number of broken DUTs.  A pool with no
    such boards gets a single "full strength" line instead.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
                '%-20s %5s %5s %5s %5s' % (
                    'Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Boards at full strength are not reported.
            if broken or idle:
                rows.append((board, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        lines.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
    return '\n'.join(lines)
				829
				830
# Preamble placed at the top of the "idle inventory" text; see
# _generate_idle_inventory_message().
# NOTE(review): the text hard-codes "24 hours", while main() derives
# the history window from `--duration`; presumably the two only agree
# for a 24 hour duration - confirm.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
				837
				838
				839	def _generate_idle_inventory_message(inventory):
				840	"""Generate the "idle inventory" e-mail message.
				841
				842	The idle inventory is a host list with corresponding pool and board,
				843	where the hosts are idle (`UNKWOWN` or `UNUSED`).
				844
				845	N.B. For sample output text format as users can expect to
				846	see it in e-mail and log files, refer to the unit tests.
				847
				848	@param inventory _LabInventory object with the inventory to
				849	be reported on.
				850	@return String with the inventory message to be sent.
				851
				852	"""
				853	logging.debug('Creating idle inventory')
				854	message = [_IDLE_INVENTORY_HEADER]
				855	message.append('Idle Host List:')
				856	message.append('%-30s %-20s %s' % ('Hostname', 'Board', 'Pool'))
				857	data_list = []
				858	for pool in _MANAGED_POOLS:
				859	for board, counts in inventory.items():
				860	logging.debug('Counting inventory for %s, %s', board, pool)
				861	data_list.extend([(dut.host.hostname, board, pool)
				862	for dut in counts.get_idle_list(pool)])
				863	if data_list:
				864	message.extend(['%-30s %-20s %s' % t for t in data_list])
				865	else:
				866	message.append('(No idle DUTs)')
				867	return '\n'.join(message)
				868
				869
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name, and then sent via `gmail_lib`.  Failure to
    write the log file or to send the message is logged as an error
    and otherwise ignored, so one failed report doesn't prevent the
    remaining reports from being generated.

    If the `--debug` option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized single argument:  prints the same text
        # whether `print` is a statement or a function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` guarantees the file is closed even if the write
        # fails part way through.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Deliberately broad:  sending is best effort.
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
				908
				909
				910	def _separate_email_addresses(address_list):
				911	"""Parse a list of comma-separated lists of e-mail addresses.
				912
				913	@param address_list A list of strings containing comma
				914	separate e-mail addresses.
				915	@return A list of the individual e-mail addresses.
				916
				917	"""
				918	newlist = []
				919	for arg in address_list:
				920	newlist.extend([email.strip() for email in arg.split(',')])
				921	return newlist
				922
				923
				924	def _verify_arguments(arguments):
				925	"""Validate command-line arguments.
				926
				927	Join comma separated e-mail addresses for `--board-notify` and
				928	`--pool-notify` in separate option arguments into a single list.
				929
J. Richard Barnette	02e8243	2015-10-13 16:02:47 -0700	[diff] [blame]	930	For non-debug uses, require that notification be requested for
				931	at least one report. For debug, if notification isn't specified,
				932	treat it as "run all the reports."
				933
				934	The return value indicates success or failure; in the case of
				935	failure, we also write an error message to stderr.
				936
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	937	@param arguments Command-line arguments as returned by
				938	`ArgumentParser`
J. Richard Barnette	02e8243	2015-10-13 16:02:47 -0700	[diff] [blame]	939	@return True if the arguments are semantically good, or False
				940	if the arguments don't meet requirements.
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	941
				942	"""
				943	arguments.board_notify = _separate_email_addresses(
				944	arguments.board_notify)
				945	arguments.pool_notify = _separate_email_addresses(
				946	arguments.pool_notify)
J. Richard Barnette	02e8243	2015-10-13 16:02:47 -0700	[diff] [blame]	947	if not arguments.board_notify and not arguments.pool_notify:
				948	if not arguments.debug:
				949	sys.stderr.write('Must specify at least one of '
				950	'--board-notify or --pool-notify\n')
				951	return False
				952	else:
				953	# We want to run all the reports. An empty notify list
				954	# will cause a report to be skipped, so make sure the
				955	# lists are non-empty.
				956	arguments.board_notify = ['']
				957	arguments.pool_notify = ['']
				958	return True
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	959
				960
				961	def _get_logdir(script):
				962	"""Get the default directory for the `--logdir` option.
				963
				964	The default log directory is based on the parent directory
				965	containing this script.
				966
				967	@param script Path to this script file.
				968	@return A path to a directory.
				969
				970	"""
				971	basedir = os.path.dirname(os.path.abspath(script))
				972	basedir = os.path.dirname(basedir)
				973	return os.path.join(basedir, _LOGDIR)
				974
				975
				976	def _parse_command(argv):
				977	"""Parse the command line arguments.
				978
				979	Create an argument parser for this command's syntax, parse the
				980	command line, and return the result of the ArgumentParser
				981	parse_args() method.
				982
				983	@param argv Standard command line argument vector; argv[0] is
				984	assumed to be the command name.
				985	@return Result returned by ArgumentParser.parse_args().
				986
				987	"""
				988	parser = argparse.ArgumentParser(
				989	prog=argv[0],
				990	description='Gather and report lab inventory statistics')
				991	parser.add_argument('-d', '--duration', type=int,
				992	default=_DEFAULT_DURATION, metavar='HOURS',
				993	help='number of hours back to search for status'
				994	' (default: %d)' % _DEFAULT_DURATION)
				995	parser.add_argument('--board-notify', action='append',
				996	default=[], metavar='ADDRESS',
				997	help='Generate board inventory message, '
				998	'and send it to the given e-mail address(es)')
				999	parser.add_argument('--pool-notify', action='append',
				1000	default=[], metavar='ADDRESS',
				1001	help='Generate pool inventory message, '
				1002	'and send it to the given address(es)')
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	1003	parser.add_argument('-r', '--recommend', type=int, default=None,
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	1004	help=('Specify how many DUTs should be '
J. Richard Barnette	1df6a56	2015-06-09 10:06:17 -0700	[diff] [blame]	1005	'recommended for repair (default: no '
				1006	'recommendation)'))
J. Richard Barnette	02e8243	2015-10-13 16:02:47 -0700	[diff] [blame]	1007	parser.add_argument('--debug', action='store_true',
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1008	help='Print e-mail messages on stdout '
				1009	'without sending them.')
				1010	parser.add_argument('--logdir', default=_get_logdir(argv[0]),
				1011	help='Directory where logs will be written.')
				1012	parser.add_argument('boardnames', nargs='*',
				1013	metavar='BOARD',
				1014	help='names of boards to report on '
				1015	'(default: all boards)')
				1016	arguments = parser.parse_args(argv[1:])
J. Richard Barnette	02e8243	2015-10-13 16:02:47 -0700	[diff] [blame]	1017	if not _verify_arguments(arguments):
				1018	return None
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1019	return arguments
				1020
				1021
				1022	def _configure_logging(arguments):
				1023	"""Configure the `logging` module for our needs.
				1024
				1025	How we log depends on whether the `--print` option was
				1026	provided on the command line. Without the option, we log all
				1027	messages at DEBUG level or above, and write them to a file in
				1028	the directory specified by the `--logdir` option. With the
				1029	option, we write log messages to stdout; messages below INFO
				1030	level are discarded.
				1031
				1032	The log file is configured to rotate once a week on Friday
				1033	evening, preserving ~3 months worth of history.
				1034
				1035	@param arguments Command-line arguments as returned by
				1036	`ArgumentParser`
				1037
				1038	"""
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	1039	root_logger = logging.getLogger()
J. Richard Barnette	02e8243	2015-10-13 16:02:47 -0700	[diff] [blame]	1040	if arguments.debug:
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	1041	root_logger.setLevel(logging.INFO)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1042	handler = logging.StreamHandler(sys.stdout)
				1043	handler.setFormatter(logging.Formatter())
				1044	else:
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	1045	root_logger.setLevel(logging.DEBUG)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1046	logfile = os.path.join(arguments.logdir, _LOGFILE)
				1047	handler = logging.handlers.TimedRotatingFileHandler(
				1048	logfile, when='W4', backupCount=13)
				1049	formatter = logging.Formatter(_LOG_FORMAT,
				1050	time_utils.TIME_FMT)
				1051	handler.setFormatter(formatter)
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	1052	# TODO(jrbarnette) This is gross. Importing client.bin.utils
				1053	# implicitly imported logging_config, which calls
				1054	# logging.basicConfig() at module level. That gives us an
				1055	# extra logging handler that we don't want. So, clear out all
				1056	# the handlers here.
				1057	for h in root_logger.handlers:
				1058	root_logger.removeHandler(h)
				1059	root_logger.addHandler(handler)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1060
				1061
				1062	def _populate_board_counts(inventory):
				1063	"""Gather board counts while providing interactive feedback.
				1064
				1065	Gathering the status of all individual DUTs in the lab can take
				1066	considerable time (~30 minutes at the time of this writing).
				1067
				1068	Normally, we pay that cost by querying as we go. However, with
				1069	the `--print` option, a human being may be watching the
				1070	progress. So, we force the first (expensive) queries to happen
				1071	up front, and provide a small ASCII progress bar to give an
				1072	indicator of how many boards have been processed.
				1073
				1074	@param inventory _LabInventory object with the inventory to
				1075	be gathered.
				1076
				1077	"""
				1078	n = 0
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	1079	total_broken = 0
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1080	for counts in inventory.values():
				1081	n += 1
				1082	if n % 10 == 5:
				1083	c = '+'
				1084	elif n % 10 == 0:
				1085	c = '%d' % ((n / 10) % 10)
				1086	else:
				1087	c = '.'
				1088	sys.stdout.write(c)
				1089	sys.stdout.flush()
				1090	# This next call is where all the time goes - it forces all
				1091	# of a board's HostJobHistory objects to query the database
				1092	# and cache their results.
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	1093	total_broken += counts.get_broken()
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1094	sys.stdout.write('\n')
J. Richard Barnette	f683928	2015-06-01 16:00:35 -0700	[diff] [blame]	1095	sys.stdout.write('Found %d broken DUTs\n' % total_broken)
J. Richard Barnette	96db349	2015-03-27 17:23:52 -0700	[diff] [blame]	1096
				1097
def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, builds the lab
    inventory for the requested time window, and then generates and
    sends (or, with `--debug`, prints) each report for which
    notification was requested.

    Exits with status 1 if the parsed arguments fail validation.
    `KeyboardInterrupt` is silently ignored; any other exception
    raised while gathering or reporting is logged with a traceback
    rather than propagated.

    @param argv Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        # The inventory covers the `--duration` hours ending now.
        end_time = int(time.time())
        start_time = end_time - arguments.duration * 60 * 60
        # Hour-resolution stamp used in report file names and
        # e-mail subjects.
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                         inventory.get_num_duts(),
                         inventory.get_num_boards())

        # In debug mode, do the slow status queries up front with a
        # progress bar, before any report text is produced.
        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            # Repair recommendations, if requested, are placed ahead
            # of the board inventory in the same message.
            if arguments.recommend:
                recommend_message = _generate_repair_recommendation(
                        inventory, arguments.recommend) + '\n\n\n'
            else:
                recommend_message = ''
            board_message = _generate_board_inventory_message(inventory)
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        recommend_message + board_message)

        if arguments.pool_notify:
            # The idle host list is sent as part of the pool message.
            pool_message = _generate_pool_inventory_message(inventory)
            idle_message = _generate_idle_inventory_message(inventory)
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        pool_message + '\n\n\n' + idle_message)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
				1156
				1157
def get_managed_boards(afe):
    """Return the managed boards of a freshly created lab inventory.

    The inventory is created for the 24 hours ending at the time of
    the call, with no restriction on board names.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The result of the inventory's `get_managed_boards()`.

    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    lab_inventory = _LabInventory.create_inventory(
            afe, one_day_ago, now)
    return lab_inventory.get_managed_boards()
				1164
				1165
# Run the report only when executed as a script; importing this
# module (e.g. to call get_managed_boards()) has no side effects.
if __name__ == '__main__':
    main(sys.argv)