blob: 4a65589e58c4e31f78fd3bc3bbf701a99fbc1a74 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage: lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
20--board-notify <address>[,<address>]
21 Send the "board status" e-mail to all the specified e-mail
22 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
J. Richard Barnette1df6a562015-06-09 10:06:17 -070028--recommend <number>
29 When generating the "board status" e-mail, include a list of
30 <number> specific DUTs to be recommended for repair.
31
J. Richard Barnette96db3492015-03-27 17:23:52 -070032--logdir <directory>
33 Log progress and actions in a file under this directory. Text
34 of any e-mail sent will also be logged in a timestamped file in
35 this directory.
36
J. Richard Barnette02e82432015-10-13 16:02:47 -070037--debug
J. Richard Barnette96db3492015-03-27 17:23:52 -070038 Suppress all logging and sending e-mail. Instead, write the
39 output that would be generated onto stdout.
40
41<board> arguments:
42 With no arguments, gathers the status for all boards in the lab.
43 With one or more named boards on the command line, restricts
44 reporting to just those boards.
45
46"""
47
48
49import argparse
50import logging
51import logging.handlers
52import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070053import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070054import sys
55import time
56
57import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070058from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070059from autotest_lib.client.common_lib import time_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070060from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070061from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070062from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070063from autotest_lib.site_utils import gmail_lib
J. Richard Barnette96db3492015-03-27 17:23:52 -070064from autotest_lib.site_utils.suite_scheduler import constants
65
66
# Pool names, re-exported from the suite scheduler's constants so the
# rest of this module can refer to them without the long prefix.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time (in hours) to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack and host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# Default entry for managed pools.

_MANAGED_POOL_DEFAULT = 'all_pools'
115
J. Richard Barnette96db3492015-03-27 17:23:52 -0700116
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800117class _CachedHostJobHistories(object):
J. Richard Barnette96db3492015-03-27 17:23:52 -0700118 """Maintains a set of `HostJobHistory` objects for a pool.
119
120 The collected history objects are nominally all part of a single
J. Richard Barnettef6839282015-06-01 16:00:35 -0700121 scheduling pool of DUTs. The collection maintains a list of
122 working DUTs, a list of broken DUTs, and a list of all DUTs.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700123
J. Richard Barnettef6839282015-06-01 16:00:35 -0700124 Performance note: Certain methods in this class are potentially
125 expensive:
126 * `get_working()`
127 * `get_working_list()`
128 * `get_broken()`
129 * `get_broken_list()`
xixuan12ce04f2016-03-10 13:16:30 -0800130 * `get_idle()`
131 * `get_idle_list()`
J. Richard Barnettef6839282015-06-01 16:00:35 -0700132 The first time any one of these methods is called, it causes
133 multiple RPC calls with a relatively expensive set of database
134 queries. However, the results of the queries are cached in the
135 individual `HostJobHistory` objects, so only the first call
136 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700137
xixuan12ce04f2016-03-10 13:16:30 -0800138 Additionally, `get_working_list()`, `get_broken_list()` and
139 `get_idle_list()` cache their return values to avoid recalculating
140 lists at every call; this caching is separate from the caching of RPC
141 results described above.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700142
143 This class is deliberately constructed to delay the RPC cost
144 until the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700145 `record_host()`) so that it's possible to construct a complete
146 `_LabInventory` without making the expensive queries at creation
J. Richard Barnettef6839282015-06-01 16:00:35 -0700147 time. `_populate_board_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700148
149 """
150
151 def __init__(self):
152 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700153 self._working_list = None
154 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800155 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700156
157
158 def record_host(self, host_history):
159 """Add one `HostJobHistory` object to the collection.
160
161 @param host_history The `HostJobHistory` object to be
162 remembered.
163
164 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700165 self._working_list = None
166 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800167 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700168 self._histories.append(host_history)
169
170
J. Richard Barnettef6839282015-06-01 16:00:35 -0700171 def get_working_list(self):
172 """Return a list of all working DUTs in the pool.
173
174 Filter `self._histories` for histories where the last
175 diagnosis is `WORKING`.
176
177 Cache the result so that we only cacluate it once.
178
179 @return A list of HostJobHistory objects.
180
181 """
182 if self._working_list is None:
183 self._working_list = [h for h in self._histories
184 if h.last_diagnosis()[0] == status_history.WORKING]
185 return self._working_list
186
187
J. Richard Barnette96db3492015-03-27 17:23:52 -0700188 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700189 """Return the number of working DUTs in the pool."""
190 return len(self.get_working_list())
191
192
193 def get_broken_list(self):
194 """Return a list of all broken DUTs in the pool.
195
196 Filter `self._histories` for histories where the last
xixuan12ce04f2016-03-10 13:16:30 -0800197 diagnosis is `BROKEN`.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700198
199 Cache the result so that we only cacluate it once.
200
201 @return A list of HostJobHistory objects.
202
203 """
204 if self._broken_list is None:
205 self._broken_list = [h for h in self._histories
xixuan12ce04f2016-03-10 13:16:30 -0800206 if h.last_diagnosis()[0] == status_history.BROKEN]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700207 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700208
209
210 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700211 """Return the number of broken DUTs in the pool."""
212 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700213
214
xixuan12ce04f2016-03-10 13:16:30 -0800215 def get_idle_list(self):
216 """Return a list of all idle DUTs in the pool.
217
218 Filter `self._histories` for histories where the last
219 diagnosis is `UNUSED` or `UNKNOWN`.
220
221 Cache the result so that we only cacluate it once.
222
223 @return A list of HostJobHistory objects.
224
225 """
226 idle_list = [status_history.UNUSED, status_history.UNKNOWN]
227 if self._idle_list is None:
228 self._idle_list = [h for h in self._histories
229 if h.last_diagnosis()[0] in idle_list]
230 return self._idle_list
231
232
233 def get_idle(self):
234 """Return the number of idle DUTs in the pool."""
235 return len(self.get_idle_list())
236
237
J. Richard Barnette96db3492015-03-27 17:23:52 -0700238 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700239 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700240 return len(self._histories)
241
242
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800243class _ManagedPoolsHostJobHistories(object):
244 """Maintains a set of `HostJobHistory`s per managed pool.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700245
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800246 The collection maintains a count of working DUTs, a count of broken DUTs,
247 and a total count. The counts can be obtained either for a single pool, or
248 as a total across all pools.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700249
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800250 DUTs in the collection must be assigned to one of the pools in
251 `_MANAGED_POOLS`.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700252
253 The `get_working()` and `get_broken()` methods rely on the
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800254 methods of the same name in _CachedHostJobHistories, so the performance
255 note in _CachedHostJobHistories applies here as well.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700256
257 """
258
259 def __init__(self):
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800260 self._histories_by_pool = {
261 pool: _CachedHostJobHistories() for pool in MANAGED_POOLS
J. Richard Barnette96db3492015-03-27 17:23:52 -0700262 }
263
264 def record_host(self, host_history):
265 """Add one `HostJobHistory` object to the collection.
266
267 @param host_history The `HostJobHistory` object to be
268 remembered.
269
270 """
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700271 pool = host_history.host_pool
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800272 self._histories_by_pool[pool].record_host(host_history)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700273
274
275 def _count_pool(self, get_pool_count, pool=None):
276 """Internal helper to count hosts in a given pool.
277
278 The `get_pool_count` parameter is a function to calculate
279 the exact count of interest for the pool.
280
281 @param get_pool_count Function to return a count from a
282 _PoolCount object.
283 @param pool The pool to be counted. If `None`,
284 return the total across all pools.
285
286 """
287 if pool is None:
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800288 return sum([get_pool_count(cached_history) for cached_history in
289 self._histories_by_pool.values()])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700290 else:
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800291 return get_pool_count(self._histories_by_pool[pool])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700292
293
J. Richard Barnettef6839282015-06-01 16:00:35 -0700294 def get_working_list(self):
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800295 """Return a list of all working DUTs (across all pools).
J. Richard Barnettef6839282015-06-01 16:00:35 -0700296
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800297 Go through all HostJobHistory objects across all pools, selecting the
298 ones where the last diagnosis is `WORKING`.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700299
300 @return A list of HostJobHistory objects.
301
302 """
303 l = []
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800304 for p in self._histories_by_pool.values():
J. Richard Barnettef6839282015-06-01 16:00:35 -0700305 l.extend(p.get_working_list())
306 return l
307
308
J. Richard Barnette96db3492015-03-27 17:23:52 -0700309 def get_working(self, pool=None):
310 """Return the number of working DUTs in a pool.
311
312 @param pool The pool to be counted. If `None`, return the
313 total across all pools.
314
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700315 @return The total number of working DUTs in the selected
316 pool(s).
J. Richard Barnette96db3492015-03-27 17:23:52 -0700317 """
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800318 return self._count_pool(_CachedHostJobHistories.get_working, pool)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700319
320
J. Richard Barnettef6839282015-06-01 16:00:35 -0700321 def get_broken_list(self):
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800322 """Return a list of all broken DUTs (across all pools).
J. Richard Barnettef6839282015-06-01 16:00:35 -0700323
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800324 Go through all HostJobHistory objects in the across all pools,
xixuan12ce04f2016-03-10 13:16:30 -0800325 selecting the ones where the last diagnosis is `BROKEN`.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700326
327 @return A list of HostJobHistory objects.
328
329 """
330 l = []
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800331 for p in self._histories_by_pool.values():
J. Richard Barnettef6839282015-06-01 16:00:35 -0700332 l.extend(p.get_broken_list())
333 return l
334
335
J. Richard Barnette96db3492015-03-27 17:23:52 -0700336 def get_broken(self, pool=None):
337 """Return the number of broken DUTs in a pool.
338
339 @param pool The pool to be counted. If `None`, return the
340 total across all pools.
341
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700342 @return The total number of broken DUTs in the selected pool(s).
J. Richard Barnette96db3492015-03-27 17:23:52 -0700343 """
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800344 return self._count_pool(_CachedHostJobHistories.get_broken, pool)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700345
346
xixuan12ce04f2016-03-10 13:16:30 -0800347 def get_idle_list(self, pool=None):
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800348 """Return a list of all idle DUTs in the given pool.
xixuan12ce04f2016-03-10 13:16:30 -0800349
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800350 Go through all HostJobHistory objects in the given pool, selecting the
351 ones where the last diagnosis is `UNUSED` or `UNKNOWN`.
xixuan12ce04f2016-03-10 13:16:30 -0800352
353 @param pool: The pool to be counted. If `None`, return the total list
354 across all pools.
355
356 @return A list of HostJobHistory objects.
357
358 """
359 if pool is None:
360 l = []
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800361 for p in self._histories_by_pool.values():
xixuan12ce04f2016-03-10 13:16:30 -0800362 l.extend(p.get_idle_list())
363 return l
364 else:
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800365 return _CachedHostJobHistories.get_idle_list(
366 self._histories_by_pool[pool])
xixuan12ce04f2016-03-10 13:16:30 -0800367
368
369 def get_idle(self, pool=None):
370 """Return the number of idle DUTs in a pool.
371
372 @param pool: The pool to be counted. If `None`, return the total
373 across all pools.
374
375 @return The total number of idle DUTs in the selected pool(s).
376 """
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800377 return self._count_pool(_CachedHostJobHistories.get_idle, pool)
xixuan12ce04f2016-03-10 13:16:30 -0800378
379
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700380 def get_spares_buffer(self):
381 """Return the the nominal number of working spares.
382
383 Calculates and returns how many working spares there would
384 be in the spares pool if all broken DUTs were in the spares
385 pool. This number may be negative, indicating a shortfall
386 in the critical pools.
387
388 @return The total number DUTs in the spares pool, less the total
389 number of broken DUTs in all pools.
390 """
Kevin Chengcf0ad2b2016-04-19 14:51:39 -0700391 return self.get_total(SPARE_POOL) - self.get_broken()
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700392
393
J. Richard Barnette96db3492015-03-27 17:23:52 -0700394 def get_total(self, pool=None):
395 """Return the total number of DUTs in a pool.
396
397 @param pool The pool to be counted. If `None`, return the
398 total across all pools.
399
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700400 @return The total number of DUTs in the selected pool(s).
J. Richard Barnette96db3492015-03-27 17:23:52 -0700401 """
Prathmesh Prabhu0ecbf322017-11-08 17:04:24 -0800402 return self._count_pool(_CachedHostJobHistories.get_total, pool)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700403
404
class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board.  Indexing returns the
    _ManagedPoolsHostJobHistories object associated with the board.

    The collection is also iterable.  The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        A host is eligible if it's in exactly one pool and it has no
        labels from the `_EXCLUDED_LABELS` set.

        @param afehost  The host to be tested for eligibility.
        @return True if the host should be monitored, False otherwise.
        """
        pools = [l for l in afehost.labels
                     if l.startswith(constants.Labels.POOL_PREFIX)]
        excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
        return len(pools) == 1 and not excluded


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all eligible DUTs in the `MANAGED_POOLS` list.  If
        `boardlist` is supplied, the inventory will be restricted to
        only the given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                          for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            boardhosts = []
            for board in boardlist:
                board_label = constants.Labels.BOARD_PREFIX + board
                boardhosts.extend(h for h in afehosts
                                  if board_label in h.labels)
            afehosts = boardhosts
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        """Build the inventory from a list of histories.

        @param histories  Iterable of `HostJobHistory` objects to
                          be filed by board and pool.
        """
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = {h.host_board for h in histories}
        initval = { board: _ManagedPoolsHostJobHistories() for board in boards }
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        # Cache of get_managed_boards() results, keyed by pool name.
        self._managed_boards = {}
        for h in histories:
            self[h.host_board].record_host(h)


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result for each `pool` value is computed once and then
        cached on the inventory.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        if self._managed_boards.get(pool, None) is None:
            self._managed_boards[pool] = set()
            for board, counts in self.items():
                # Get the counts for all pools, otherwise get it for the
                # specified pool.
                if pool == _MANAGED_POOL_DEFAULT:
                    spares = counts.get_total(SPARE_POOL)
                    total = counts.get_total()
                    if spares != 0 and spares != total:
                        self._managed_boards[pool].add(board)
                else:
                    if counts.get_total(pool) != 0:
                        self._managed_boards[pool].add(board)
        return self._managed_boards[pool]


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)
533
534
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Split the given list of `HostJobHistory` objects into one list
    per lab, and order each lab's list by physical location:
      * first by row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Hosts whose hostname does not match `_HOSTNAME_PATTERN` carry no
    location information and are left out of the result.

    Implementation note: a location is ordered by folding its row,
    rack and host numbers into a single base 100 number.  If any of
    those numbers falls outside the range [0..99], the ordering
    will break down.

    @param inventory_list  List of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs, one list per lab, with
            the labs in unspecified order.

    """
    BASE = 100
    duts_by_lab = {}
    for history in inventory_list:
        match = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not match:
            continue
        lab, row, rack, shelf = match.groups()
        sort_key = (int(row) * BASE + int(rack)) * BASE + int(shelf)
        if lab not in duts_by_lab:
            duts_by_lab[lab] = []
        duts_by_lab[lab].append((sort_key, history))
    sorted_labs = []
    for entries in duts_by_lab.values():
        # Sort on the location key only; the sort is stable, so DUTs
        # at the same location keep their input order.
        ordered = sorted(entries, key=lambda entry: entry[0])
        sorted_labs.append([entry[1] for entry in ordered])
    return sorted_labs
570
571
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping boards to buffer
                          counts.  Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score; higher is better.
    @raises ValueError if `buffer_counts` is empty, as there is then
                       no worst case to score.

    """
    _NBOARDS = 1000
    # Group the proposed repairs by board, so that the number of
    # repaired DUTs per board is available as `get_total()`.
    repair_inventory = _LabInventory(repair_list)
    # Compute the buffer count for each board after repair.  The
    # board names are discarded, as they don't contribute to the
    # final score.
    new_counts = []
    for board, count in buffer_counts.items():
        if board in repair_inventory:
            count += repair_inventory[board].get_total()
        new_counts.append(count)
    # Find the worst available spares count, and count how many
    # boards are at that worst case.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    # The worst case count dominates; among equal worst cases,
    # fewer boards at the worst case scores higher.
    return _NBOARDS * worst_count - num_worst
622
623
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If no DUT can be recommended (for instance, no managed board has
    a broken DUT with a location-style hostname), the message says
    so instead of listing hosts.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` physically adjacent DUTs
        # across the lab, keeping the best scoring window.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    if recommendation:
        for h in recommendation:
            servo_name = servo_host.make_servo_hostname(h.host.hostname)
            servo_present = utils.host_is_in_lab_zone(servo_name)
            _, event = h.last_diagnosis()
            line = line_fmt % (
                    h.host.hostname, h.host_board,
                    'Yes' if servo_present else 'No', event.job_url)
            message.append(line)
    else:
        # `recommendation` is still None when no lab had any broken
        # DUTs to consider; iterating over it would raise TypeError.
        message.append('(No DUTs to repair)')
    return '\n'.join(message)
699
700
J. Richard Barnette96db3492015-03-27 17:23:52 -0700701def _generate_board_inventory_message(inventory):
702 """Generate the "board inventory" e-mail message.
703
704 The board inventory is a list by board summarizing the number
705 of working and broken DUTs, and the total shortfall or surplus
706 of working devices relative to the minimum critical pool
707 requirement.
708
709 The report omits boards with no DUTs in the spare pool or with
710 no DUTs in a critical pool.
711
712 N.B. For sample output text formattted as users can expect to
713 see it in e-mail and log files, refer to the unit tests.
714
715 @param inventory _LabInventory object with the inventory to
716 be reported on.
717 @return String with the inventory message to be sent.
718
719 """
720 logging.debug('Creating board inventory')
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700721 nworking = 0
722 nbroken = 0
xixuan12ce04f2016-03-10 13:16:30 -0800723 nidle = 0
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700724 nbroken_boards = 0
J. Richard Barnetteea5a4ba2016-02-18 16:34:50 -0800725 ntotal_boards = 0
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700726 summaries = []
727 for board in inventory.get_managed_boards():
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700728 counts = inventory[board]
Richard Barnette254d5b42016-07-06 19:13:23 -0700729 logging.debug('Counting %2d DUTS for board %s',
730 counts.get_total(), board)
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700731 # Summary elements laid out in the same order as the text
732 # headers:
xixuan12ce04f2016-03-10 13:16:30 -0800733 # Board Avail Bad Idle Good Spare Total
734 # e[0] e[1] e[2] e[3] e[4] e[5] e[6]
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700735 element = (board,
736 counts.get_spares_buffer(),
737 counts.get_broken(),
xixuan12ce04f2016-03-10 13:16:30 -0800738 counts.get_idle(),
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700739 counts.get_working(),
Kevin Chengcf0ad2b2016-04-19 14:51:39 -0700740 counts.get_total(SPARE_POOL),
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700741 counts.get_total())
J. Richard Barnetteea5a4ba2016-02-18 16:34:50 -0800742 if element[2]:
743 summaries.append(element)
744 nbroken_boards += 1
745 ntotal_boards += 1
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700746 nbroken += element[2]
xixuan12ce04f2016-03-10 13:16:30 -0800747 nidle += element[3]
748 nworking += element[4]
749 ntotal = nworking + nbroken + nidle
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700750 summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700751 broken_percent = int(round(100.0 * nbroken / ntotal))
xixuan12ce04f2016-03-10 13:16:30 -0800752 idle_percent = int(round(100.0 * nidle / ntotal))
753 working_percent = 100 - broken_percent - idle_percent
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700754 message = ['Summary of DUTs in inventory:',
xixuan12ce04f2016-03-10 13:16:30 -0800755 '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
756 '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700757 nbroken, broken_percent,
xixuan12ce04f2016-03-10 13:16:30 -0800758 nidle, idle_percent,
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700759 nworking, working_percent,
760 ntotal),
761 '',
762 'Boards with failures: %d' % nbroken_boards,
J. Richard Barnetteea5a4ba2016-02-18 16:34:50 -0800763 'Boards in inventory: %d' % ntotal_boards,
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700764 '', '',
765 'Full board inventory:\n',
xixuan12ce04f2016-03-10 13:16:30 -0800766 '%-22s %5s %5s %5s %5s %5s %5s' % (
767 'Board', 'Avail', 'Bad', 'Idle', 'Good',
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700768 'Spare', 'Total')]
J. Richard Barnette96db3492015-03-27 17:23:52 -0700769 message.extend(
xixuan12ce04f2016-03-10 13:16:30 -0800770 ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700771 return '\n'.join(message)
772
773
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700774_POOL_INVENTORY_HEADER = '''\
Aviv Keshet056d74c2015-07-14 09:18:43 -0700775Notice to Infrastructure deputies: All boards shown below are at
J. Richard Barnettec9a143c2015-06-04 11:11:19 -0700776less than full strength, please take action to resolve the issues.
777Once you're satisified that failures won't recur, failed DUTs can
778be replaced with spares by running `balance_pool`. Detailed
779instructions can be found here:
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700780 http://go/cros-manage-duts
781'''
782
783
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory has one section per critical pool.  Each
    section lists, by board, the number of broken, idle, and working
    DUTs in that pool.  A board is listed only if it has at least one
    broken or idle DUT in the pool; rows are ordered with the most
    broken DUTs first.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    header_fmt = '%-20s %5s %5s %5s %5s'
    row_fmt = '%-20s %5d %5d %5d %5d'
    lines = [_POOL_INVENTORY_HEADER]
    # Every section after the first is preceded by a blank line.
    separator = ''
    for pool in CRITICAL_POOLS:
        lines.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
            header_fmt % ('Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            nbroken = counts.get_broken(pool)
            nidle = counts.get_idle(pool)
            # Boards at full strength are not reported.
            if nbroken or nidle:
                nworking = counts.get_working(pool)
                ntotal = counts.get_total(pool)
                rows.append((board, nbroken, nidle, nworking, ntotal))
        if rows:
            rows.sort(key=lambda row: -row[1])
            lines.extend([row_fmt % row for row in rows])
        else:
            lines.append('(All boards at full strength)')
        separator = '\n'
    return '\n'.join(lines)
828
829
# Preamble placed at the top of the "idle inventory" section by
# `_generate_idle_inventory_message()`.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
836
837
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists every idle host (status `UNKNOWN` or
    `UNUSED`) together with its board and pool, walking the managed
    pools in order and, within each pool, every board in the
    inventory.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             row_fmt % ('Hostname', 'Board', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, board, pool))
    if idle_rows:
        lines.extend([row_fmt % row for row in idle_rows])
    else:
        lines.append('(No idle DUTs)')
    return '\n'.join(lines)
868
869
J. Richard Barnette96db3492015-03-27 17:23:52 -0700870def _send_email(arguments, tag, subject, recipients, body):
871 """Send an inventory e-mail message.
872
873 The message is logged in the selected log directory using `tag`
874 for the file name.
875
876 If the --print option was requested, the message is neither
877 logged nor sent, but merely printed on stdout.
878
879 @param arguments Parsed command-line options.
880 @param tag Tag identifying the inventory for logging
881 purposes.
882 @param subject E-mail Subject: header line.
883 @param recipients E-mail addresses for the To: header line.
884 @param body E-mail message body.
885
886 """
887 logging.debug('Generating email: "%s"', subject)
888 all_recipients = ', '.join(recipients)
889 report_body = '\n'.join([
890 'To: %s' % all_recipients,
891 'Subject: %s' % subject,
892 '', body, ''])
J. Richard Barnette02e82432015-10-13 16:02:47 -0700893 if arguments.debug:
J. Richard Barnette96db3492015-03-27 17:23:52 -0700894 print report_body
895 else:
896 filename = os.path.join(arguments.logdir, tag)
897 try:
898 report_file = open(filename, 'w')
899 report_file.write(report_body)
900 report_file.close()
901 except EnvironmentError as e:
902 logging.error('Failed to write %s: %s', filename, e)
903 try:
904 gmail_lib.send_email(all_recipients, subject, body)
905 except Exception as e:
906 logging.error('Failed to send e-mail to %s: %s',
907 all_recipients, e)
908
909
910def _separate_email_addresses(address_list):
911 """Parse a list of comma-separated lists of e-mail addresses.
912
913 @param address_list A list of strings containing comma
914 separate e-mail addresses.
915 @return A list of the individual e-mail addresses.
916
917 """
918 newlist = []
919 for arg in address_list:
920 newlist.extend([email.strip() for email in arg.split(',')])
921 return newlist
922
923
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Replace the `board_notify` and `pool_notify` attributes, which
    hold lists of comma separated e-mail addresses, with flat lists
    of individual addresses.

    For non-debug uses, require that notification be requested for
    at least one report.  For debug, if notification isn't specified,
    treat it as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
        arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
        arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -0700959
960
def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` subdirectory of the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
974
975
def _parse_command(argv):
    """Parse the command line arguments.

    Build the argument parser for this command, parse `argv`, and
    validate the outcome with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if the arguments fail validation.

    """
    parser = argparse.ArgumentParser(
        prog=argv[0],
        description='Gather and report lab inventory statistics')
    add_option = parser.add_argument
    add_option('-d', '--duration', type=int,
               default=_DEFAULT_DURATION, metavar='HOURS',
               help='number of hours back to search for status'
                    ' (default: %d)' % _DEFAULT_DURATION)
    add_option('--board-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate board inventory message, '
                    'and send it to the given e-mail address(es)')
    add_option('--pool-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate pool inventory message, '
                    'and send it to the given address(es)')
    add_option('-r', '--recommend', type=int, default=None,
               help=('Specify how many DUTs should be '
                     'recommended for repair (default: no '
                     'recommendation)'))
    add_option('--debug', action='store_true',
               help='Print e-mail messages on stdout '
                    'without sending them.')
    add_option('--logdir', default=_get_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('boardnames', nargs='*',
               metavar='BOARD',
               help='names of boards to report on '
                    '(default: all boards)')
    parsed = parser.parse_args(argv[1:])
    if _verify_arguments(parsed):
        return parsed
    return None
1020
1021
1022def _configure_logging(arguments):
1023 """Configure the `logging` module for our needs.
1024
1025 How we log depends on whether the `--print` option was
1026 provided on the command line. Without the option, we log all
1027 messages at DEBUG level or above, and write them to a file in
1028 the directory specified by the `--logdir` option. With the
1029 option, we write log messages to stdout; messages below INFO
1030 level are discarded.
1031
1032 The log file is configured to rotate once a week on Friday
1033 evening, preserving ~3 months worth of history.
1034
1035 @param arguments Command-line arguments as returned by
1036 `ArgumentParser`
1037
1038 """
J. Richard Barnettef6839282015-06-01 16:00:35 -07001039 root_logger = logging.getLogger()
J. Richard Barnette02e82432015-10-13 16:02:47 -07001040 if arguments.debug:
J. Richard Barnettef6839282015-06-01 16:00:35 -07001041 root_logger.setLevel(logging.INFO)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001042 handler = logging.StreamHandler(sys.stdout)
1043 handler.setFormatter(logging.Formatter())
1044 else:
Richard Barnette5af97402016-04-18 11:00:26 -07001045 if not os.path.exists(arguments.logdir):
1046 os.mkdir(arguments.logdir)
J. Richard Barnettef6839282015-06-01 16:00:35 -07001047 root_logger.setLevel(logging.DEBUG)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001048 logfile = os.path.join(arguments.logdir, _LOGFILE)
1049 handler = logging.handlers.TimedRotatingFileHandler(
1050 logfile, when='W4', backupCount=13)
1051 formatter = logging.Formatter(_LOG_FORMAT,
1052 time_utils.TIME_FMT)
1053 handler.setFormatter(formatter)
J. Richard Barnettef6839282015-06-01 16:00:35 -07001054 # TODO(jrbarnette) This is gross. Importing client.bin.utils
1055 # implicitly imported logging_config, which calls
1056 # logging.basicConfig() *at module level*. That gives us an
1057 # extra logging handler that we don't want. So, clear out all
1058 # the handlers here.
1059 for h in root_logger.handlers:
1060 root_logger.removeHandler(h)
1061 root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001062
1063
1064def _populate_board_counts(inventory):
1065 """Gather board counts while providing interactive feedback.
1066
1067 Gathering the status of all individual DUTs in the lab can take
1068 considerable time (~30 minutes at the time of this writing).
1069
1070 Normally, we pay that cost by querying as we go. However, with
1071 the `--print` option, a human being may be watching the
1072 progress. So, we force the first (expensive) queries to happen
1073 up front, and provide a small ASCII progress bar to give an
1074 indicator of how many boards have been processed.
1075
1076 @param inventory _LabInventory object with the inventory to
1077 be gathered.
1078
1079 """
1080 n = 0
J. Richard Barnettef6839282015-06-01 16:00:35 -07001081 total_broken = 0
J. Richard Barnette96db3492015-03-27 17:23:52 -07001082 for counts in inventory.values():
1083 n += 1
1084 if n % 10 == 5:
1085 c = '+'
1086 elif n % 10 == 0:
1087 c = '%d' % ((n / 10) % 10)
1088 else:
1089 c = '.'
1090 sys.stdout.write(c)
1091 sys.stdout.flush()
1092 # This next call is where all the time goes - it forces all
1093 # of a board's HostJobHistory objects to query the database
1094 # and cache their results.
J. Richard Barnettef6839282015-06-01 16:00:35 -07001095 total_broken += counts.get_broken()
J. Richard Barnette96db3492015-03-27 17:23:52 -07001096 sys.stdout.write('\n')
J. Richard Barnettef6839282015-06-01 16:00:35 -07001097 sys.stdout.write('Found %d broken DUTs\n' % total_broken)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001098
1099
def main(argv):
    """Standard main routine.

    Parse the command line, configure logging, build the lab
    inventory for the requested time window, and then generate and
    send whichever of the board and pool reports were requested.
    Exits with status 1 if the command line is rejected.  Exceptions
    raised while gathering or reporting are logged, not propagated;
    a keyboard interrupt ends the run quietly.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        start_time = end_time - arguments.duration * 60 * 60
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        want_boards = bool(arguments.board_notify)
        want_pools = bool(arguments.pool_notify)
        if want_boards:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if want_pools:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            # Force the slow queries up front, with a progress bar.
            _populate_board_counts(inventory)

        if want_boards:
            recommend_message = ''
            if arguments.recommend:
                recommend_message = _generate_repair_recommendation(
                    inventory, arguments.recommend) + '\n\n\n'
            board_message = _generate_board_inventory_message(inventory)
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        recommend_message + board_message)

        if want_pools:
            pool_message = _generate_pool_inventory_message(inventory)
            idle_message = _generate_idle_inventory_message(inventory)
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        '\n\n\n'.join([pool_message, idle_message]))
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1158
1159
def get_inventory(afe):
    """Create a lab inventory covering the 24 hours ending now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by
            `_LabInventory.create_inventory()`.

    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, one_day_ago, now)
1164
1165
def get_managed_boards(afe):
    """Return the managed boards found in a fresh lab inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return The result of `get_managed_boards()` on the inventory.

    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001168
1169
# Run the inventory report only when executed as a script, not when
# this module is imported.
if __name__ == '__main__':
    main(sys.argv)