blob: 79228445a845426b142b343eaaeb719341ca1b12 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
"""Create e-mail reports of the Lab's DUT inventory.

Gathers a list of all DUTs of interest in the Lab, segregated by
board and pool, and determines whether each DUT is working or
broken.  Then sends one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.

usage:  lab_inventory.py [ options ] [ board ... ]

Options:
--duration / -d <hours>
    How far back in time to search job history to determine DUT
    status.

--board-notify <address>[,<address>]
    Send the "board status" e-mail to all the specified e-mail
    addresses.

--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend <number>
    When generating the "board status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.

--repair-loops
    Scan the inventory for DUTs stuck in repair loops, and report them
    via a Monarch presence metric.

--logdir <directory>
    Log progress and actions in a file under this directory.  Text
    of any e-mail sent will also be logged in a timestamped file in
    this directory.

--debug
    Suppress all logging, metrics reporting, and sending e-mail.
    Instead, write the output that would be generated onto stdout.

<board> arguments:
    With no arguments, gathers the status for all boards in the lab.
    With one or more named boards on the command line, restricts
    reporting to just those boards.

"""
51
52
53import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080054import collections
J. Richard Barnette96db3492015-03-27 17:23:52 -070055import logging
56import logging.handlers
57import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070058import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070059import sys
60import time
61
62import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070063from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070064from autotest_lib.client.common_lib import time_utils
Richard Barnettecf5d8342017-10-24 18:13:11 -070065from autotest_lib.server import site_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070066from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070067from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070068from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070069from autotest_lib.site_utils import gmail_lib
J. Richard Barnette96db3492015-03-27 17:23:52 -070070from autotest_lib.site_utils.suite_scheduler import constants
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080071from autotest_lib.utils import labellib
Richard Barnettecf5d8342017-10-24 18:13:11 -070072from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070073
74
# Pool definitions, re-exported from the suite scheduler's
# `constants.Pools` so the rest of this file can refer to them by a
# short name.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS
J. Richard Barnette96db3492015-03-27 17:23:52 -070078
# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

# _DEFAULT_DURATION:
#     Default value (in hours) used for the --duration command line
#     option.  Specifies how far back in time to search in order to
#     determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack, host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# Default entry for managed pools.

_MANAGED_POOL_DEFAULT = 'all_pools'

# _REPAIR_LOOP_THRESHOLD:
#    The number of repeated Repair tasks that must be seen to declare
#    that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4
128
J. Richard Barnette96db3492015-03-27 17:23:52 -0700129
class _CachedHostJobHistories(object):
    """Maintains a set of `HostJobHistory` objects for a pool.

    The collected history objects are nominally all part of a single
    scheduling pool of DUTs.  The collection maintains a list of
    working DUTs, a list of broken DUTs, a list of idle DUTs, and a
    list of all DUTs.

    Performance note:  Certain methods in this class are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first time any one of these methods is called, it causes
    multiple RPC calls with a relatively expensive set of database
    queries.  However, the results of the queries are cached in the
    individual `HostJobHistory` objects, so only the first call
    actually pays the full cost.

    Additionally, `get_working_list()`, `get_broken_list()` and
    `get_idle_list()` cache their return values to avoid recalculating
    lists at every call; this caching is separate from the caching of
    RPC results described above.

    This class is deliberately constructed to delay the RPC cost
    until the accessor methods are called (rather than to query in
    `record_host()`) so that it's possible to construct a complete
    `_LabInventory` without making the expensive queries at creation
    time.  `_populate_board_counts()`, below, assumes this behavior.

    """

    def __init__(self):
        self._histories = []
        # Each cached list is `None` until first requested, and is
        # reset to `None` whenever a new host is recorded.
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Recording a host invalidates the cached working, broken and
        idle lists, since the new host may belong in any of them.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._working_list = None
        self._broken_list = None
        self._idle_list = None
        self._histories.append(host_history)


    def _filter_by_diagnosis(self, diagnoses):
        """Return the histories whose last diagnosis is in `diagnoses`.

        This is the step that calls `last_diagnosis()` on every
        recorded history; callers are expected to cache the result.

        @param diagnoses A tuple of `status_history` diagnosis values.
        @return A new list of matching `HostJobHistory` objects.

        """
        return [h for h in self._histories
                if h.last_diagnosis()[0] in diagnoses]


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `WORKING`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is None:
            self._working_list = self._filter_by_diagnosis(
                    (status_history.WORKING,))
        return self._working_list


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `BROKEN`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is None:
            self._broken_list = self._filter_by_diagnosis(
                    (status_history.BROKEN,))
        return self._broken_list


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `UNUSED` or `UNKNOWN`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._idle_list is None:
            self._idle_list = self._filter_by_diagnosis(
                    (status_history.UNUSED, status_history.UNKNOWN))
        return self._idle_list


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)
254
255
class _ManagedPoolsHostJobHistories(object):
    """Maintains a set of `HostJobHistory`s per managed pool.

    The collection maintains a count of working DUTs, a count of
    broken DUTs, a count of idle DUTs, and a total count.  The counts
    can be obtained either for a single pool, or as a total across all
    pools.

    DUTs in the collection must be assigned to one of the pools in
    `MANAGED_POOLS`.

    The `get_working()`, `get_broken()` and `get_idle()` methods rely
    on the methods of the same name in _CachedHostJobHistories, so the
    performance note in _CachedHostJobHistories applies here as well.

    """

    def __init__(self):
        self._histories_by_pool = {
            pool: _CachedHostJobHistories() for pool in MANAGED_POOLS
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a `KeyError` is
        raised if that pool is not one of `MANAGED_POOLS`.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               _CachedHostJobHistories object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(cached_history) for cached_history
                       in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])


    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        The `get_pool_list` parameter is a function to extract the
        list of interest for the pool.

        @param get_pool_list  Function to return a list from a
                              _CachedHostJobHistories object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list concatenating the
                              lists of all pools.

        """
        if pool is None:
            combined = []
            for cached_history in self._histories_by_pool.values():
                combined.extend(get_pool_list(cached_history))
            return combined
        return get_pool_list(self._histories_by_pool[pool])


    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(
                _CachedHostJobHistories.get_working_list, pool)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_CachedHostJobHistories.get_working, pool)


    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(
                _CachedHostJobHistories.get_broken_list, pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_CachedHostJobHistories.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool: The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_CachedHostJobHistories.get_idle_list, pool)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted. If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_CachedHostJobHistories.get_idle, pool)


    def get_spares_buffer(self):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(SPARE_POOL) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_CachedHostJobHistories.get_total, pool)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700416
417
class _LabInventory(object):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    Important attributes:
      histories: The list of `HostJobHistory` objects in the
                 inventory (those with a board).
      by_board:  A dict mapping board to _ManagedPoolsHostJobHistories.
      by_model:  A dict mapping model to _ManagedPoolsHostJobHistories.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        A host is eligible if it's in exactly one pool and it has no
        labels from the `_EXCLUDED_LABELS` set.

        @param afehost  The host to be tested for eligibility.
        """
        pools = [l for l in afehost.labels
                 if l.startswith(constants.Labels.POOL_PREFIX)]
        excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
        return len(pools) == 1 and not excluded


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                      for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            boardhosts = []
            for board in boardlist:
                board_label = constants.Labels.BOARD_PREFIX + board
                boardhosts.extend(
                        h for h in afehosts if board_label in h.labels)
            afehosts = boardhosts
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts])


    def __init__(self, histories):
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        self.histories = histories
        self._dut_count = len(histories)
        # Caches for get_managed_boards() and get_managed_models(),
        # keyed by the `pool` argument.
        self._managed_boards = {}
        self._managed_models = {}
        self.by_board = self._classify_by_label_type('board')
        self.by_model = self._classify_by_label_type('model')


    def _classify_by_label_type(self, label_key):
        """Classify histories by labels with the given key.

        Histories whose host has no label with the given key are left
        out of the result.

        @returns a dict mapping labels with the given key to
        _ManagedPoolsHostJobHistories for DUTs with that label.
        """
        classified = collections.defaultdict(_ManagedPoolsHostJobHistories)
        for h in self.histories:
            labels = labellib.LabelsMapping(h.host.labels)
            if label_key in labels:
                classified[labels[label_key]].record_host(h)
        return dict(classified)


    def _get_managed(self, cache, histories_by_label, pool):
        """Return the cached set of managed labels for `pool`.

        Shared implementation of `get_managed_boards()` and
        `get_managed_models()`.  The set is calculated on first
        request and stored in `cache` under `pool`.

        @param cache               Dict mapping pool to a previously
                                   calculated set of labels.
        @param histories_by_label  Dict mapping a label (board or
                                   model) to its
                                   _ManagedPoolsHostJobHistories.
        @param pool                The specified pool.
        @return A set of labels that are managed for `pool`.
        """
        if cache.get(pool) is None:
            cache[pool] = {
                label for label, histories in histories_by_label.items()
                if self._is_managed(pool, histories)}
        return cache[pool]


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        return self._get_managed(self._managed_boards, self.by_board, pool)


    def get_managed_models(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" models.

        @param pool: The specified pool for managed models.
        @return A set of all the models that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of models in that pool.
        """
        return self._get_managed(self._managed_models, self.by_model, pool)


    def _is_managed(self, pool, histories):
        """Determine if the given histories contain DUTs to be managed for pool.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        @param pool       The pool of interest, or
                          `_MANAGED_POOL_DEFAULT` to apply the
                          spare/non-spare rule above.
        @param histories  The _ManagedPoolsHostJobHistories to test.
        @return Whether the histories are managed for `pool`.

        """
        # Get the counts for all pools, otherwise get it for the
        # specified pool.
        if pool == _MANAGED_POOL_DEFAULT:
            spares = histories.get_total(SPARE_POOL)
            total = histories.get_total()
            return spares != 0 and spares != total
        return histories.get_total(pool) != 0


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self.by_board)


    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self.by_model)
583
584
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are omitted from the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct regardless of how large the individual numbers are.

    @param inventory_list  A list of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location:
            lab = location.group(1)
            key = tuple(int(idx) for idx in location.group(2, 3, 4))
            lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; history objects themselves
        # are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list
620
621
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping boards to buffer
                          counts.  Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.

    """
    _NBOARDS = 1000
    repaired_by_board = _LabInventory(repair_list).by_board
    # Calculate the buffer count for each board after repair.  The
    # new list of counts discards the board names, as they don't
    # contribute to the final score.
    new_counts = [
        count + (repaired_by_board[board].get_total()
                 if board in repaired_by_board else 0)
        for board, count in buffer_counts.items()]
    # Find the worst available spares count, and count how many
    # times that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NBOARDS * worst_count - num_worst
672
673
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If no candidate set is found (for instance, no managed board has
    a broken DUT), the message holds only the heading and the column
    titles.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory.by_board[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` DUTs across this lab's
        # list, keeping the best-scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    # `recommendation` is still None when the loop above never ran;
    # treat that as "nothing to recommend" rather than iterating
    # over None and raising TypeError.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_board,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
749
750
def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory is a list by board summarizing the number
    of working and broken DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Only boards returned by `inventory.get_managed_boards()` are
    counted.  The per-board table lists just the boards that have
    at least one broken DUT, sorted by ascending spares buffer and
    then by descending broken count.

    When the counted boards hold no DUTs at all, every percentage
    in the summary is reported as 0.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_boards = 0
    ntotal_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        counts = inventory.by_board[board]
        logging.debug('Counting %2d DUTS for board %s',
                      counts.get_total(), board)
        # Summary elements laid out in the same order as the text
        # headers:
        #     Board Avail   Bad  Idle  Good  Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_boards += 1
        ntotal_boards += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derive the last percentage from the other two so that
        # the three always add up to exactly 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing was counted; avoid dividing by zero.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % ntotal_boards,
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
822
823
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700824_POOL_INVENTORY_HEADER = '''\
Aviv Keshet056d74c2015-07-14 09:18:43 -0700825Notice to Infrastructure deputies: All boards shown below are at
J. Richard Barnettec9a143c2015-06-04 11:11:19 -0700826less than full strength, please take action to resolve the issues.
827Once you're satisified that failures won't recur, failed DUTs can
828be replaced with spares by running `balance_pool`. Detailed
829instructions can be found here:
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700830 http://go/cros-manage-duts
831'''
832
833
def _generate_pool_inventory_message(inventory):
    """Build the text of the "pool inventory" e-mail.

    For every pool in `CRITICAL_POOLS` the message carries a table
    with one row per board, giving the broken, idle, working, and
    total DUT counts for that pool.  A board is listed only when it
    has at least one broken or idle DUT in the pool; rows are
    ordered with the largest broken count first.  A pool with no
    rows gets a one-line "full strength" notice instead.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first starts with an extra
        # blank line.
        separator = '\n' if index else ''
        lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
                '%-20s %5s %5s %5s %5s' % (
                    'Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.by_board.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Boards at full strength are not reported.
            if broken != 0 or idle != 0:
                rows.append((board, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        lines.extend(['%-20s %5d %5d %5d %5d' % row for row in rows])
    return '\n'.join(lines)
878
879
# Fixed preamble text for the idle-host list.
_IDLE_INVENTORY_HEADER = (
    "Notice to Infrastructure deputies: The hosts shown below haven't\n"
    'run any jobs for at least 24 hours. Please check each host; locked\n'
    'hosts should normally be unlocked; stuck jobs should normally be\n'
    'aborted.\n'
)
886
887
def _generate_idle_inventory_message(inventory):
    """Build the text of the "idle inventory" e-mail.

    The message lists, for each pool in `MANAGED_POOLS` and each
    board in the inventory, the hostname of every DUT that the
    board's counts report via `get_idle_list()`, together with its
    board and pool.  If nothing is idle, a one-line notice is
    emitted in place of the list.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             row_fmt % ('Hostname', 'Board', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.by_board.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, board, pool))
    if idle_rows:
        lines.extend([row_fmt % row for row in idle_rows])
    else:
        lines.append('(No idle DUTs)')
    return '\n'.join(lines)
918
919
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name.

    If the --debug option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are
    reported with `logging.error()` and are not raised.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized so the line is valid as both a Python 2
        # print statement and a Python 3 function call.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` guarantees the file is closed even when the write
        # fails part way through.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Deliberately best-effort: a mail failure must not abort
        # the remaining reports.
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
958
959
def _populate_board_counts(inventory):
    """Force every board's counts to load, drawing a progress bar.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally that cost is paid lazily, as each report asks for the
    data.  With `--debug`, a person is expected to be watching, so
    the expensive first queries are made here, up front, and plain
    ASCII progress output is written straight to stdout (bypassing
    logging): one character per board, then a total.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    broken_total = 0
    for index, board_counts in enumerate(
            inventory.by_board.itervalues(), 1):
        # Progress marks: a digit on every 10th board (the tens
        # digit of the count), '+' on boards 5, 15, 25, ..., and
        # '.' for everything else.
        remainder = index % 10
        if remainder == 0:
            marker = '%d' % ((index // 10) % 10)
        elif remainder == 5:
            marker = '+'
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        broken_total += board_counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % broken_total)
994
995
def _perform_board_inventory(arguments, inventory, timestamp):
    """Generate and send the board inventory report.

    The e-mail body is made of up to two parts, in this order:
      * A list of DUTs recommended for repair.  This part appears
        only when the `--recommend` option was given.
      * The board inventory message, as produced by
        `_generate_board_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  _LabInventory object with the inventory to
                      be reported.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
                inventory, arguments.recommend)
        # Three newlines set the recommendation apart from the
        # inventory that follows it.
        sections.append(recommendation + '\n\n\n')
    sections.append(_generate_board_inventory_message(inventory))
    _send_email(arguments,
                'boards-%s.txt' % timestamp,
                'DUT board inventory %s' % timestamp,
                arguments.board_notify,
                ''.join(sections))
1024
1025
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Generate and send the pool inventory report.

    The e-mail body is the pool inventory message followed by the
    idle inventory message, separated by three newlines.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  _LabInventory object with the inventory to be
                      reported.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    # The list literal evaluates left to right, so the pool message
    # is generated before the idle message.
    report = '\n\n\n'.join([
            _generate_pool_inventory_message(inventory),
            _generate_idle_inventory_message(inventory)])
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                report)
1049
1050
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns  True if the DUT's most recent history indicates a repair
              loop, otherwise False.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before enough successful repairs were
    # seen.  Say so explicitly instead of falling off the end and
    # returning None.
    return False
1095
1096
def _perform_repair_loop_report(arguments, inventory):
    """Report every working DUT that is stuck in a repair loop.

    Walks the working DUTs of each board in the inventory and, for
    each one that `_dut_in_repair_loop()` flags, logs it and sets
    the repair-loop presence metric with the DUT's hostname, board,
    and pool as fields.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.  Not used by this routine.
    @param inventory  _LabInventory object with the inventory to be
                      reported.
    """
    loop_presence = metrics.BooleanMetric(
            'chromeos/autotest/inventory/repair_loops',
            'DUTs stuck in repair loops')
    logging.info('Scanning for DUTs in repair loops.')
    for counts in inventory.by_board.itervalues():
        for history in counts.get_working_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies.
            name_is_valid = _HOSTNAME_PATTERN.match(history.hostname)
            if name_is_valid and _dut_in_repair_loop(history):
                fields = {'dut_hostname': history.hostname,
                          'board': history.host_board,
                          'pool': history.host_pool}
                logging.info('Looping DUT: %(dut_hostname)s, '
                             'board: %(board)s, pool: %(pool)s',
                             fields)
                loop_presence.set(True, fields=fields)
1130
1131
def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emits debug log messages announcing the run and which of the
    board, recommendation, and pool reports it will include.
    Returns a string derived from `startup_time` (local time, to
    the hour) that identifies this run in log files and e-mail
    messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns  A timestamp string that will be used to identify this run
              in logs and email output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', timestamp)
    planned = []
    if arguments.board_notify:
        # Recommendations only ever travel inside the board e-mail.
        if arguments.recommend:
            planned.append('Will include repair recommendations')
        planned.append('Will include board inventory')
    if arguments.pool_notify:
        planned.append('Will include pool inventory')
    for note in planned:
        logging.debug(note)
    return timestamp
1154
1155
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours leading up
    to `end_time`, restricted to `arguments.boardnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns  The inventory returned by
              `_LabInventory.create_inventory()`.
    """
    # `duration` is given in hours; the time range is in seconds.
    seconds_back = arguments.duration * 60 * 60
    start_time = end_time - seconds_back
    afe = frontend_wrappers.RetryingAFE(server=None)
    lab = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.boardnames)
    logging.info('Found %d hosts across %d boards',
                 lab.get_num_duts(),
                 lab.get_num_boards())
    return lab
1170
1171
def _perform_inventory_reports(arguments):
    """Run every inventory report requested on the command line.

    Builds the inventory once, then runs the board, pool, and
    repair-loop reports, in that order, for whichever of them the
    parsed arguments ask for.  In debug mode the board counts are
    populated up front, before any report runs.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    began = time.time()
    run_label = _log_startup(arguments, began)
    lab = _create_inventory(arguments, began)
    if arguments.debug:
        _populate_board_counts(lab)
    if arguments.board_notify:
        _perform_board_inventory(arguments, lab, run_label)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab, run_label)
    if arguments.repair_loops:
        _perform_repair_loop_report(arguments, lab)
1192
1193
def _separate_email_addresses(address_list):
    """Flatten a list of comma-separated e-mail address lists.

    Each input string is split on commas and every piece is
    stripped of surrounding whitespace.  Pieces are kept in their
    original order; empty pieces are not removed.

    @param address_list  A list of strings containing comma
                         separate e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    return [address.strip()
            for group in address_list
            for address in group.split(',')]
1206
1207
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--board-notify` and
    `--pool-notify` in separate option arguments into a single list.

    For non-debug uses, require that at least one report be
    requested, via `--board-notify`, `--pool-notify`, or
    `--repair-loops`.  For debug, if notification isn't specified,
    treat it as a request for both e-mail reports.

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    # The repair loop scan reports through a metric rather than by
    # e-mail, so it is a valid request on its own.  getattr() keeps
    # callers that build `arguments` without this attribute working.
    if getattr(arguments, 'repair_loops', False):
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify, --pool-notify, or '
                     '--repair-loops\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001243
1244
def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` path joined onto the parent of
    the directory that holds this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    return os.path.join(os.path.dirname(script_dir), _LOGDIR)
1258
1259
def _parse_command(argv):
    """Parse the command line arguments.

    Builds the argument parser for this command, parses `argv`,
    and validates the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or None
            if the parsed arguments fail validation.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = parser.add_argument
    add_option('-d', '--duration', type=int,
               default=_DEFAULT_DURATION, metavar='HOURS',
               help='number of hours back to search for status'
                    ' (default: %d)' % _DEFAULT_DURATION)
    add_option('--board-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate board inventory message, '
                    'and send it to the given e-mail address(es)')
    add_option('--pool-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate pool inventory message, '
                    'and send it to the given address(es)')
    add_option('-r', '--recommend', type=int, default=None,
               help=('Specify how many DUTs should be '
                     'recommended for repair (default: no '
                     'recommendation)'))
    add_option('--repair-loops', action='store_true',
               help='Check for devices stuck in repair loops.')
    add_option('--debug', action='store_true',
               help='Print e-mail messages on stdout '
                    'without sending them.')
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('boardnames', nargs='*',
               metavar='BOARD',
               help='names of boards to report on '
                    '(default: all boards)')
    parsed = parser.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None
1306
1307
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening, preserving
        ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the handler configured here is the only handler
    attached to the root logger.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing from the list being
    # iterated skips every other entry.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001351
1352
def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, and runs the
    requested reports.  Exits with status 1 if the arguments are
    rejected.  Outside of debug mode the reports run inside a
    ts_mon setup context.  A keyboard interrupt ends the run
    quietly; any other exception raised by the reports is logged
    and not re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        if arguments.debug:
            _perform_inventory_reports(arguments)
        else:
            with site_utils.SetupTsMonGlobalState(
                    'repair_loops', short_lived=True, auto_flush=False):
                _perform_inventory_reports(arguments)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1375
1376
def get_inventory(afe):
    """Create an inventory covering the 24 hours ending now.

    @param afe  AFE object handed through to
                `_LabInventory.create_inventory()`.
    @return The inventory created by
            `_LabInventory.create_inventory()`.
    """
    one_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - one_day, now)
1381
1382
def get_managed_boards(afe):
    """Return the managed boards of a freshly created inventory.

    @param afe  AFE object handed through to `get_inventory()`.
    @return Whatever `get_managed_boards()` returns for the
            inventory built by `get_inventory(afe)`.
    """
    lab = get_inventory(afe)
    return lab.get_managed_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001385
1386
# Script entry point: hand the full argument vector, including
# argv[0], to main().
if __name__ == '__main__':
    main(sys.argv)