blob: f538255b3398e0e3a13bf676bd540aad4ce6e87f [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage: lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
20--board-notify <address>[,<address>]
21 Send the "board status" e-mail to all the specified e-mail
22 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
J. Richard Barnette1df6a562015-06-09 10:06:17 -070028--recommend <number>
    When generating the "board status" e-mail, include a list of
30 <number> specific DUTs to be recommended for repair.
31
J. Richard Barnette96db3492015-03-27 17:23:52 -070032--logdir <directory>
33 Log progress and actions in a file under this directory. Text
34 of any e-mail sent will also be logged in a timestamped file in
35 this directory.
36
J. Richard Barnette02e82432015-10-13 16:02:47 -070037--debug
J. Richard Barnette96db3492015-03-27 17:23:52 -070038 Suppress all logging and sending e-mail. Instead, write the
39 output that would be generated onto stdout.
40
41<board> arguments:
42 With no arguments, gathers the status for all boards in the lab.
43 With one or more named boards on the command line, restricts
44 reporting to just those boards.
45
46"""
47
48
49import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080050import collections
J. Richard Barnette96db3492015-03-27 17:23:52 -070051import logging
52import logging.handlers
53import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070054import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070055import sys
56import time
57
58import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070059from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070060from autotest_lib.client.common_lib import time_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070061from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070062from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070063from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070064from autotest_lib.site_utils import gmail_lib
J. Richard Barnette96db3492015-03-27 17:23:52 -070065from autotest_lib.site_utils.suite_scheduler import constants
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080066from autotest_lib.utils import labellib
J. Richard Barnette96db3492015-03-27 17:23:52 -070067
68
# Pool names, re-exported from the suite scheduler constants so the
# rest of this module can refer to them without the long prefix.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time to search in order to determine
#     DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack, host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# Default entry for managed pools.

_MANAGED_POOL_DEFAULT = 'all_pools'
117
J. Richard Barnette96db3492015-03-27 17:23:52 -0700118
class _CachedHostJobHistories(object):
    """Maintains a set of `HostJobHistory` objects for a pool.

    The collected history objects are nominally all part of a single
    scheduling pool of DUTs.  The collection maintains a list of
    working DUTs, a list of broken DUTs, a list of idle DUTs, and a
    list of all DUTs.

    Performance note:  Certain methods in this class are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first time any one of these methods is called, it causes
    multiple RPC calls with a relatively expensive set of database
    queries.  However, the results of the queries are cached in the
    individual `HostJobHistory` objects, so only the first call
    actually pays the full cost.

    Additionally, `get_working_list()`, `get_broken_list()` and
    `get_idle_list()` cache their return values to avoid recalculating
    lists at every call; this caching is separate from the caching of
    RPC results described above.

    This class is deliberately constructed to delay the RPC cost
    until the accessor methods are called (rather than to query in
    `record_host()`) so that it's possible to construct a complete
    `_LabInventory` without making the expensive queries at creation
    time.  `_populate_board_counts()`, below, assumes this behavior.

    """

    def __init__(self):
        self._histories = []
        # Each cached list is `None` until first requested, and is
        # reset to `None` whenever a new host is recorded.
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Any previously cached working/broken/idle lists are discarded,
        since the new host may belong in any of them.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._working_list = None
        self._broken_list = None
        self._idle_list = None
        self._histories.append(host_history)


    def _filter_by_diagnosis(self, statuses):
        """Return the histories whose last diagnosis is in `statuses`.

        This is the (potentially expensive) step that calls
        `last_diagnosis()` on every recorded history.

        @param statuses A tuple of `status_history` status values.
        @return A new list of HostJobHistory objects.

        """
        return [h for h in self._histories
                if h.last_diagnosis()[0] in statuses]


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `WORKING`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is None:
            self._working_list = self._filter_by_diagnosis(
                    (status_history.WORKING,))
        return self._working_list


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `BROKEN`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is None:
            self._broken_list = self._filter_by_diagnosis(
                    (status_history.BROKEN,))
        return self._broken_list


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `UNUSED` or `UNKNOWN`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._idle_list is None:
            # The status tuple is only built on a cache miss.
            self._idle_list = self._filter_by_diagnosis(
                    (status_history.UNUSED, status_history.UNKNOWN))
        return self._idle_list


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)
243
244
class _ManagedPoolsHostJobHistories(object):
    """Maintains a set of `HostJobHistory`s per managed pool.

    The collection maintains a count of working DUTs, a count of
    broken DUTs, a count of idle DUTs, and a total count.  The counts
    (and the corresponding lists) can be obtained either for a single
    pool, or as a total across all pools.

    DUTs in the collection must be assigned to one of the pools in
    `MANAGED_POOLS`.

    The `get_working()`, `get_broken()` and `get_idle()` methods rely
    on the methods of the same name in _CachedHostJobHistories, so the
    performance note in _CachedHostJobHistories applies here as well.

    """

    def __init__(self):
        self._histories_by_pool = {
            pool: _CachedHostJobHistories() for pool in MANAGED_POOLS
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a pool that is not
        in `MANAGED_POOLS` raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               _CachedHostJobHistories object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(cached_history)
                       for cached_history in
                       self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])


    def _collect_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a _CachedHostJobHistories object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list concatenating the
                              results for all pools.

        """
        if pool is None:
            collected = []
            for cached_history in self._histories_by_pool.values():
                collected.extend(get_pool_list(cached_history))
            return collected
        return get_pool_list(self._histories_by_pool[pool])


    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._collect_pool(lambda h: h.get_working_list(), pool)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(lambda h: h.get_working(), pool)


    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._collect_pool(lambda h: h.get_broken_list(), pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(lambda h: h.get_broken(), pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._collect_pool(lambda h: h.get_idle_list(), pool)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(lambda h: h.get_idle(), pool)


    def get_spares_buffer(self):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(SPARE_POOL) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(lambda h: h.get_total(), pool)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700405
406
class _LabInventory(object):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    Important attributes:
      histories: The `HostJobHistory` objects in the inventory (only
                 those with a board).
      by_board:  A dict mapping board to _ManagedPoolsHostJobHistories.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        A host is eligible if it's in exactly one pool and it has no
        labels from the `_EXCLUDED_LABELS` set.

        @param afehost  The host to be tested for eligibility.
        """
        pools = [l for l in afehost.labels
                 if l.startswith(constants.Labels.POOL_PREFIX)]
        excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
        return len(pools) == 1 and not excluded


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                      for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            boardhosts = []
            for board in boardlist:
                board_label = constants.Labels.BOARD_PREFIX + board
                boardhosts.extend(h for h in afehosts
                                  if board_label in h.labels)
            afehosts = boardhosts
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        self.histories = histories
        self._dut_count = len(histories)
        # Cache for `get_managed_boards()`, keyed by pool name.
        self._managed_boards = {}
        self.by_board = self._classify_by_label_type('board')


    def _classify_by_label_type(self, label_key):
        """Classify histories by labels with the given key.

        Histories whose host has no label with the given key are
        left out of the result.

        @returns a dict mapping labels with the given key to
        _ManagedPoolsHostJobHistories for DUTs with that label.
        """
        classified = collections.defaultdict(_ManagedPoolsHostJobHistories)
        for h in self.histories:
            labels = labellib.LabelsMapping(h.host.labels)
            if label_key in labels:
                classified[labels[label_key]].record_host(h)
        return dict(classified)


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result is cached per `pool` value.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        if pool not in self._managed_boards:
            # Build the set locally and store it only once complete,
            # so a failure part-way through never leaves a partial
            # result in the cache.
            boards = set()
            for board, counts in self.by_board.items():
                # Get the counts for all pools, otherwise get it for the
                # specified pool.
                if pool == _MANAGED_POOL_DEFAULT:
                    spares = counts.get_total(SPARE_POOL)
                    total = counts.get_total()
                    if spares != 0 and spares != total:
                        boards.add(board)
                elif counts.get_total(pool) != 0:
                    boards.add(board)
            self._managed_boards[pool] = boards
        return self._managed_boards[pool]


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self.by_board)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700543
544
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.  DUTs whose hostname doesn't
    match `_HOSTNAME_PATTERN` are omitted from the result.

    Implementation note: host locations are sorted on the tuple
    `(row, rack, host)` of integers, so there is no upper limit on
    the size of the individual numbers.

    @param inventory_list  A list of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        # Tuples compare element by element, which gives exactly the
        # row / rack / host ordering without packing the numbers
        # into a single integer.
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; the history objects
        # themselves are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list
580
581
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    The proposed repairs in `repair_list` are applied to the spares
    buffer sizes in `buffer_counts`, and the resulting set of buffer
    counts is rated by two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping board names to the
                          size of the board's spares buffer.
    @param repair_list    A list of DUTs to be repaired.
    @return A numeric score.

    """
    _NBOARDS = 1000
    repaired_by_board = _LabInventory(repair_list).by_board
    # Post-repair buffer count for every board.  Board names are
    # dropped, as they don't contribute to the final score.
    new_counts = [
        count + (repaired_by_board[board].get_total()
                 if board in repaired_by_board else 0)
        for board, count in buffer_counts.items()]
    # The smallest count is the worst case; then count how many
    # boards share it.
    ranked = sorted(new_counts)
    worst_count = ranked[0]
    num_worst = ranked.count(worst_count)
    return _NBOARDS * worst_count - num_worst
632
633
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If there are no broken DUTs to recommend, the returned message
    contains only the title and column header.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory.by_board[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` adjacent DUTs across the
        # lab, keeping the best-scoring window.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    # `recommendation` is still None when no lab had any broken DUTs;
    # treat that as an empty recommendation rather than iterating None.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_board,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
709
710
def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory starts with a lab-wide summary of the number
    of broken, idle, and working DUTs, followed by a table with one
    row per board giving the spares buffer and the per-status counts.

    Only managed boards (as reported by
    `inventory.get_managed_boards()`) are counted, and only boards
    with at least one broken DUT get a row in the per-board table.
    Rows are sorted by ascending spares buffer, then by descending
    number of broken DUTs.

    If no DUTs are counted at all, every percentage in the summary
    is reported as 0 rather than dividing by zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_boards = 0
    ntotal_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        counts = inventory.by_board[board]
        logging.debug('Counting %2d DUTS for board %s',
                      counts.get_total(), board)
        # Summary elements laid out in the same order as the text
        # headers:
        #     Board Avail   Bad  Idle  Good Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only boards with broken DUTs are listed individually, but
        # every managed board contributes to the lab-wide totals.
        if element[2]:
            summaries.append(element)
            nbroken_boards += 1
        ntotal_boards += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derive the last percentage from the other two so that the
        # three always add up to exactly 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing was counted; avoid a ZeroDivisionError and report
        # an all-zero summary.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % ntotal_boards,
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
782
783
# Fixed preamble placed at the top of the "pool inventory" e-mail
# message, ahead of the per-pool status tables.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
792
793
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    For each critical pool, the message holds a table with one row
    per board giving the number of broken, idle, and working DUTs,
    and the total, for that board in that pool.  A board is listed
    for a pool only if it has at least one broken or idle DUT there;
    rows are ordered by descending number of broken DUTs.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    row_fmt = '%-20s %5d %5d %5d %5d'
    lines = [_POOL_INVENTORY_HEADER]
    # Every pool section except the first is set off from the
    # preceding one by an extra newline.
    separator = ''
    for pool in CRITICAL_POOLS:
        lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
                '%-20s %5s %5s %5s %5s' % (
                    'Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.by_board.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Boards at full strength are not reported.
            if broken != 0 or idle != 0:
                rows.append((board, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
        else:
            rows.sort(key=lambda row: -row[1])
            lines.extend([row_fmt % row for row in rows])
        separator = '\n'
    return '\n'.join(lines)
838
839
# Fixed preamble placed at the top of the "idle inventory" section
# of the pool e-mail, ahead of the list of idle hosts.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
846
847
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists every idle host, one per line, together
    with its board and pool.  Hosts are gathered pool by pool over
    all managed pools.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             row_fmt % ('Hostname', 'Board', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.by_board.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, board, pool))
    if not idle_rows:
        lines.append('(No idle DUTs)')
    else:
        lines.extend([row_fmt % row for row in idle_rows])
    return '\n'.join(lines)
878
879
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message, prefixed with its `To:` and `Subject:` lines, is
    saved in the selected log directory using `tag` for the file
    name, and is then sent to the recipients.  A failure to write
    the file or to send the e-mail is logged; neither failure
    prevents the other step, and neither is raised to the caller.

    If the --debug option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized so the statement parses the same way whether
        # or not `print` is a function; the output is identical.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` guarantees the file is closed even if the write
        # fails part way through.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Best effort: a failure to send must not abort the run.
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
918
919
def _separate_email_addresses(address_list):
    """Flatten comma-separated address strings into one list.

    Each string in `address_list` is split on commas, and leading
    and trailing whitespace is stripped from every piece.  Empty
    pieces (for instance from a trailing comma) are kept as empty
    strings.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    return [address.strip()
            for arg in address_list
            for address in arg.split(',')]
932
933
def _verify_arguments(arguments):
    """Validate command-line arguments.

    The `--board-notify` and `--pool-notify` values, each a list of
    comma separated e-mail address strings, are replaced in place
    with flat lists of individual addresses.

    Outside of debug mode, at least one of the two notification
    lists must be non-empty; otherwise an error message is written
    to stderr and the arguments are rejected.  In debug mode, when
    neither list was given, it is treated as "run all the reports".

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -0700969
970
def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The result is the `_LOGDIR` subdirectory of the parent of the
    directory that contains `script`.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    return os.path.join(os.path.dirname(script_dir), _LOGDIR)
984
985
def _parse_command(argv):
    """Parse the command line arguments.

    Builds the argument parser for this command, parses everything
    after `argv[0]`, and then validates the result with
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if the arguments failed validation.

    """
    command = argv[0]
    parser = argparse.ArgumentParser(
            prog=command,
            description='Gather and report lab inventory statistics')
    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help=duration_help)
    parser.add_argument('--board-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=('Generate board inventory message, '
                              'and send it to the given e-mail '
                              'address(es)'))
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=('Generate pool inventory message, '
                              'and send it to the given address(es)'))
    parser.add_argument('-r', '--recommend', type=int, default=None,
                        help=('Specify how many DUTs should be '
                              'recommended for repair (default: no '
                              'recommendation)'))
    parser.add_argument('--debug', action='store_true',
                        help=('Print e-mail messages on stdout '
                              'without sending them.'))
    parser.add_argument('--logdir', default=_get_logdir(command),
                        help='Directory where logs will be written.')
    parser.add_argument('boardnames', nargs='*',
                        metavar='BOARD',
                        help=('names of boards to report on '
                              '(default: all boards)'))
    arguments = parser.parse_args(argv[1:])
    if _verify_arguments(arguments):
        return arguments
    return None
1030
1031
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line.  Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option; the directory
    is created if it doesn't exist.  With the option, we write log
    messages to stdout; messages below INFO level are discarded.

    The log file is configured to rotate weekly (`when='W4'`) and
    to keep 13 backups, preserving ~3 months worth of history.

    Any handlers already installed on the root logger are removed,
    so that the handler configured here is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the underlying
    # list, and removing while iterating over it would skip every
    # other handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001072
1073
def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go.  However, in
    debug mode a human being may be watching the progress.  So, we
    force the first (expensive) queries to happen up front, and
    write a small ASCII progress bar to stdout:  one character per
    board, with `+` for every board whose index ends in 5, the tens
    digit for every tenth board, and `.` otherwise.  The total
    number of broken DUTs found is printed at the end.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    total_broken = 0
    for index, counts in enumerate(inventory.values(), 1):
        last_digit = index % 10
        if last_digit == 5:
            marker = '+'
        elif last_digit == 0:
            marker = '%d' % ((index // 10) % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001108
1109
def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, builds the lab
    inventory for the requested time window, and then generates and
    sends whichever of the board and pool reports were requested.
    Exits with status 1 if the command line is rejected.  Exceptions
    raised while gathering or reporting are logged, not propagated;
    a keyboard interrupt ends the run quietly.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        start_time = end_time - arguments.duration * 60 * 60
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            # The optional repair recommendation goes ahead of the
            # board inventory in the same e-mail.
            report_parts = []
            if arguments.recommend:
                report_parts.append(
                        _generate_repair_recommendation(
                                inventory, arguments.recommend)
                        + '\n\n\n')
            report_parts.append(
                    _generate_board_inventory_message(inventory))
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        ''.join(report_parts))

        if arguments.pool_notify:
            pool_message = _generate_pool_inventory_message(inventory)
            idle_message = _generate_idle_inventory_message(inventory)
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        '\n\n\n'.join([pool_message, idle_message]))
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1168
1169
def get_inventory(afe):
    """Create a lab inventory covering the last 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The inventory returned by
            `_LabInventory.create_inventory()`.

    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, one_day_ago, now)
1174
1175
def get_managed_boards(afe):
    """Return the managed boards found in a fresh lab inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return The result of `get_managed_boards()` on the inventory
            created by `get_inventory()`.

    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001178
1179
# Script entry point:  run the inventory with the full command line,
# including the program name expected in argv[0].
if __name__ == '__main__':
    main(sys.argv)