blob: d43e12d7abb56ca1feaeba65662943f1e6fa498b [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
Richard Barnette5de01eb2017-12-15 09:53:42 -08009model and pool, and determines whether each DUT is working or
J. Richard Barnette96db3492015-03-27 17:23:52 -070010broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
Richard Barnette5de01eb2017-12-15 09:53:42 -080013usage: lab_inventory.py [ options ] [ model ... ]
J. Richard Barnette96db3492015-03-27 17:23:52 -070014
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
Richard Barnette5de01eb2017-12-15 09:53:42 -080020--model-notify <address>[,<address>]
21 Send the "model status" e-mail to all the specified e-mail
J. Richard Barnette96db3492015-03-27 17:23:52 -070022 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
J. Richard Barnette1df6a562015-06-09 10:06:17 -070028--recommend <number>
Richard Barnette5de01eb2017-12-15 09:53:42 -080029 When generating the "model status" e-mail, include a list of
J. Richard Barnette1df6a562015-06-09 10:06:17 -070030 <number> specific DUTs to be recommended for repair.
31
Richard Barnette1ca30e62018-04-09 16:45:58 -070032--report-untestable
33 Scan the inventory for DUTs that can't test because they're stuck in
34 repair loops, or because the scheduler can't give them work.
Richard Barnettecf5d8342017-10-24 18:13:11 -070035
J. Richard Barnette96db3492015-03-27 17:23:52 -070036--logdir <directory>
37 Log progress and actions in a file under this directory. Text
38 of any e-mail sent will also be logged in a timestamped file in
39 this directory.
40
J. Richard Barnette02e82432015-10-13 16:02:47 -070041--debug
Richard Barnettecf5d8342017-10-24 18:13:11 -070042 Suppress all logging, metrics reporting, and sending e-mail.
43 Instead, write the output that would be generated onto stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -070044
Richard Barnette5de01eb2017-12-15 09:53:42 -080045<model> arguments:
46 With no arguments, gathers the status for all models in the lab.
47 With one or more named models on the command line, restricts
48 reporting to just those models.
J. Richard Barnette96db3492015-03-27 17:23:52 -070049
50"""
51
52
53import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080054import collections
J. Richard Barnette96db3492015-03-27 17:23:52 -070055import logging
56import logging.handlers
57import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070058import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070059import sys
60import time
61
62import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070063from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070064from autotest_lib.client.common_lib import time_utils
Xixuan Wu93e646c2017-12-07 18:36:10 -080065from autotest_lib.server import constants
Richard Barnettecf5d8342017-10-24 18:13:11 -070066from autotest_lib.server import site_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070067from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070068from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070069from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070070from autotest_lib.site_utils import gmail_lib
Richard Barnettecf5d8342017-10-24 18:13:11 -070071from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070072
73
# Pool names of interest, re-exported from the shared server constants.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS:
#     Labels that disqualify a DUT from monitoring by this script.
#     Currently excluded:
#       + 'adb' - We're not ready to monitor Android or Brillo hosts.
#       + 'board:guado_moblab' - These are maintained by a separate
#         process that doesn't use this script.

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

# _DEFAULT_DURATION:
#     Default for the --duration command line option:  how far back
#     in time to search job history in order to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used to calculate the default setting for the
#     --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of the file that receives general log information.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# _HOSTNAME_PATTERN:
#     Pattern for location-based host names in the Chrome OS test
#     labs.  The four captured groups are, in order:
#       * The lab (room), physically separated from other labs
#         (i.e. there's a door).
#       * The row (or aisle) of DUTs within the lab.
#       * The vertical rack of shelves on the row.
#       * The specific host on one shelf of the rack.

_HOSTNAME_PATTERN = re.compile(r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#     The number of repeated Repair tasks that must be seen to declare
#     that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4


_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    'chromeos/autotest/inventory/untestable',
    'DUTs that cannot be scheduled for testing')
128
129
class _HostSetInventory(object):
    """Hold a group of related `HostJobHistory` objects.

    The recorded DUTs are split into the disjoint categories
    "working", "broken", and "idle".  For each category there is an
    accessor returning the list of DUTs, and one returning the count.

    Performance note:  These methods are potentially expensive:
      * `get_working()` / `get_working_list()`
      * `get_broken()` / `get_broken_list()`
      * `get_idle()` / `get_idle_list()`
    The first call to any of them triggers multiple RPC calls with a
    relatively expensive set of database queries.  The query results
    are cached inside the individual `HostJobHistory` objects, so only
    the first call pays the full cost.

    Separately from that RPC caching, the three `get_*_list()` methods
    remember the list they computed, so that the filtering isn't
    repeated at every call.  Recording a new host discards those
    remembered lists.

    The RPC cost is deliberately deferred until an accessor is called
    (rather than being paid in `record_host()`), so that a complete
    `_LabInventory` can be built without making the expensive queries
    at creation time.  `_populate_model_counts()` assumes this
    behavior.

    In current usage all DUTs in one instance belong to a single
    scheduling pool; however, this class makes no assumptions about
    the actual relationship among the DUTs.
    """

    def __init__(self):
        self._histories = []
        self._reset_caches()


    def _reset_caches(self):
        """Forget every remembered category list."""
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def _select(self, statuses):
        """Return the histories whose last diagnosis is in `statuses`.

        @param statuses  Container of acceptable diagnosis values.
        @return A new list of HostJobHistory objects.

        """
        return [history for history in self._histories
                if history.last_diagnosis()[0] in statuses]


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        # Any list computed earlier is now stale.
        self._reset_caches()
        self._histories.append(host_history)


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the histories whose last diagnosis is `WORKING`.  The
        result is remembered, so it is calculated only once.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is None:
            self._working_list = self._select((status_history.WORKING,))
        return self._working_list


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the histories whose last diagnosis is `BROKEN`.  The
        result is remembered, so it is calculated only once.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is None:
            self._broken_list = self._select((status_history.BROKEN,))
        return self._broken_list


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the histories whose last diagnosis is `UNUSED` or
        `UNKNOWN`.  The result is remembered, so it is calculated only
        once.

        @return A list of HostJobHistory objects.

        """
        if self._idle_list is None:
            self._idle_list = self._select(
                    {status_history.UNUSED, status_history.UNKNOWN})
        return self._idle_list


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)
257
258
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create one empty `_HostSetInventory` per pool.

        @param pools  Iterable of the pool names to be tracked.

        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host is filed under its `host_pool`; that pool must be one
        of the pools given at construction, otherwise the lookup raises
        `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])


    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        This is the list-returning counterpart of `_count_pool()`.

        @param get_pool_list  Function to return a list of
                              `HostJobHistory` objects from a
                              `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list combining all pools.

        """
        if pool is not None:
            return get_pool_list(self._histories_by_pool[pool])
        duts = []
        for host_set in self._histories_by_pool.values():
            duts.extend(get_pool_list(host_set))
        return duts


    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the selected pool(s).

        Selects the HostJobHistory objects where the last diagnosis is
        `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)


    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the selected pool(s).

        Selects the HostJobHistory objects where the last diagnosis is
        `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the selected pool(s).

        Selects the HostJobHistory objects where the last diagnosis is
        `UNUSED` or `UNKNOWN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)


    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool to be treated as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700417
418
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of the labels listed in
    `_EXCLUDED_LABELS`.

    @param afehost  The host to be tested for eligibility.
    @return `True` if the host should be monitored, else `False`.
    """
    # A DUT lacking a unique 'model' or 'pool' label isn't meant to
    # exist in the managed inventory; finding one generally indicates
    # an error in the database.  Unfortunately, such errors have been
    # seen to occur from time to time.
    #
    # The _LabInventory constructor requires hosts to conform to the
    # label restrictions, and may fail if they don't.  Failing a whole
    # inventory run because of one bad entry is the wrong thing, so
    # the problem children are filtered out here instead.
    def count_with_prefix(prefix):
        return sum(1 for label in afehost.labels
                   if label.startswith(prefix))

    if _EXCLUDED_LABELS.intersection(afehost.labels):
        return False
    if count_with_prefix(constants.Labels.MODEL_PREFIX) != 1:
        return False
    return count_with_prefix(constants.Labels.POOL_PREFIX) == 1
444
445
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all eligible DUTs in the `MANAGED_POOLS` list.  If `modellist`
        is supplied, the inventory will be restricted to only the given
        models.  Each host appears in the inventory at most once, even
        if a model is named more than once in `modellist`.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory` objects.
        @param end_time    End time for the `HostJobHistory` objects.
        @param modellist   List of models to include.  If empty or
                           `None`, include all available models.
        @return A `_LabInventory` object for the specified models.

        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Hosts are filtered in a single pass so that a host is
            # never recorded (and counted) more than once.
            wanted_labels = {constants.Labels.MODEL_PREFIX + model
                             for model in modellist}
            afehosts = [h for h in afehosts
                        if wanted_labels.intersection(h.labels)]
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)


    def __init__(self, histories, pools):
        """Build the inventory from a list of histories.

        @param histories  List of `HostJobHistory` objects to be
                          recorded, grouped by their `host_model`.
        @param pools      Pools to be tracked for every model; passed
                          to each `_PoolSetInventory`.

        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}


    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)


    def __len__(self):
        return self._modeldata.__len__()


    def __iter__(self):
        return self._modeldata.__iter__()


    def reportable_items(self, spare_pool=SPARE_POOL):
        """Iterate over all items subject to reporting.

        Yields the contents of `self.iteritems()` filtered to include
        only reportable models.  A model is reportable if it has DUTs in
        both `spare_pool` and at least one other pool.

        @param spare_pool  The spare pool to be tested for reporting.
        """
        for model, histories in self.iteritems():
            spares = histories.get_total(spare_pool)
            total = histories.get_total()
            # spares == total means there are no DUTs outside the
            # spare pool.
            if spares != 0 and spares != total:
                yield model, histories


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)


    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool  The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.iteritems() if h.get_total(pool)}


    def get_boards(self):
        """Return the set of `host_board` values seen in the inventory."""
        return self._boards
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800550
551
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.  DUTs whose hostname doesn't
    match `_HOSTNAME_PATTERN` have no known location, and are omitted
    from the result.

    Implementation note: locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is correct
    regardless of how large the individual numbers are.

    @param inventory_list  List of `HostJobHistory` objects to sort.
    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location only; history objects are never compared.
        dut_list.sort(key=lambda entry: entry[0])
        return_list.append([history for _, history in dut_list])
    return return_list
587
588
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    DUTs in `repair_list` whose model isn't in `buffer_counts` don't
    affect the score.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
                          Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    @raises ValueError if `buffer_counts` is empty.
    """
    _NMODELS = 1000
    if not buffer_counts:
        raise ValueError('buffer_counts must name at least one model')
    # Each repaired DUT adds one to the buffer of its model.
    repairs_by_model = collections.Counter(
            h.host_model for h in repair_list)
    # The model names don't contribute to the final score, so only
    # the post-repair counts are kept.
    new_counts = [count + repairs_by_model[model]
                  for model, count in buffer_counts.items()]
    # Find the worst available spares count, and how many models
    # share that worst case.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst
J. Richard Barnettef6839282015-06-01 16:00:35 -0700640
641
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended. If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    If no candidate set of DUTs is found (for instance, because no
    reportable model has a broken DUT), the message consists of the
    header lines only.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in inventory.reportable_items():
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` DUTs across this lab's
        # list, keeping the best-scoring window for the lab.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory: Without it,
    # Gmail will parse the URL wrong. Don't ask. If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    # `recommendation` is still None when the loop above never ran;
    # iterate over nothing in that case rather than raising TypeError.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_model,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
713
714
Richard Barnette5de01eb2017-12-15 09:53:42 -0800715def _generate_model_inventory_message(inventory):
716 """Generate the "model inventory" e-mail message.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700717
Richard Barnette5de01eb2017-12-15 09:53:42 -0800718 The model inventory is a list by model summarizing the number of
719 working, broken, and idle DUTs, and the total shortfall or surplus
J. Richard Barnette96db3492015-03-27 17:23:52 -0700720 of working devices relative to the minimum critical pool
721 requirement.
722
Richard Barnette5de01eb2017-12-15 09:53:42 -0800723 The report omits models with no DUTs in the spare pool or with no
724 DUTs in a critical pool.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700725
726 N.B. For sample output text formattted as users can expect to
727 see it in e-mail and log files, refer to the unit tests.
728
Richard Barnette5de01eb2017-12-15 09:53:42 -0800729 @param inventory `_LabInventory` object to be reported on.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700730 @return String with the inventory message to be sent.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700731 """
Richard Barnette5de01eb2017-12-15 09:53:42 -0800732 logging.debug('Creating model inventory')
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700733 nworking = 0
734 nbroken = 0
xixuan12ce04f2016-03-10 13:16:30 -0800735 nidle = 0
Richard Barnette5de01eb2017-12-15 09:53:42 -0800736 nbroken_models = 0
737 ntotal_models = 0
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700738 summaries = []
Richard Barnette5de01eb2017-12-15 09:53:42 -0800739 column_names = (
740 'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
741 for model, counts in inventory.reportable_items():
742 logging.debug('Counting %2d DUTS for model %s',
743 counts.get_total(), model)
744 # Summary elements laid out in the same order as the column
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700745 # headers:
Richard Barnette5de01eb2017-12-15 09:53:42 -0800746 # Model Avail Bad Idle Good Spare Total
xixuan12ce04f2016-03-10 13:16:30 -0800747 # e[0] e[1] e[2] e[3] e[4] e[5] e[6]
Richard Barnette5de01eb2017-12-15 09:53:42 -0800748 element = (model,
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700749 counts.get_spares_buffer(),
750 counts.get_broken(),
xixuan12ce04f2016-03-10 13:16:30 -0800751 counts.get_idle(),
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700752 counts.get_working(),
Kevin Chengcf0ad2b2016-04-19 14:51:39 -0700753 counts.get_total(SPARE_POOL),
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700754 counts.get_total())
J. Richard Barnetteea5a4ba2016-02-18 16:34:50 -0800755 if element[2]:
756 summaries.append(element)
Richard Barnette5de01eb2017-12-15 09:53:42 -0800757 nbroken_models += 1
758 ntotal_models += 1
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700759 nbroken += element[2]
xixuan12ce04f2016-03-10 13:16:30 -0800760 nidle += element[3]
761 nworking += element[4]
762 ntotal = nworking + nbroken + nidle
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700763 summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700764 broken_percent = int(round(100.0 * nbroken / ntotal))
xixuan12ce04f2016-03-10 13:16:30 -0800765 idle_percent = int(round(100.0 * nidle / ntotal))
766 working_percent = 100 - broken_percent - idle_percent
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700767 message = ['Summary of DUTs in inventory:',
xixuan12ce04f2016-03-10 13:16:30 -0800768 '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
769 '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700770 nbroken, broken_percent,
xixuan12ce04f2016-03-10 13:16:30 -0800771 nidle, idle_percent,
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700772 nworking, working_percent,
773 ntotal),
774 '',
Richard Barnette5de01eb2017-12-15 09:53:42 -0800775 'Models with failures: %d' % nbroken_models,
776 'Models in inventory: %d' % ntotal_models,
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700777 '', '',
Richard Barnette5de01eb2017-12-15 09:53:42 -0800778 'Full model inventory:\n',
779 '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
J. Richard Barnette96db3492015-03-27 17:23:52 -0700780 message.extend(
xixuan12ce04f2016-03-10 13:16:30 -0800781 ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700782 return '\n'.join(message)
783
784
# Preamble text for the "pool inventory" e-mail; it is the first
# element of the message built by `_generate_pool_inventory_message()`.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
793
794
def _generate_pool_inventory_message(inventory):
    """Build the body of the "pool inventory" e-mail.

    For each pool in `CRITICAL_POOLS`, the message has a section
    listing, by model, the number of broken, idle, and working DUTs and
    the pool total. A model appears in a section only if it has at
    least one broken or idle DUT in that pool; rows are ordered by
    descending broken count. A section with no rows reports that all
    models are at full strength.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    # Every section after the first is preceded by a blank line.
    separator = ''
    for pool in CRITICAL_POOLS:
        lines.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            nbroken = counts.get_broken(pool)
            nidle = counts.get_idle(pool)
            # Models at full strength are not reported.
            if nbroken or nidle:
                nworking = counts.get_working(pool)
                ntotal = counts.get_total(pool)
                rows.append((model, nbroken, nidle, nworking, ntotal))
        if not rows:
            lines.append('(All models at full strength)')
        else:
            rows.sort(key=lambda row: -row[1])
            for row in rows:
                lines.append('%-20s %5d %5d %5d %5d' % row)
        separator = '\n'
    return '\n'.join(lines)
837
838
# Preamble text for the idle DUT report; it is the first element of
# the message built by `_generate_idle_inventory_message()`.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
845
846
def _generate_idle_inventory_message(inventory):
    """Build the body of the "idle inventory" e-mail.

    The idle inventory lists each idle host (`UNKNOWN` or `UNUSED`)
    together with its model and pool. Hosts are gathered pool by pool
    over `MANAGED_POOLS`, and within a pool, model by model.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             '%-30s %-20s %s' % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        lines.append('(No idle DUTs)')
    else:
        for row in idle_rows:
            lines.append('%-30s %-20s %s' % row)
    return '\n'.join(lines)
876
877
J. Richard Barnette96db3492015-03-27 17:23:52 -0700878def _send_email(arguments, tag, subject, recipients, body):
879 """Send an inventory e-mail message.
880
Richard Barnette5de01eb2017-12-15 09:53:42 -0800881 The message is logged in the selected log directory using `tag` for
882 the file name.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700883
Richard Barnette5de01eb2017-12-15 09:53:42 -0800884 If the --debug option was requested, the message is neither logged
885 nor sent, but merely printed on stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700886
887 @param arguments Parsed command-line options.
888 @param tag Tag identifying the inventory for logging
889 purposes.
890 @param subject E-mail Subject: header line.
891 @param recipients E-mail addresses for the To: header line.
892 @param body E-mail message body.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700893 """
894 logging.debug('Generating email: "%s"', subject)
895 all_recipients = ', '.join(recipients)
896 report_body = '\n'.join([
897 'To: %s' % all_recipients,
898 'Subject: %s' % subject,
899 '', body, ''])
J. Richard Barnette02e82432015-10-13 16:02:47 -0700900 if arguments.debug:
J. Richard Barnette96db3492015-03-27 17:23:52 -0700901 print report_body
902 else:
903 filename = os.path.join(arguments.logdir, tag)
904 try:
905 report_file = open(filename, 'w')
906 report_file.write(report_body)
907 report_file.close()
908 except EnvironmentError as e:
909 logging.error('Failed to write %s: %s', filename, e)
910 try:
911 gmail_lib.send_email(all_recipients, subject, body)
912 except Exception as e:
913 logging.error('Failed to send e-mail to %s: %s',
914 all_recipients, e)
915
916
Richard Barnette5de01eb2017-12-15 09:53:42 -0800917def _populate_model_counts(inventory):
918 """Gather model counts while providing interactive feedback.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700919
920 Gathering the status of all individual DUTs in the lab can take
921 considerable time (~30 minutes at the time of this writing).
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700922 Normally, we pay that cost by querying as we go. However, with
923 the `--debug` option, we expect a human being to be watching the
Richard Barnette5de01eb2017-12-15 09:53:42 -0800924 progress in real time. So, we force the first (expensive) queries
925 to happen up front, and provide simple ASCII output on sys.stdout
926 to show a progress bar and results.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700927
Richard Barnette5de01eb2017-12-15 09:53:42 -0800928 @param inventory `_LabInventory` object from which to gather
929 counts.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700930 """
931 n = 0
932 total_broken = 0
Richard Barnette5de01eb2017-12-15 09:53:42 -0800933 for counts in inventory.itervalues():
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700934 n += 1
935 if n % 10 == 5:
936 c = '+'
937 elif n % 10 == 0:
938 c = '%d' % ((n / 10) % 10)
939 else:
940 c = '.'
941 sys.stdout.write(c)
942 sys.stdout.flush()
Richard Barnette5de01eb2017-12-15 09:53:42 -0800943 # This next call is where all the time goes - it forces all of a
944 # model's `HostJobHistory` objects to query the database and
945 # cache their results.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700946 total_broken += counts.get_broken()
947 sys.stdout.write('\n')
948 sys.stdout.write('Found %d broken DUTs\n' % total_broken)
949
950
def _perform_model_inventory(arguments, inventory, timestamp):
    """Generate and send the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired. This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
                inventory, arguments.recommend)
        # Blank lines separate the recommendation from the inventory.
        sections.append(recommendation + '\n\n\n')
    sections.append(_generate_model_inventory_message(inventory))
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                ''.join(sections))
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700978
979
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Generate and send the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    report_parts = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                '\n\n\n'.join(report_parts))
1002
1003
Richard Barnettecf5d8342017-10-24 18:13:11 -07001004def _dut_in_repair_loop(history):
1005 """Return whether a DUT's history indicates a repair loop.
1006
1007 A DUT is considered looping if it runs no tests, and no tasks pass
1008 other than repair tasks.
1009
1010 @param history An instance of `status_history.HostJobHistory` to be
1011 scanned for a repair loop. The caller guarantees
1012 that this history corresponds to a working DUT.
1013 @returns Return a true value if the DUT's most recent history
1014 indicates a repair loop.
1015 """
1016 # Our caller passes only histories for working DUTs; that means
1017 # we've already paid the cost of fetching the diagnosis task, and
1018 # we know that the task was successful. The diagnosis task will be
1019 # one of the tasks we must scan to find a loop, so if the task isn't
1020 # a repair task, then our history includes a successful non-repair
1021 # task, and we're not looping.
1022 #
Richard Barnette1ca30e62018-04-09 16:45:58 -07001023 # The for loop below is very expensive, because it must fetch the
Richard Barnettecf5d8342017-10-24 18:13:11 -07001024 # full history, regardless of how many tasks we examine. At the
1025 # time of this writing, this check against the diagnosis task
1026 # reduces the cost of finding loops in the full inventory from hours
1027 # to minutes.
1028 if history.last_diagnosis()[1].name != 'Repair':
1029 return False
1030 repair_ok_count = 0
1031 for task in history:
1032 if not task.is_special:
1033 # This is a test, so we're not looping.
1034 return False
1035 if task.diagnosis == status_history.BROKEN:
1036 # Failed a repair, so we're not looping.
1037 return False
1038 if (task.diagnosis == status_history.WORKING
1039 and task.name != 'Repair'):
1040 # Non-repair task succeeded, so we're not looping.
1041 return False
1042 # At this point, we have either a failed non-repair task, or
1043 # a successful repair.
1044 if task.name == 'Repair':
1045 repair_ok_count += 1
1046 if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
1047 return True
1048
1049
def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    The DUT's hostname, model, and pool are written to the log; the
    same values plus `state` are attached as fields to
    `_UNTESTABLE_PRESENCE_METRIC`.

    @param history  Object describing the DUT; its `hostname`,
                    `host_model`, and `host_pool` attributes are read.
    @param state    String naming why the DUT is untestable.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    logging.info('Untestable DUT: %(dut_hostname)s, model: %(model)s, '
                 'pool: %(pool)s', fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001060
Richard Barnettecf5d8342017-10-24 18:13:11 -07001061
def _report_repair_loop_metrics(inventory):
    """Find and report DUTs stuck in a repair loop.

    Walk the working DUTs of every model in `inventory`, and report
    each one that `_dut_in_repair_loop()` identifies as looping.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for DUTs in repair loops.')
    for counts in inventory.itervalues():
        for history in counts.get_working_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible. However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies. The hostname is checked first so the
            # expensive loop scan is skipped for such DUTs.
            if (_HOSTNAME_PATTERN.match(history.hostname)
                    and _dut_in_repair_loop(history)):
                _report_untestable_dut(history, 'repair_loop')
1082
1083
def _report_idle_dut_metrics(inventory):
    """Find and report idle, unlocked DUTs.

    Walk the idle DUTs of every model in `inventory`, and report each
    one whose host is not locked.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for idle, unlocked DUTs.')
    for counts in inventory.itervalues():
        for history in counts.get_idle_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible. However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies.
            if (_HOSTNAME_PATTERN.match(history.hostname)
                    and not history.host.locked):
                _report_untestable_dut(history, 'idle_unlocked')
1104
1105
def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is stuck in a repair loop; that is, it regularly passes
        repair, but never passes other operations.
      * The DUT runs no tasks at all, but is not locked.

    This routine walks through the given inventory looking for DUTs in
    either of these states. Results are reported via a Monarch presence
    metric.

    Note: To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    # Repair loops are scanned first, then idle DUTs.
    for report in (_report_repair_loop_metrics,
                   _report_idle_dut_metrics):
        report(inventory)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001128
1129
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001130def _log_startup(arguments, startup_time):
1131 """Log the start of this inventory run.
1132
1133 Print various log messages indicating the start of the run. Return
1134 a string based on `startup_time` that will be used to identify this
1135 run in log files and e-mail messages.
1136
1137 @param startup_time A UNIX timestamp marking the moment when
1138 this inventory run began.
1139 @returns A timestamp string that will be used to identify this run
1140 in logs and email output.
1141 """
1142 timestamp = time.strftime('%Y-%m-%d.%H',
1143 time.localtime(startup_time))
1144 logging.debug('Starting lab inventory for %s', timestamp)
Richard Barnette5de01eb2017-12-15 09:53:42 -08001145 if arguments.model_notify:
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001146 if arguments.recommend:
1147 logging.debug('Will include repair recommendations')
Richard Barnette5de01eb2017-12-15 09:53:42 -08001148 logging.debug('Will include model inventory')
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001149 if arguments.pool_notify:
1150 logging.debug('Will include pool inventory')
1151 return timestamp
1152
1153
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours that end at
    `end_time`, restricted to `arguments.modelnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The newly created `_LabInventory` object.
    """
    # `duration` is in hours; timestamps are in seconds.
    start_time = end_time - arguments.duration * 60 * 60
    afe = frontend_wrappers.RetryingAFE(server=None)
    lab_inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    num_duts = lab_inventory.get_num_duts()
    num_models = lab_inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return lab_inventory
1168
1169
def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Create the initial inventory and run through the inventory reports
    as called for by the parsed command-line arguments.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    started_at = time.time()
    run_stamp = _log_startup(arguments, started_at)
    lab_inventory = _create_inventory(arguments, started_at)
    if arguments.debug:
        # Force the expensive queries up front, with a progress bar.
        _populate_model_counts(lab_inventory)
    if arguments.model_notify:
        _perform_model_inventory(arguments, lab_inventory, run_stamp)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab_inventory, run_stamp)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab_inventory)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001190
1191
J. Richard Barnette96db3492015-03-27 17:23:52 -07001192def _separate_email_addresses(address_list):
1193 """Parse a list of comma-separated lists of e-mail addresses.
1194
1195 @param address_list A list of strings containing comma
1196 separate e-mail addresses.
1197 @return A list of the individual e-mail addresses.
1198
1199 """
1200 newlist = []
1201 for arg in address_list:
1202 newlist.extend([email.strip() for email in arg.split(',')])
1203 return newlist
1204
1205
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--model-notify` and
    `--pool-notify` in separate option arguments into a single list;
    the lists on `arguments` are replaced in place.

    For non-debug uses, require that at least one inventory report be
    requested. For debug, if a report isn't specified, treat it as "run
    all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True
    if arguments.debug:
        # We want to run all the e-mail reports. An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001243
1244
def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` subdirectory of the parent of the
    directory containing this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
1258
1259
def _parse_command(argv):
    """Parse and validate the command line arguments.

    Builds the argument parser for this command, parses `argv`, and
    then checks the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if `_verify_arguments()` rejects the parsed arguments.

    """
    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    cli = argparse.ArgumentParser(
            description='Gather and report lab inventory statistics',
            prog=argv[0])

    # Options are registered in the order they should appear in the
    # `--help` output.
    cli.add_argument(
            '-d', '--duration',
            type=int,
            default=_DEFAULT_DURATION,
            metavar='HOURS',
            help=duration_help)
    cli.add_argument(
            '--model-notify',
            action='append',
            default=[],
            metavar='ADDRESS',
            help='Generate model inventory message, '
                 'and send it to the given e-mail address(es)')
    cli.add_argument(
            '--pool-notify',
            action='append',
            default=[],
            metavar='ADDRESS',
            help='Generate pool inventory message, '
                 'and send it to the given address(es)')
    cli.add_argument(
            '-r', '--recommend',
            type=int,
            default=None,
            help='Specify how many DUTs should be '
                 'recommended for repair (default: no '
                 'recommendation)')
    cli.add_argument(
            '--report-untestable',
            action='store_true',
            help='Check for devices unable to run tests.')
    cli.add_argument(
            '--debug-metrics',
            action='store_true',
            help='Include debug information about the metrics '
                 'that would be reported ')
    cli.add_argument(
            '--debug',
            action='store_true',
            help='Print e-mail messages on stdout '
                 'without sending them.')
    cli.add_argument(
            '--logdir',
            default=_get_default_logdir(argv[0]),
            help='Directory where logs will be written.')
    cli.add_argument(
            'modelnames',
            nargs='*',
            metavar='MODEL',
            help='names of models to report on '
                 '(default: all models)')

    parsed = cli.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None
1309
1310
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening, preserving
        ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    Every handler already installed on the root logger is removed, so
    that the handler created here is the only one left.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        # 'W4' rotates weekly on Friday; 13 backups is about 3 months.
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a snapshot:  removeHandler() mutates
    # `root_logger.handlers`, and removing entries from the list being
    # iterated would skip every other handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001354
1355
def main(argv):
    """Standard main routine.

    Parses the command line (exiting with status 1 if no usable
    arguments come back), configures logging, and then runs the
    inventory reports.  `KeyboardInterrupt` is silently ignored; any
    other exception from the report run is logged and not re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        with_metrics = arguments.debug_metrics or not arguments.debug
        if not with_metrics:
            # Plain `--debug` run:  no metrics setup at all.
            _perform_inventory_reports(arguments)
        else:
            if arguments.debug_metrics:
                metrics_file = '/dev/null'
            else:
                metrics_file = None
            ts_mon_state = site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False)
            with ts_mon_state:
                _perform_inventory_reports(arguments)
            # Set up with auto_flush=False, so flush explicitly.
            metrics.Flush()
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1381
1382
def get_inventory(afe):
    """Create an inventory covering the 24 hours ending now.

    @param afe  Passed through to `_LabInventory.create_inventory()`.
    @return The result of `_LabInventory.create_inventory()` for the
            time window from 24 hours ago up to the current time.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - seconds_per_day, now)
1387
1388
def get_managed_boards(afe):
    """Return the boards reported by the inventory for `afe`.

    @param afe  Passed through to `get_inventory()`.
    @return The result of calling `get_boards()` on that inventory.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001391
1392
# Script entry point:  run the command with the full argument vector
# (including the program name) only when executed directly, not when
# this module is imported.
if __name__ == '__main__':
    main(sys.argv)