#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Create e-mail reports of the Lab's DUT inventory.

Gathers a list of all DUTs of interest in the Lab, segregated by
model and pool, and determines whether each DUT is working or
broken. Then, send one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.

usage: lab_inventory.py [ options ] [ model ... ]

Options:
--duration / -d <hours>
    How far back in time to search job history to determine DUT
    status.

--model-notify <address>[,<address>]
    Send the "model status" e-mail to all the specified e-mail
    addresses.

--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend <number>
    When generating the "model status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.

--report-untestable
    Scan the inventory for DUTs that can't test because they're stuck in
    repair loops, or because the scheduler can't give them work.

--logdir <directory>
    Log progress and actions in a file under this directory. Text
    of any e-mail sent will also be logged in a timestamped file in
    this directory.

--debug
    Suppress all logging, metrics reporting, and sending e-mail.
    Instead, write the output that would be generated onto stdout.

<model> arguments:
    With no arguments, gathers the status for all models in the lab.
    With one or more named models on the command line, restricts
    reporting to just those models.
"""
50
51
import argparse
import collections
import datetime
import logging
import logging.handlers
import os
import re
import sys
import time

# The abstract container classes live in `collections.abc` since
# Python 3.3, and the old `collections.Mapping` aliases were removed in
# Python 3.10.  Fall back to `collections` itself under Python 2.
try:
    from collections import abc as collections_abc
except ImportError:
    collections_abc = collections

import common
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe.json_rpc import proxy
from autotest_lib.server import constants
from autotest_lib.server import site_utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.hosts import servo_host
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import gmail_lib
from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070073
74
# Pool names of interest, re-exported from the shared server constants
# so the rest of this module can refer to them by short name.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#   monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.
#   + 'board:veyron_rialto' due to crbug.com/854404

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',
                    'board:veyron_rialto'}
J. Richard Barnetteb8bc570c2016-03-17 17:03:57 -070088
# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time to search in order to determine
#     DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#     The number of repeated Repair tasks that must be seen to declare
#     that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4
126
J. Richard Barnette96db3492015-03-27 17:23:52 -0700127
# Common path prefix for every metric reported by this script.
_METRICS_PREFIX = 'chromeos/autotest/inventory'

_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    _METRICS_PREFIX + '/untestable',
    'DUTs that cannot be scheduled for testing')

_MISSING_DUT_METRIC = metrics.Counter(
    _METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'
    ' because they are invalid or deleted')

# strftime-style format with a resolution of one hour.
_TIMESTAMP_FORMAT = '%Y-%m-%d.%H'

# _Diagnosis - namedtuple corresponding to the return value from
# `HostHistory.last_diagnosis()`
_Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])
142
143
def _get_diagnosis(history):
    """Return the diagnosis for one DUT and record whether it was found.

    Wraps `history.last_diagnosis()` in a `_Diagnosis` tuple.  A
    `BROKEN` diagnosis whose task ended before `history.start_time`
    is reported as `UNUSED` instead.
    NOTE(review): presumably because the breakage predates the
    reporting window - confirm against `status_history`.

    If the lookup raises `proxy.JSONRPCException`, the error is logged
    and a `_Diagnosis(None, None)` is returned, so that callers can
    still read `.status`; such a DUT matches none of the working,
    broken or idle categories.

    `_MISSING_DUT_METRIC` is incremented on every call, with the
    `presence` field saying whether the lookup succeeded.

    @param history  The `HostJobHistory` object for the DUT.
    @return A `_Diagnosis` tuple of `(status, task)`.
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            return _Diagnosis(status_history.UNUSED, diagnosis.task)
        return diagnosis
    except proxy.JSONRPCException as e:
        logging.warning(e)
        dut_present = False
    finally:
        # Runs on every exit path, including the early returns above.
        _MISSING_DUT_METRIC.increment(
            fields={'host': history.hostname, 'presence': dut_present})
    # Only reached after a failed lookup.  Falling off the end would
    # return `None` and make every caller's `.status` access raise.
    return _Diagnosis(None, None)
Richard Barnette1ca30e62018-04-09 16:45:58 -0700159
def _host_is_working(history):
    """Return whether the DUT's diagnosis is `status_history.WORKING`.

    @param history  The `HostJobHistory` object for the DUT.
    """
    return _get_diagnosis(history).status == status_history.WORKING
Richard Barnettee8eee312018-04-27 13:12:04 -0400162
163
def _host_is_broken(history):
    """Return whether the DUT's diagnosis is `status_history.BROKEN`.

    @param history  The `HostJobHistory` object for the DUT.
    """
    return _get_diagnosis(history).status == status_history.BROKEN
Richard Barnettee8eee312018-04-27 13:12:04 -0400166
167
def _host_is_idle(history):
    """Return whether the DUT's diagnosis counts as idle.

    A DUT is idle when its status is either `status_history.UNUSED`
    or `status_history.UNKNOWN`.

    @param history  The `HostJobHistory` object for the DUT.
    """
    idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
    return _get_diagnosis(history).status in idle_statuses
Richard Barnettee8eee312018-04-27 13:12:04 -0400171
172
class _HostSetInventory(object):
    """Maintains a set of related `HostJobHistory` objects.

    Current usage of this class is that all DUTs are part of a single
    scheduling pool of DUTs for a single model; however, this class
    makes no assumptions about the actual relationship among the DUTs.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.

    Performance note:  Certain methods in this class are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first time any one of these methods is called, it causes
    multiple RPC calls with a relatively expensive set of database
    queries.  However, the results of the queries are cached in the
    individual `HostJobHistory` objects, so only the first call
    actually pays the full cost.

    Additionally, `get_working_list()`, `get_broken_list()` and
    `get_idle_list()` cache their return values to avoid recalculating
    lists at every call; this caching is separate from the caching of
    RPC results described above.

    This class is deliberately constructed to delay the RPC cost until
    the accessor methods are called (rather than to query in
    `record_host()`) so that it's possible to construct a complete
    `_LabInventory` without making the expensive queries at creation
    time.  `_populate_model_counts()`, below, assumes this behavior.
    """

    def __init__(self):
        self._histories = []
        # Per-category caches; `None` means "not yet calculated".
        self._working_list = None
        self._broken_list = None
        self._idle_list = None

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Adding a host discards the cached category lists, so they are
        recalculated on the next query.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        self._working_list = None
        self._broken_list = None
        self._idle_list = None
        self._histories.append(host_history)

    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Filter `self._histories` for histories where the DUT is
        diagnosed as working.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._working_list is None:
            self._working_list = [h for h in self._histories
                                  if _host_is_working(h)]
        return self._working_list

    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())

    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Filter `self._histories` for histories where the DUT is
        diagnosed as broken.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._broken_list is None:
            self._broken_list = [h for h in self._histories
                                 if _host_is_broken(h)]
        return self._broken_list

    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())

    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Filter `self._histories` for histories where the DUT is
        diagnosed as idle.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._idle_list is None:
            self._idle_list = [h for h in self._histories
                               if _host_is_idle(h)]
        return self._idle_list

    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())

    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)

    def get_all_histories(self):
        """Return the list of every recorded `HostJobHistory` object.

        This is the internal list itself, not a copy.
        """
        return self._histories
290
J. Richard Barnette96db3492015-03-27 17:23:52 -0700291
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create one empty `_HostSetInventory` per pool.

        @param pools  Iterable of the pool names to be tracked.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host is filed under its `host_pool`; a pool that wasn't
        passed to the constructor raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def get_working_list(self):
        """Return a list of all working DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as working.

        @return A list of HostJobHistory objects.
        """
        l = []
        for p in self._histories_by_pool.values():
            l.extend(p.get_working_list())
        return l

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self):
        """Return a list of all broken DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as broken.

        @return A list of HostJobHistory objects.
        """
        l = []
        for p in self._histories_by_pool.values():
            l.extend(p.get_broken_list())
        return l

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through the HostJobHistory objects of the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool: The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.
        """
        if pool is None:
            l = []
            for p in self._histories_by_pool.values():
                l.extend(p.get_idle_list())
            return l
        return self._histories_by_pool[pool].get_idle_list()

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool holding the spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Iterate over every `HostJobHistory` in the selected pool(s).

        @param pool  The pool to be walked.  If `None`, walk all pools.

        @return A generator yielding HostJobHistory objects.
        """
        if pool is None:
            for p in self._histories_by_pool.values():
                for h in p.get_all_histories():
                    yield h
        else:
            for h in self._histories_by_pool[pool].get_all_histories():
                yield h
445
J. Richard Barnette96db3492015-03-27 17:23:52 -0700446
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    A host is eligible if it has a (unique) 'model' label, it's in
    exactly one pool, and it has no labels from the
    `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.
    @return A true value if the host is eligible, false otherwise.
    """
    # DUTs without an existing, unique 'model' or 'pool' label
    # aren't meant to exist in the managed inventory; their presence
    # generally indicates an error in the database.  Unfortunately
    # such errors have been seen to occur from time to time.
    #
    # The _LabInventory constructor requires hosts to conform to the
    # label restrictions, and may fail if they don't.  Failing an
    # inventory run for a single bad entry is the wrong thing, so we
    # ignore the problem children here, to keep them out of the
    # inventory.
    models = [l for l in afehost.labels
              if l.startswith(constants.Labels.MODEL_PREFIX)]
    pools = [l for l in afehost.labels
             if l.startswith(constants.Labels.POOL_PREFIX)]
    excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
    return len(models) == 1 and len(pools) == 1 and not excluded
472
473
class _LabInventory(collections_abc.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=()):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty,
                            include all available models.
        @return A `_LabInventory` object for the specified models.
        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(
                        h for h in afehosts if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)

    def __init__(self, histories, pools):
        """Build the per-model inventory from a list of histories.

        @param histories  List of `HostJobHistory` objects to record.
        @param pools      Pool names handed to each model's
                          `_PoolSetInventory`.
        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}

    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)

    def __len__(self):
        return self._modeldata.__len__()

    def __iter__(self):
        return self._modeldata.__iter__()

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)

    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool  The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.items() if h.get_total(pool)}

    def get_boards(self):
        """Return the set of `host_board` values seen in the inventory."""
        return self._boards
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800553
554
def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Yields the contents of `inventory.items()` filtered to include
    only reportable models.  A model is reportable if it has DUTs in
    both `spare_pool` and at least one other pool.

    @param inventory   Mapping of model name to pool-set inventory.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.items():
        spares = poolset.get_total(spare_pool)
        total = poolset.get_total()
        # Skip models with no spares, and models that are all spares.
        if spares != 0 and spares != total:
            yield model, poolset
569
570
def _all_dut_histories(inventory):
    """Iterate over every DUT history in `inventory`.

    @param inventory  Mapping whose values provide
                      `get_all_histories()`.
    @return A generator yielding each history of each value in turn.
    """
    for poolset in inventory.values():
        for h in poolset.get_all_histories():
            yield h
575
576
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` are left
    out of the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so there is no upper
    limit on any of the three numbers.

    @param inventory_list  List of `HostJobHistory` objects to sort.
    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location only; DUTs with equal locations keep
        # their input order, and histories are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([history for _, history in dut_list])
    return return_list
611
612
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
                          Must not be empty; `min()` raises
                          `ValueError` otherwise.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    _NMODELS = 1000
    pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, pools)
    # Calculate the buffer count each model would have after repair.
    # The model names are discarded, as they don't contribute to the
    # final score.
    new_counts = []
    for model, count in buffer_counts.iteritems():
        if model in repair_inventory:
            count += repair_inventory[model].get_total()
        new_counts.append(count)
    # Find the worst available spares count, and count how many times
    # that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst
J. Richard Barnettef6839282015-06-01 16:00:35 -0700664
665
def _generate_repair_recommendation(inventory, num_recommend):
    """Build the text recommending specific broken DUTs for repair.

    The recommended DUTs are chosen according to these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    buffers_by_model = {}
    broken_duts = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() == 0:
            continue
        buffers_by_model[model] = counts.get_spares_buffer()
        broken_duts.extend(counts.get_broken_list())
    # For each lab, slide a window of `num_recommend` DUTs across the
    # location-sorted list, and keep the best scoring window.  The
    # first window is scored before the inner loop so that a lab with
    # fewer than `num_recommend` broken DUTs still gets a candidate.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_duts):
        lab_slice = lab_duts[0 : num_recommend]
        lab_score = _score_repair_set(buffers_by_model, lab_slice)
        for start in range(1, len(lab_duts) - num_recommend + 1):
            candidate = lab_duts[start : start + num_recommend]
            candidate_score = _score_repair_set(buffers_by_model,
                                                candidate)
            if candidate_score > lab_score:
                lab_slice = candidate
                lab_score = candidate_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        event = _get_diagnosis(h).task
        if servo_present:
            servo_text = 'Yes'
        else:
            servo_text = 'No'
        message.append(line_fmt % (h.host.hostname, h.host_model,
                                   servo_text, event.job_url))
    return '\n'.join(message)
739
740
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.  Only models with at least one broken DUT
    appear in the per-model table, but all reportable models contribute
    to the summary totals.

    If there are no reportable DUTs at all, the summary reports zero
    for every count and percentage.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail   Bad  Idle  Good  Spare  Total
        #      e[0]   e[1]  e[2]  e[3]  e[4]   e[5]   e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; within that, most broken DUTs first.
    summaries.sort(key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures always sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing to report on; avoid dividing by zero.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
        ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
809
810
# Introductory text for the "pool inventory" e-mail; it is used as the
# first element of the message assembled by
# `_generate_pool_inventory_message()`.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
819
820
def _generate_pool_inventory_message(inventory):
    """Build the text of the "pool inventory" e-mail message.

    For each pool in `CRITICAL_POOLS`, the message has a section
    listing, by model, the number of broken, idle, and working DUTs
    and the total in that pool.  A model is listed only if it has at
    least one broken or idle DUT in the pool; models are ordered with
    the largest broken count first.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section after the first is preceded by a blank line.
        if index:
            separator = '\n'
        else:
            separator = ''
        lines.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            nbroken = counts.get_broken(pool)
            nidle = counts.get_idle(pool)
            # Models at full strength are not reported.
            if nbroken or nidle:
                rows.append((model, nbroken, nidle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        for row in rows:
            lines.append('%-20s %5d %5d %5d %5d' % row)
    return '\n'.join(lines)
863
864
# Introductory text for the "idle inventory" report; it is used as the
# first element of the message assembled by
# `_generate_idle_inventory_message()`.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
871
872
def _generate_idle_inventory_message(inventory):
    """Build the text of the "idle inventory" e-mail message.

    The idle inventory lists each idle host along with its model and
    pool.  Hosts are gathered pool by pool over `MANAGED_POOLS`, and
    within a pool, model by model.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             '%-30s %-20s %s' % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        lines.append('(No idle DUTs)')
    else:
        for row in idle_rows:
            lines.append('%-30s %-20s %s' % row)
    return '\n'.join(lines)
901
902
J. Richard Barnette96db3492015-03-27 17:23:52 -0700903def _send_email(arguments, tag, subject, recipients, body):
904 """Send an inventory e-mail message.
905
Richard Barnette5de01eb2017-12-15 09:53:42 -0800906 The message is logged in the selected log directory using `tag` for
907 the file name.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700908
Richard Barnette5de01eb2017-12-15 09:53:42 -0800909 If the --debug option was requested, the message is neither logged
910 nor sent, but merely printed on stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700911
912 @param arguments Parsed command-line options.
913 @param tag Tag identifying the inventory for logging
914 purposes.
915 @param subject E-mail Subject: header line.
916 @param recipients E-mail addresses for the To: header line.
917 @param body E-mail message body.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700918 """
919 logging.debug('Generating email: "%s"', subject)
920 all_recipients = ', '.join(recipients)
921 report_body = '\n'.join([
922 'To: %s' % all_recipients,
923 'Subject: %s' % subject,
924 '', body, ''])
J. Richard Barnette02e82432015-10-13 16:02:47 -0700925 if arguments.debug:
J. Richard Barnette96db3492015-03-27 17:23:52 -0700926 print report_body
927 else:
928 filename = os.path.join(arguments.logdir, tag)
929 try:
930 report_file = open(filename, 'w')
931 report_file.write(report_body)
932 report_file.close()
933 except EnvironmentError as e:
934 logging.error('Failed to write %s: %s', filename, e)
935 try:
936 gmail_lib.send_email(all_recipients, subject, body)
937 except Exception as e:
938 logging.error('Failed to send e-mail to %s: %s',
939 all_recipients, e)
940
941
Richard Barnette5de01eb2017-12-15 09:53:42 -0800942def _populate_model_counts(inventory):
943 """Gather model counts while providing interactive feedback.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700944
945 Gathering the status of all individual DUTs in the lab can take
946 considerable time (~30 minutes at the time of this writing).
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700947 Normally, we pay that cost by querying as we go. However, with
948 the `--debug` option, we expect a human being to be watching the
Richard Barnette5de01eb2017-12-15 09:53:42 -0800949 progress in real time. So, we force the first (expensive) queries
950 to happen up front, and provide simple ASCII output on sys.stdout
951 to show a progress bar and results.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700952
Richard Barnette5de01eb2017-12-15 09:53:42 -0800953 @param inventory `_LabInventory` object from which to gather
954 counts.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700955 """
956 n = 0
957 total_broken = 0
Richard Barnette5de01eb2017-12-15 09:53:42 -0800958 for counts in inventory.itervalues():
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700959 n += 1
960 if n % 10 == 5:
961 c = '+'
962 elif n % 10 == 0:
963 c = '%d' % ((n / 10) % 10)
964 else:
965 c = '.'
966 sys.stdout.write(c)
967 sys.stdout.flush()
Richard Barnette5de01eb2017-12-15 09:53:42 -0800968 # This next call is where all the time goes - it forces all of a
969 # model's `HostJobHistory` objects to query the database and
970 # cache their results.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700971 total_broken += counts.get_broken()
972 sys.stdout.write('\n')
973 sys.stdout.write('Found %d broken DUTs\n' % total_broken)
974
975
def _perform_model_inventory(arguments, inventory, timestamp):
    """Generate and send the model inventory report.

    The body of the report is made up of these parts, in order:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    The report goes to the addresses in `arguments.model_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    body_parts = []
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
            inventory, arguments.recommend)
        # Blank lines set the recommendations apart from the
        # inventory that follows.
        body_parts.append(recommendation + '\n\n\n')
    body_parts.append(_generate_model_inventory_message(inventory))
    tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, tag, subject, arguments.model_notify,
                ''.join(body_parts))
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001003
1004
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Generate and send the pool inventory report.

    The body of the report is made up of these parts, in order:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    The report goes to the addresses in `arguments.pool_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [_generate_pool_inventory_message(inventory),
                _generate_idle_inventory_message(inventory)]
    tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, tag, subject, arguments.pool_notify,
                '\n\n\n'.join(sections))
1027
1028
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.  The history is scanned until either
    evidence against a loop turns up, or `_REPAIR_LOOP_THRESHOLD`
    successful repairs have been seen.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns  Return a true value if the DUT's most recent history
              indicates a repair loop.  The result is a false value
              otherwise (`None` if the history runs out before a
              decision is reached).
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if _get_diagnosis(history).task.name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        # A test ran, or a repair failed; either way, not a loop.
        if not task.is_special or task.diagnosis == status_history.BROKEN:
            return False
        if task.name == 'Repair':
            # Repair tasks reaching this point did not fail.
            successful_repairs += 1
            if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
                return True
        elif task.diagnosis == status_history.WORKING:
            # Non-repair task succeeded, so we're not looping.
            return False
        # Anything else is a non-repair task that didn't succeed; keep
        # scanning.
1073
1074
def _report_untestable_dut(history, state):
    """Log an untestable DUT and flag it in the presence metric.

    Writes an info-level log message naming the DUT's hostname, model,
    and pool along with `state`, then sets
    `_UNTESTABLE_PRESENCE_METRIC` to true with those same values as
    the metric fields.

    @param history  History object for the DUT; supplies `hostname`,
                    `host_model`, and `host_pool`.
    @param state    Value to be reported in the 'state' field.
    """
    fields = dict(dut_hostname=history.hostname,
                  model=history.host_model,
                  pool=history.host_pool,
                  state=state)
    logging.info('DUT in state %(state)s: %(dut_hostname)s, '
                 'model: %(model)s, pool: %(pool)s', fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001085
Richard Barnettecf5d8342017-10-24 18:13:11 -07001086
def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is stuck in a repair loop; that is, it regularly passes
        repair, but never passes other operations.
      * The DUT runs no tasks at all, but is not locked.

    This routine walks through the given inventory looking for DUTs in
    either of these states.  Results are reported via a Monarch presence
    metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match
        # _HOSTNAME_PATTERN shouldn't be possible.  However, we
        # don't want arbitrary strings being attached to the
        # 'dut_hostname' field, so for safety, we exclude all
        # anomalies.
        if not _HOSTNAME_PATTERN.match(history.hostname):
            continue
        untestable_state = None
        if _host_is_working(history):
            if _dut_in_repair_loop(history):
                untestable_state = 'repair_loop'
        elif _host_is_idle(history) and not history.host.locked:
            untestable_state = 'idle_unlocked'
        if untestable_state is not None:
            _report_untestable_dut(history, untestable_state)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001122
1123
def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug log messages announcing the start of the run and which
    reports the command-line arguments call for.  Return a string
    based on `startup_time` that will be used to identify this run in
    log files and e-mail messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns  A timestamp string that will be used to identify this run
              in logs and email output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime(_TIMESTAMP_FORMAT, local_start)
    logging.debug('Starting lab inventory for %s', timestamp)
    announcements = []
    if arguments.model_notify:
        # Repair recommendations are only ever part of the model
        # inventory report.
        if arguments.recommend:
            announcements.append('Will include repair recommendations')
        announcements.append('Will include model inventory')
    if arguments.pool_notify:
        announcements.append('Will include pool inventory')
    for announcement in announcements:
        logging.debug(announcement)
    return timestamp
1146
1147
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours leading up to
    `end_time`, and is restricted according to
    `arguments.modelnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns  The newly created `_LabInventory` object.
    """
    # `arguments.duration` is in hours; timestamps are in seconds.
    seconds_of_history = arguments.duration * 60 * 60
    start_time = end_time - seconds_of_history
    afe = frontend_wrappers.RetryingAFE(server=None)
    lab_inventory = _LabInventory.create_inventory(
        afe, start_time, end_time, arguments.modelnames)
    num_duts = lab_inventory.get_num_duts()
    num_models = lab_inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return lab_inventory
1162
1163
def _perform_inventory_reports(arguments):
    """Run every inventory report requested on the command line.

    Build the inventory once, as of the current time, then hand it to
    each report that the parsed command-line arguments call for.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    now = time.time()
    run_timestamp = _log_startup(arguments, now)
    lab_inventory = _create_inventory(arguments, now)
    if arguments.debug:
        # Gather the counts up front, with progress output on stdout.
        _populate_model_counts(lab_inventory)
    if arguments.model_notify:
        _perform_model_inventory(arguments, lab_inventory, run_timestamp)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab_inventory, run_timestamp)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab_inventory)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001184
1185
J. Richard Barnette96db3492015-03-27 17:23:52 -07001186def _separate_email_addresses(address_list):
1187 """Parse a list of comma-separated lists of e-mail addresses.
1188
1189 @param address_list A list of strings containing comma
1190 separate e-mail addresses.
1191 @return A list of the individual e-mail addresses.
J. Richard Barnette96db3492015-03-27 17:23:52 -07001192 """
1193 newlist = []
1194 for arg in address_list:
1195 newlist.extend([email.strip() for email in arg.split(',')])
1196 return newlist
1197
1198
def _verify_arguments(arguments):
    """Validate and normalize the parsed command-line arguments.

    The `--model-notify` and `--pool-notify` values, which may each
    be given several times and may each hold comma separated
    addresses, are replaced in `arguments` with flat lists of
    individual addresses.

    At least one of `--model-notify`, `--pool-notify`, or
    `--report-untestable` must be requested, unless `--debug` is on.
    With `--debug` and no report requested, both notify lists are
    set to a list holding one empty string, so that both e-mail
    reports are generated.

    On failure, an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True

    if arguments.debug:
        # Nothing was asked for explicitly, so run the e-mail
        # reports.  A report is skipped when its notify list is
        # empty, so give each list a placeholder entry.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001235
1236
def _get_default_logdir(script):
    """Compute the default value for the `--logdir` option.

    The result is `_LOGDIR` joined onto the parent of the directory
    that holds `script`.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    return os.path.join(os.path.dirname(script_dir), _LOGDIR)
1249
1250
def _parse_command(argv):
    """Parse and validate the command line.

    Builds the `ArgumentParser` describing this command's options,
    parses `argv[1:]` with it, and then checks the result with
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return The object returned by `ArgumentParser.parse_args()`, or
            `None` if `_verify_arguments()` rejects the arguments.
    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = parser.add_argument

    add_option('-d', '--duration',
               type=int,
               default=_DEFAULT_DURATION,
               metavar='HOURS',
               help='number of hours back to search for status'
                    ' (default: %d)' % _DEFAULT_DURATION)
    add_option('--model-notify',
               action='append',
               default=[],
               metavar='ADDRESS',
               help='Generate model inventory message, '
                    'and send it to the given e-mail address(es)')
    add_option('--pool-notify',
               action='append',
               default=[],
               metavar='ADDRESS',
               help='Generate pool inventory message, '
                    'and send it to the given address(es)')
    add_option('-r', '--recommend',
               type=int,
               default=None,
               help=('Specify how many DUTs should be '
                     'recommended for repair (default: no '
                     'recommendation)'))
    add_option('--report-untestable',
               action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug',
               action='store_true',
               help='Print e-mail, metrics messages on stdout '
                    'without sending them.')
    # `store_false` means `use_metrics` is true unless the option
    # is given.
    add_option('--no-metrics',
               action='store_false',
               dest='use_metrics',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir',
               default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames',
               nargs='*',
               metavar='MODEL',
               help='names of models to report on '
                    '(default: all models)')

    arguments = parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None
1299
1300
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we log at DEBUG level to a file under
        `arguments.logdir`.  The directory is created if it doesn't
        exist.  The file's name is `_LOGFILE` followed by the current
        date and time formatted with `_TIMESTAMP_FORMAT`; the file is
        opened for appending.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we log to stdout and restrict the output to
        INFO level.

    In both cases messages are formatted with `_LOG_FORMAT` and
    `time_utils.TIME_FMT`, and any handlers previously installed on
    the root logger are removed so that ours is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
    else:
        root_logger.setLevel(logging.DEBUG)
        # The directory has to exist before a file can be opened
        # inside it.
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        logpath = os.path.join(
                arguments.logdir,
                _LOGFILE + datetime.datetime.today().strftime(
                        _TIMESTAMP_FORMAT))
        # `FileHandler` opens the file for appending.  A bare
        # `open(logpath)` would open it read-only, which fails if
        # the file is missing and can't be written to otherwise.
        handler = logging.FileHandler(logpath)
    formatter = logging.Formatter(
            _LOG_FORMAT, time_utils.TIME_FMT)
    handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the list, and
    # removing entries from a list while iterating over it skips
    # elements.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001345
1346
def main(argv):
    """Standard main routine.

    Parses the command line, exiting with status 1 if the arguments
    are rejected, then configures logging and runs the requested
    inventory reports.

    When metrics are enabled (`use_metrics`), the reports run inside
    `site_utils.SetupTsMonGlobalState()`.  The run is timed, a "tick"
    counter is incremented with a `success` field saying whether the
    reports completed, and the metrics are flushed.  With `--debug`,
    '/dev/null' is passed as the metrics debug file.

    A `KeyboardInterrupt` ends the run quietly.  Any other exception
    is logged and then re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if not arguments.use_metrics:
            _perform_inventory_reports(arguments)
        else:
            metrics_file = None
            if arguments.debug:
                logging.info('Debug mode: Will not report metrics to monarch.')
                metrics_file = '/dev/null'
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False):
                succeeded = False
                try:
                    timer_name = '%s/duration' % _METRICS_PREFIX
                    with metrics.SecondsTimer(timer_name):
                        _perform_inventory_reports(arguments)
                    succeeded = True
                finally:
                    # Count the run whether or not it succeeded, and
                    # flush explicitly because auto-flush is off.
                    tick_counter = metrics.Counter(
                            '%s/tick' % _METRICS_PREFIX)
                    tick_counter.increment(fields={'success': succeeded})
                    metrics.Flush()
    except KeyboardInterrupt:
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise
J. Richard Barnette96db3492015-03-27 17:23:52 -07001384
1385
def get_inventory(afe):
    """Create an inventory covering the 24 hours up to now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by
            `_LabInventory.create_inventory()`.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - seconds_per_day, now)
1390
1391
def get_managed_boards(afe):
    """Return the boards reported by a freshly created inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return Whatever the inventory's `get_boards()` method returns.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001394
1395
# Run the command only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)