blob: 8bcfd43cd57ff1aff3cca9ccb7b9110d057a79da [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
5
"""Create e-mail reports of the Lab's DUT inventory.

Gathers a list of all DUTs of interest in the Lab, segregated by
model and pool, and determines whether each DUT is working or
broken.  Then, sends one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.

usage:  lab_inventory.py [ options ] [ model ... ]

Options:
--duration / -d <hours>
    How far back in time to search job history to determine DUT
    status.

--model-notify <address>[,<address>]
    Send the "model status" e-mail to all the specified e-mail
    addresses.

--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend <number>
    When generating the "model status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.

--report-untestable
    Scan the inventory for DUTs that can't test because they're stuck in
    repair loops, or because the scheduler can't give them work.

--logdir <directory>
    Log progress and actions in a file under this directory.  Text
    of any e-mail sent will also be logged in a timestamped file in
    this directory.

--debug
    Suppress all logging, metrics reporting, and sending e-mail.
    Instead, write the output that would be generated onto stdout.

<model> arguments:
    With no arguments, gathers the status for all models in the lab.
    With one or more named models on the command line, restricts
    reporting to just those models.
"""
50
51
52import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080053import collections
Jacob Kopczynskief6c92e2018-08-09 11:05:37 -070054import datetime
J. Richard Barnette96db3492015-03-27 17:23:52 -070055import logging
56import logging.handlers
57import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070058import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070059import sys
60import time
61
62import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070063from autotest_lib.client.bin import utils
Richard Barnette6f6ce322018-09-07 16:23:20 +000064from autotest_lib.client.common_lib import time_utils
Jacob Kopczynski7c4c9542018-08-13 17:24:41 -070065from autotest_lib.frontend.afe.json_rpc import proxy
Xixuan Wu93e646c2017-12-07 18:36:10 -080066from autotest_lib.server import constants
Richard Barnettecf5d8342017-10-24 18:13:11 -070067from autotest_lib.server import site_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070068from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070069from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070070from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070071from autotest_lib.site_utils import gmail_lib
Richard Barnettecf5d8342017-10-24 18:13:11 -070072from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070073
74
# Pool names, re-exported from `constants.Pools` for brevity.
# `MANAGED_POOLS` selects which pools are inventoried; `SPARE_POOL` is
# the default pool used when computing spares buffers.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.
#   + 'board:veyron_rialto' due to crbug.com/854404

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',
                    'board:veyron_rialto'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time (in hours) to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack, host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#    The number of repeated Repair tasks that must be seen to declare
#    that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4


# Monitoring metrics; all metric names share `_METRICS_PREFIX`.
_METRICS_PREFIX = 'chromeos/autotest/inventory'
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    _METRICS_PREFIX + '/untestable',
    'DUTs that cannot be scheduled for testing')

# Incremented on every diagnosis lookup, with a `presence` field that
# records whether the lookup succeeded.
_MISSING_DUT_METRIC = metrics.Counter(
    _METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'
    ' because they are invalid or deleted')

# strftime-style format with hour granularity.
# NOTE(review): the consumer is not visible in this part of the file;
# presumably used to timestamp report/log files -- confirm.
_TIMESTAMP_FORMAT = '%Y-%m-%d.%H'
138
Jacob Kopczynski7c4c9542018-08-13 17:24:41 -0700139def _get_diagnosis_safely(history, prop='diagnosis'):
140 return_prop = {'diagnosis': 0, 'task': 1}[prop]
141 dut_present = True
142 try:
143 return history.last_diagnosis()[return_prop]
144 except proxy.JSONRPCException as e:
145 logging.warn(e)
146 dut_present = False
147 finally:
148 _MISSING_DUT_METRIC.increment(
149 fields={'host': history.hostname, 'presence': dut_present})
Richard Barnette1ca30e62018-04-09 16:45:58 -0700150
def _host_is_working(history):
    """Return whether the DUT's last diagnosis is `WORKING`.

    @param history  History object passed to `_get_diagnosis_safely()`.
    @return `True` if the diagnosis equals `status_history.WORKING`.
    """
    diagnosis = _get_diagnosis_safely(history)
    return diagnosis == status_history.WORKING
Richard Barnettee8eee312018-04-27 13:12:04 -0400153
154
def _host_is_broken(history):
    """Return whether the DUT's last diagnosis is `BROKEN`.

    @param history  History object passed to `_get_diagnosis_safely()`.
    @return `True` if the diagnosis equals `status_history.BROKEN`.
    """
    diagnosis = _get_diagnosis_safely(history)
    return diagnosis == status_history.BROKEN
Richard Barnettee8eee312018-04-27 13:12:04 -0400157
158
def _host_is_idle(history):
    """Return whether the DUT's last diagnosis counts as idle.

    A DUT is idle when its diagnosis is either `UNUSED` or `UNKNOWN`.

    @param history  History object passed to `_get_diagnosis_safely()`.
    @return `True` if the diagnosis is one of the idle statuses.
    """
    idle_statuses = (status_history.UNUSED, status_history.UNKNOWN)
    diagnosis = _get_diagnosis_safely(history)
    return diagnosis in idle_statuses
Richard Barnettee8eee312018-04-27 13:12:04 -0400162
163
Richard Barnette5de01eb2017-12-15 09:53:42 -0800164class _HostSetInventory(object):
165 """Maintains a set of related `HostJobHistory` objects.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700166
Richard Barnettee8eee312018-04-27 13:12:04 -0400167 Current usage of this class is that all DUTs are part of a single
168 scheduling pool of DUTs for a single model; however, this class make
169 no assumptions about the actual relationship among the DUTs.
170
Richard Barnette5de01eb2017-12-15 09:53:42 -0800171 The collection is segregated into disjoint categories of "working",
172 "broken", and "idle" DUTs. Accessor methods allow finding both the
173 list of DUTs in each category, as well as counts of each category.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700174
J. Richard Barnettef6839282015-06-01 16:00:35 -0700175 Performance note: Certain methods in this class are potentially
176 expensive:
177 * `get_working()`
178 * `get_working_list()`
179 * `get_broken()`
180 * `get_broken_list()`
xixuan12ce04f2016-03-10 13:16:30 -0800181 * `get_idle()`
182 * `get_idle_list()`
J. Richard Barnettef6839282015-06-01 16:00:35 -0700183 The first time any one of these methods is called, it causes
184 multiple RPC calls with a relatively expensive set of database
185 queries. However, the results of the queries are cached in the
186 individual `HostJobHistory` objects, so only the first call
187 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700188
xixuan12ce04f2016-03-10 13:16:30 -0800189 Additionally, `get_working_list()`, `get_broken_list()` and
190 `get_idle_list()` cache their return values to avoid recalculating
Richard Barnette5de01eb2017-12-15 09:53:42 -0800191 lists at every call; this caching is separate from the caching of
192 RPC results described above.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700193
Richard Barnette5de01eb2017-12-15 09:53:42 -0800194 This class is deliberately constructed to delay the RPC cost until
195 the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700196 `record_host()`) so that it's possible to construct a complete
197 `_LabInventory` without making the expensive queries at creation
Richard Barnette5de01eb2017-12-15 09:53:42 -0800198 time. `_populate_model_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700199 """
200
201 def __init__(self):
202 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700203 self._working_list = None
204 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800205 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700206
J. Richard Barnette96db3492015-03-27 17:23:52 -0700207 def record_host(self, host_history):
208 """Add one `HostJobHistory` object to the collection.
209
210 @param host_history The `HostJobHistory` object to be
211 remembered.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700212 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700213 self._working_list = None
214 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800215 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700216 self._histories.append(host_history)
217
J. Richard Barnettef6839282015-06-01 16:00:35 -0700218 def get_working_list(self):
219 """Return a list of all working DUTs in the pool.
220
Richard Barnettee8eee312018-04-27 13:12:04 -0400221 Filter `self._histories` for histories where the DUT is
222 diagnosed as working.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700223
224 Cache the result so that we only cacluate it once.
225
226 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700227 """
228 if self._working_list is None:
229 self._working_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400230 if _host_is_working(h)]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700231 return self._working_list
232
J. Richard Barnette96db3492015-03-27 17:23:52 -0700233 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700234 """Return the number of working DUTs in the pool."""
235 return len(self.get_working_list())
236
J. Richard Barnettef6839282015-06-01 16:00:35 -0700237 def get_broken_list(self):
238 """Return a list of all broken DUTs in the pool.
239
Richard Barnettee8eee312018-04-27 13:12:04 -0400240 Filter `self._histories` for histories where the DUT is
241 diagnosed as broken.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700242
243 Cache the result so that we only cacluate it once.
244
245 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700246 """
247 if self._broken_list is None:
248 self._broken_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400249 if _host_is_broken(h)]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700250 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700251
J. Richard Barnette96db3492015-03-27 17:23:52 -0700252 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700253 """Return the number of broken DUTs in the pool."""
254 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700255
xixuan12ce04f2016-03-10 13:16:30 -0800256 def get_idle_list(self):
257 """Return a list of all idle DUTs in the pool.
258
Richard Barnettee8eee312018-04-27 13:12:04 -0400259 Filter `self._histories` for histories where the DUT is
260 diagnosed as idle.
xixuan12ce04f2016-03-10 13:16:30 -0800261
262 Cache the result so that we only cacluate it once.
263
264 @return A list of HostJobHistory objects.
xixuan12ce04f2016-03-10 13:16:30 -0800265 """
xixuan12ce04f2016-03-10 13:16:30 -0800266 if self._idle_list is None:
267 self._idle_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400268 if _host_is_idle(h)]
xixuan12ce04f2016-03-10 13:16:30 -0800269 return self._idle_list
270
xixuan12ce04f2016-03-10 13:16:30 -0800271 def get_idle(self):
272 """Return the number of idle DUTs in the pool."""
273 return len(self.get_idle_list())
274
J. Richard Barnette96db3492015-03-27 17:23:52 -0700275 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700276 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700277 return len(self._histories)
278
Richard Barnettee8eee312018-04-27 13:12:04 -0400279 def get_all_histories(self):
280 return self._histories
281
J. Richard Barnette96db3492015-03-27 17:23:52 -0700282
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty `_HostSetInventory` for each pool.

        @param pools  Iterable of the pool names that may be recorded.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        @param host_history  The `HostJobHistory` object to be
                             remembered.
        @raises KeyError if `host_history.host_pool` is not one of the
                pools given to the constructor.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(hostset)
                       for hostset in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list combining all pools.
        """
        if pool is not None:
            return get_pool_list(self._histories_by_pool[pool])
        combined = []
        for hostset in self._histories_by_pool.values():
            combined.extend(get_pool_list(hostset))
        return combined

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as working.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as broken.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool that holds the spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Iterate over every recorded history, regardless of status.

        This is a generator; an unknown `pool` raises `KeyError` only
        once iteration starts.

        @param pool  The pool to iterate.  If `None`, iterate over
                     the histories of all pools.

        @return An iterator over HostJobHistory objects.
        """
        if pool is None:
            hostsets = self._histories_by_pool.values()
        else:
            hostsets = [self._histories_by_pool[pool]]
        for hostset in hostsets:
            for history in hostset.get_all_histories():
                yield history
436
J. Richard Barnette96db3492015-03-27 17:23:52 -0700437
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    Eligibility requires all of the following:
      * exactly one label with the 'model' prefix,
      * exactly one label with the 'pool' prefix,
      * no label that appears in `_EXCLUDED_LABELS`.

    @param afehost  The host to be tested for eligibility.
    @return `True` if the host should be monitored, else `False`.
    """
    # Hosts lacking a unique 'model' or 'pool' label shouldn't be in
    # the managed inventory at all; when they show up it usually means
    # the database has an error, which unfortunately does happen from
    # time to time.
    #
    # The _LabInventory constructor needs hosts to satisfy the label
    # restrictions and may fail otherwise.  Aborting a whole inventory
    # run over one bad entry would be the wrong response, so such hosts
    # are filtered out here instead.
    labels = afehost.labels
    model_count = sum(
        1 for label in labels
        if label.startswith(constants.Labels.MODEL_PREFIX))
    pool_count = sum(
        1 for label in labels
        if label.startswith(constants.Labels.POOL_PREFIX))
    has_excluded = bool(_EXCLUDED_LABELS.intersection(labels))
    if model_count != 1 or pool_count != 1:
        return False
    return not has_excluded
463
464
465class _LabInventory(collections.Mapping):
J. Richard Barnette96db3492015-03-27 17:23:52 -0700466 """Collection of `HostJobHistory` objects for the Lab's inventory.
467
Richard Barnette5de01eb2017-12-15 09:53:42 -0800468 This is a dict-like collection indexed by model. Indexing returns
469 the _PoolSetInventory object associated with the model.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700470 """
471
472 @classmethod
Richard Barnette5de01eb2017-12-15 09:53:42 -0800473 def create_inventory(cls, afe, start_time, end_time, modellist=[]):
J. Richard Barnette96db3492015-03-27 17:23:52 -0700474 """Return a Lab inventory with specified parameters.
475
Richard Barnette5de01eb2017-12-15 09:53:42 -0800476 By default, gathers inventory from `HostJobHistory` objects for
477 all DUTs in the `MANAGED_POOLS` list. If `modellist` is
478 supplied, the inventory will be restricted to only the given
479 models.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700480
Richard Barnette5de01eb2017-12-15 09:53:42 -0800481 @param afe AFE object for constructing the
482 `HostJobHistory` objects.
483 @param start_time Start time for the `HostJobHistory` objects.
484 @param end_time End time for the `HostJobHistory` objects.
485 @param modellist List of models to include. If empty,
486 include all available models.
487 @return A `_LabInventory` object for the specified models.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700488 """
Richard Barnette5de01eb2017-12-15 09:53:42 -0800489 target_pools = MANAGED_POOLS
490 label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
J. Richard Barnette96db3492015-03-27 17:23:52 -0700491 afehosts = afe.get_hosts(labels__name__in=label_list)
Richard Barnette5de01eb2017-12-15 09:53:42 -0800492 if modellist:
J. Richard Barnetteb8bc570c2016-03-17 17:03:57 -0700493 # We're deliberately not checking host eligibility in this
494 # code path. This is a debug path, not used in production;
495 # it may be useful to include ineligible hosts here.
Richard Barnette5de01eb2017-12-15 09:53:42 -0800496 modelhosts = []
497 for model in modellist:
498 model_label = constants.Labels.MODEL_PREFIX + model
J. Richard Barnette96db3492015-03-27 17:23:52 -0700499 host_list = [h for h in afehosts
Richard Barnette5de01eb2017-12-15 09:53:42 -0800500 if model_label in h.labels]
501 modelhosts.extend(host_list)
502 afehosts = modelhosts
J. Richard Barnetteb8bc570c2016-03-17 17:03:57 -0700503 else:
Richard Barnette3a404492018-02-08 13:57:01 -0800504 afehosts = [h for h in afehosts if _eligible_host(h)]
J. Richard Barnette96db3492015-03-27 17:23:52 -0700505 create = lambda host: (
506 status_history.HostJobHistory(afe, host,
507 start_time, end_time))
Richard Barnette5de01eb2017-12-15 09:53:42 -0800508 return cls([create(host) for host in afehosts], target_pools)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700509
Richard Barnette5de01eb2017-12-15 09:53:42 -0800510 def __init__(self, histories, pools):
511 models = {h.host_model for h in histories}
512 self._modeldata = {model: _PoolSetInventory(pools) for model in models}
J. Richard Barnette96db3492015-03-27 17:23:52 -0700513 self._dut_count = len(histories)
Richard Barnette5de01eb2017-12-15 09:53:42 -0800514 for h in histories:
515 self[h.host_model].record_host(h)
516 self._boards = {h.host_board for h in histories}
Prathmesh Prabhu154cb2b2017-11-08 17:36:51 -0800517
Richard Barnette5de01eb2017-12-15 09:53:42 -0800518 def __getitem__(self, key):
519 return self._modeldata.__getitem__(key)
Prathmesh Prabhu021e7842017-11-08 18:05:45 -0800520
Richard Barnette5de01eb2017-12-15 09:53:42 -0800521 def __len__(self):
522 return self._modeldata.__len__()
523
Richard Barnette5de01eb2017-12-15 09:53:42 -0800524 def __iter__(self):
525 return self._modeldata.__iter__()
526
J. Richard Barnette96db3492015-03-27 17:23:52 -0700527 def get_num_duts(self):
528 """Return the total number of DUTs in the inventory."""
529 return self._dut_count
530
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800531 def get_num_models(self):
532 """Return the total number of models in the inventory."""
Richard Barnette5de01eb2017-12-15 09:53:42 -0800533 return len(self)
534
Richard Barnette5de01eb2017-12-15 09:53:42 -0800535 def get_pool_models(self, pool):
536 """Return all models in `pool`.
537
538 @param pool The pool to be inventoried for models.
539 """
540 return {m for m, h in self.iteritems() if h.get_total(pool)}
541
Richard Barnette5de01eb2017-12-15 09:53:42 -0800542 def get_boards(self):
543 return self._boards
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800544
545
def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Yields the `(model, poolset)` items of `inventory` filtered to
    include only reportable models.  A model is reportable if it has
    DUTs in both `spare_pool` and at least one other pool.

    @param inventory   Mapping from model name to an object providing
                       `get_total(pool=None)`.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    # `items()` works on both Python 2 and Python 3 mappings.
    for model, poolset in inventory.items():
        spares = poolset.get_total(spare_pool)
        total = poolset.get_total()
        # Skip models with no spares and models whose DUTs are all
        # in the spare pool.
        if spares != 0 and spares != total:
            yield model, poolset
560
561
562def _all_dut_histories(inventory):
563 for poolset in inventory.itervalues():
564 for h in poolset.get_all_histories():
565 yield h
566
567
J. Richard Barnettef6839282015-06-01 16:00:35 -0700568def _sort_by_location(inventory_list):
569 """Return a list of DUTs, organized by location.
570
571 Take the given list of `HostJobHistory` objects, separate it
572 into a list per lab, and sort each lab's list by location. The
573 order of sorting within a lab is
574 * By row number within the lab,
575 * then by rack number within the row,
576 * then by host shelf number within the rack.
577
578 Return a list of the sorted lists.
579
580 Implementation note: host locations are sorted by converting
581 each location into a base 100 number. If row, rack or
582 host numbers exceed the range [0..99], then sorting will
583 break down.
584
585 @return A list of sorted lists of DUTs.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700586 """
587 BASE = 100
588 lab_lists = {}
589 for history in inventory_list:
590 location = _HOSTNAME_PATTERN.match(history.host.hostname)
591 if location:
592 lab = location.group(1)
593 key = 0
594 for idx in location.group(2, 3, 4):
595 key = BASE * key + int(idx)
596 lab_lists.setdefault(lab, []).append((key, history))
597 return_list = []
598 for dut_list in lab_lists.values():
599 dut_list.sort(key=lambda t: t[0])
600 return_list.append([t[1] for t in dut_list])
601 return return_list
602
603
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    For every model in `buffer_counts`, the model's buffer count is
    increased by the number of DUTs of that model present in
    `repair_list`.  The resulting set of counts is then scored with
    two numbers:
      * The worst (lowest) buffer count for any model; higher is
        better.  This is the more significant number.
      * The number of models sharing that worst count; lower is
        better.  This is the less significant number.

    Implementation note: The two numbers are folded into one integer
    as `1000 * worst - num_worst`, so the score could fail to reflect
    the intended criteria if there are more than 1000 models.

    @param buffer_counts  A dictionary mapping model names to the
                          size of the model's spares buffer.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    _NMODELS = 1000
    repair_pools = {h.host_pool for h in repair_list}
    repaired = _LabInventory(repair_list, repair_pools)
    # Model names don't contribute to the score, so only the
    # post-repair counts are kept.
    counts_after_repair = []
    for model, buffer_count in buffer_counts.iteritems():
        gained = repaired[model].get_total() if model in repaired else 0
        counts_after_repair.append(buffer_count + gained)
    # The smallest count is the worst case; count how often it occurs.
    ordered = sorted(counts_after_repair)
    worst_count = ordered[0]
    num_worst = ordered.count(worst_count)
    return _NMODELS * worst_count - num_worst
J. Richard Barnettef6839282015-06-01 16:00:35 -0700655
656
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The selection slides a window of `num_recommend` consecutive DUTs
    over each lab's location-sorted list of broken DUTs, scores every
    window with `_score_repair_set()`, and keeps the highest scoring
    window across all labs.  On a tie, the earliest window wins.

    If no broken DUT can be placed in a lab (there are no broken DUTs,
    or none has a hostname that `_sort_by_location()` can parse), the
    message contains only the header followed by a note saying there
    is nothing to repair.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Model', 'Servo?', 'Logs URL')]
    if recommendation:
        for h in recommendation:
            servo_name = servo_host.make_servo_hostname(h.host.hostname)
            servo_present = utils.host_is_in_lab_zone(servo_name)
            event = _get_diagnosis_safely(h, 'task')
            line = line_fmt % (
                    h.host.hostname, h.host_model,
                    'Yes' if servo_present else 'No', event.job_url)
            message.append(line)
    else:
        # The loop above never ran (or only produced empty slices), so
        # there is nothing to list.  Previously this case crashed on
        # iterating over `None`.
        message.append('(No DUTs to repair)')
    return '\n'.join(message)
727
728
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Only models produced by `_reportable_models()` are counted.  The
    overall summary covers all of those models; the per-model table
    lists only the models that have at least one broken DUT, ordered
    by ascending spares buffer and then by descending broken count.

    If there are no DUTs to count at all, every percentage in the
    summary is reported as 0.

    N.B. For sample output text formattted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail   Bad  Idle  Good  Spare Total
        #     e[0]   e[1]    e[2] e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only models with broken DUTs appear in the detailed table.
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derive the last percentage by subtraction so that the three
        # always add up to 100 despite rounding.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # No DUTs were counted; avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
797
798
# Preamble text that `_generate_pool_inventory_message()` places at the
# start of the "pool inventory" message.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
807
808
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The message starts with `_POOL_INVENTORY_HEADER`, followed by one
    section for each pool in `CRITICAL_POOLS`.  A section is a table
    by model of the broken, idle, working and total DUT counts in
    that pool.  Only models with at least one broken or idle DUT in
    the pool are listed, ordered by descending broken count; if there
    are none, the section says all models are at full strength.

    N.B. For sample output text formattted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    header_fmt = '%-20s %5s %5s %5s %5s'
    row_fmt = '%-20s %5d %5d %5d %5d'
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        lines.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        lines.append(
            header_fmt % ('Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if broken or idle:
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        lines.extend([row_fmt % row for row in rows])
    return '\n'.join(lines)
851
852
# Preamble text that `_generate_idle_inventory_message()` places at the
# start of the "idle inventory" message.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
859
860
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The message starts with `_IDLE_INVENTORY_HEADER`, followed by a
    table with one row (hostname, model, pool) for every DUT returned
    by `get_idle_list()` for each model in each of `MANAGED_POOLS`.
    If no idle DUTs are found, the table is replaced by a note saying
    so.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             row_fmt % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        lines.append('(No idle DUTs)')
    else:
        lines.extend([row_fmt % row for row in idle_rows])
    return '\n'.join(lines)
889
890
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures are reported through `logging.error()` and are not
    propagated to the caller:  a failure to write the log file doesn't
    prevent the attempt to send the e-mail, and a failure to send
    doesn't raise.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        print(report_body)
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            # The `with` block guarantees the file is closed even if
            # the write fails part way through.
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            # Deliberately broad:  a mail failure must not abort the
            # remaining inventory reports.
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)
928
929
def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Calls `get_broken()` on every value of `inventory`, writing one
    progress character to sys.stdout before each call:  a '+' for
    items 5, 15, 25, ..., the tens digit for items 10, 20, 30, ...,
    and a '.' for everything else.  When done, writes a newline and
    the total number of broken DUTs found.

    Per the original author's notes, the `get_broken()` call is what
    forces the (expensive) status queries to happen; this function is
    meant for `--debug` runs, where a human is watching the progress
    in real time.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for position, counts in enumerate(inventory.itervalues(), 1):
        ones_digit = position % 10
        if ones_digit == 5:
            marker = '+'
        elif ones_digit == 0:
            marker = '%d' % ((position // 10) % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)
962
963
def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    Builds the report body and hands it to `_send_email()`, addressed
    to `arguments.model_notify`.  The body consists of:
      * The repair recommendations, followed by blank lines.  This
        part is optional, and only appears if `arguments.recommend`
        is set (the `--recommend` option).
      * The model inventory message.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
                inventory, arguments.recommend)
        sections.append(recommendation + '\n\n\n')
    sections.append(_generate_model_inventory_message(inventory))
    log_name = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, log_name, subject,
                arguments.model_notify, ''.join(sections))
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700991
992
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    Builds the report body and hands it to `_send_email()`, addressed
    to `arguments.pool_notify`.  The body is the pool inventory
    message followed by the idle inventory message, separated by
    blank lines.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    parts = [_generate_pool_inventory_message(inventory),
             _generate_idle_inventory_message(inventory)]
    log_name = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, log_name, subject,
                arguments.pool_notify, '\n\n\n'.join(parts))
1015
1016
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    The scan walks the tasks in `history` and stops with a false
    result at the first test job, the first task diagnosed as
    `BROKEN`, or the first non-repair task diagnosed as `WORKING`.
    It stops with a true result once `_REPAIR_LOOP_THRESHOLD` repair
    tasks have been seen without hitting any of those conditions.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns  Return a true value if the DUT's most recent history
              indicates a repair loop.  Returns `None` if the history
              runs out before a decision is reached.
    """
    # Cheap early exit.  Per the original author's notes:  the caller
    # only passes working DUTs, so the diagnosis task has already been
    # fetched and is known to have succeeded.  If that task isn't a
    # repair, the history contains a successful non-repair task, and
    # the DUT isn't looping.  Iterating over the history below is very
    # expensive because it fetches the full history, so this check
    # matters a great deal for the cost of scanning the inventory.
    diagnosis_task = _get_diagnosis_safely(history, 'task')
    if diagnosis_task.name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        diagnosis = task.diagnosis
        if diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        is_repair = task.name == 'Repair'
        if diagnosis == status_history.WORKING and not is_repair:
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if not is_repair:
            continue
        successful_repairs += 1
        if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
            return True
    # Ran out of history without reaching the threshold.
    return None
1061
1062
def _report_untestable_dut(history, state):
    """Log and report one DUT that is unable to run tests.

    Writes an info-level log message describing the DUT, and sets
    `_UNTESTABLE_PRESENCE_METRIC` to True with the DUT's hostname,
    model, pool and `state` as the metric's fields.

    @param history  History object for the DUT; its `hostname`,
                    `host_model` and `host_pool` attributes are
                    reported.
    @param state    String naming the untestable state the DUT is in.
    """
    hostname = history.hostname
    model = history.host_model
    pool = history.host_pool
    fields = {
        'dut_hostname': hostname,
        'model': model,
        'pool': pool,
        'state': state,
    }
    log_format = ('DUT in state %(state)s: %(dut_hostname)s, '
                  'model: %(model)s, pool: %(pool)s')
    logging.info(log_format, fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001073
Richard Barnettecf5d8342017-10-24 18:13:11 -07001074
def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is working, but `_dut_in_repair_loop()` says it is
        stuck in a repair loop.  Reported as state 'repair_loop'.
      * The DUT is idle, but is not locked.  Reported as state
        'idle_unlocked'.

    Every DUT found in either state is passed to
    `_report_untestable_dut()`.  DUTs whose hostname doesn't match
    `_HOSTNAME_PATTERN` are skipped entirely.

    Note (from the original author):  To make sure that DUTs aren't
    flagged as "idle" merely because there's no work, a separate job
    runs prior to regular inventory runs which schedules trivial work
    on any DUT that appears idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match
        # _HOSTNAME_PATTERN shouldn't be possible.  However, we
        # don't want arbitrary strings being attached to the
        # 'dut_hostname' field, so for safety, we exclude all
        # anomalies.
        if _HOSTNAME_PATTERN.match(history.hostname):
            untestable_state = None
            if _host_is_working(history):
                if _dut_in_repair_loop(history):
                    untestable_state = 'repair_loop'
            elif _host_is_idle(history):
                if not history.host.locked:
                    untestable_state = 'idle_unlocked'
            if untestable_state is not None:
                _report_untestable_dut(history, untestable_state)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001110
1111
def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Formats `startup_time` as local time using `_TIMESTAMP_FORMAT`,
    then writes debug log messages announcing the run and naming the
    e-mail reports that the command-line arguments call for.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns  A timestamp string that will be used to identify this run
              in logs and email output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime(_TIMESTAMP_FORMAT, local_start)
    logging.debug('Starting lab inventory for %s', timestamp)
    planned = []
    if arguments.model_notify:
        # Recommendations are only ever part of the model report.
        if arguments.recommend:
            planned.append('Will include repair recommendations')
        planned.append('Will include model inventory')
    if arguments.pool_notify:
        planned.append('Will include pool inventory')
    for note in planned:
        logging.debug(note)
    return timestamp
1134
1135
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours leading up to
    `end_time`, and is restricted by `arguments.modelnames`.  The
    number of hosts and models found is written to the log.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The newly created `_LabInventory` object.
    """
    duration_seconds = arguments.duration * 60 * 60
    start_time = end_time - duration_seconds
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return inventory
1150
1151
def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Creates the inventory as of the current time, then runs, in
    order, each step that the parsed arguments ask for:
      * `arguments.debug`:  pre-populate the model counts with
        progress output.
      * `arguments.model_notify`:  the model inventory report.
      * `arguments.pool_notify`:  the pool inventory report.
      * `arguments.report_untestable`:  the untestable DUT metrics.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    now = time.time()
    run_timestamp = _log_startup(arguments, now)
    lab_inventory = _create_inventory(arguments, now)
    if arguments.debug:
        _populate_model_counts(lab_inventory)
    if arguments.model_notify:
        _perform_model_inventory(arguments, lab_inventory, run_timestamp)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab_inventory, run_timestamp)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab_inventory)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001172
1173
def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Each string in `address_list` is split on commas, and every
    resulting piece has surrounding whitespace stripped.  Empty
    pieces are kept as empty strings.

    @param address_list  A list of strings containing comma
                         separate e-mail addresses.
    @return A list of the individual e-mail addresses.
    """
    return [address.strip()
            for joined in address_list
            for address in joined.split(',')]
1185
1186
def _verify_arguments(arguments):
    """Check the parsed command line for semantic problems.

    The `--model-notify` and `--pool-notify` values, which may arrive
    as several comma-separated option arguments, are each replaced
    with a single flat list of addresses.

    At least one report must be requested unless `--debug` is given.
    With `--debug` and no report requested, both e-mail reports are
    switched on.

    On failure an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; updated in place.
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True
    if arguments.debug:
        # An empty notify list causes a report to be skipped, so give
        # both e-mail reports a non-empty placeholder list.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001223
1224
def _get_default_logdir(script):
    """Compute the default value for the `--logdir` option.

    The result is the `_LOGDIR` entry under the parent of the
    directory that holds the script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
1237
1238
def _parse_command(argv):
    """Parse and validate the command line.

    Builds the argument parser for this command, parses `argv[1:]`,
    and then checks the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or None
            if `_verify_arguments()` rejects the arguments.
    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = parser.add_argument

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    add_option('-d', '--duration', type=int, default=_DEFAULT_DURATION,
               metavar='HOURS', help=duration_help)

    # Each notify option may be repeated; values accumulate in a list.
    add_option('--model-notify', action='append', default=[],
               metavar='ADDRESS',
               help='Generate model inventory message, '
                    'and send it to the given e-mail address(es)')
    add_option('--pool-notify', action='append', default=[],
               metavar='ADDRESS',
               help='Generate pool inventory message, '
                    'and send it to the given address(es)')

    add_option('-r', '--recommend', type=int, default=None,
               help=('Specify how many DUTs should be '
                     'recommended for repair (default: no '
                     'recommendation)'))
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug', action='store_true',
               help='Print e-mail, metrics messages on stdout '
                    'without sending them.')
    # `use_metrics` is on unless `--no-metrics` is given.
    add_option('--no-metrics', action='store_false', dest='use_metrics',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*', metavar='MODEL',
               help='names of models to report on '
                    '(default: all models)')

    arguments = parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None
1287
1288
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, the root logger is set to DEBUG level and
        writes to a file in `arguments.logdir`.  The directory is
        created if it doesn't exist.  The file name is `_LOGFILE`
        followed by the current time formatted with
        `_TIMESTAMP_FORMAT`.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we log to stdout and restrict the output to
        INFO level.

    In both cases, messages are formatted using `_LOG_FORMAT` and
    `time_utils.TIME_FMT`, and any handlers already installed on the
    root logger are removed.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        logfile = sys.stdout
    else:
        root_logger.setLevel(logging.DEBUG)
        # The directory has to exist before we can create a file in it.
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        logname = _LOGFILE + datetime.datetime.today().strftime(
                _TIMESTAMP_FORMAT)
        # Open for writing; the default mode is read-only, which can
        # neither create the file nor accept log output.  Append so
        # that an existing file of the same name isn't truncated.  The
        # file stays open for as long as the handler uses it.
        logfile = open(os.path.join(arguments.logdir, logname), 'a')
    handler = logging.StreamHandler(logfile)
    formatter = logging.Formatter(
            _LOG_FORMAT, time_utils.TIME_FMT)
    handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the list, and
    # removing while iterating the live list would skip handlers.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001333
1334
def main(argv):
    """Standard main routine.

    Parses the command line (exiting with status 1 if it is rejected),
    sets up logging, and runs the requested inventory reports.  Unless
    `--no-metrics` was given, the run is wrapped in ts_mon setup, timed,
    and counted with a success/failure field.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if not arguments.use_metrics:
            _perform_inventory_reports(arguments)
        else:
            metrics_file = None
            if arguments.debug:
                logging.info('Debug mode: Will not report metrics to monarch.')
                metrics_file = '/dev/null'
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False):
                success = False
                try:
                    duration_metric = '%s/duration' % _METRICS_PREFIX
                    with metrics.SecondsTimer(duration_metric):
                        _perform_inventory_reports(arguments)
                    success = True
                finally:
                    # Record the tick and flush whether or not the
                    # reports completed.
                    tick = metrics.Counter('%s/tick' % _METRICS_PREFIX)
                    tick.increment(fields={'success': success})
                    metrics.Flush()
    except KeyboardInterrupt:
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise
J. Richard Barnette96db3492015-03-27 17:23:52 -07001372
1373
def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    @param afe  Passed through to `_LabInventory.create_inventory()`.
    @return The result of `_LabInventory.create_inventory()` for the
            time range ending now and starting 24 hours earlier.
    """
    one_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - one_day, now)
1378
1379
def get_managed_boards(afe):
    """Return `get_boards()` of the inventory from `get_inventory()`.

    @param afe  Passed through to `get_inventory()`.
    @return Whatever the inventory's `get_boards()` method returns.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001382
1383
# Run the reports only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)