blob: 91487449c5c154438f4f752e4e4d4d6392f119dc [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
Richard Barnette5de01eb2017-12-15 09:53:42 -08009model and pool, and determines whether each DUT is working or
J. Richard Barnette96db3492015-03-27 17:23:52 -070010broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
Richard Barnette5de01eb2017-12-15 09:53:42 -080013usage: lab_inventory.py [ options ] [ model ... ]
J. Richard Barnette96db3492015-03-27 17:23:52 -070014
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
Richard Barnette5de01eb2017-12-15 09:53:42 -080020--model-notify <address>[,<address>]
21 Send the "model status" e-mail to all the specified e-mail
J. Richard Barnette96db3492015-03-27 17:23:52 -070022 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
J. Richard Barnette1df6a562015-06-09 10:06:17 -070028--recommend <number>
Richard Barnette5de01eb2017-12-15 09:53:42 -080029 When generating the "model status" e-mail, include a list of
J. Richard Barnette1df6a562015-06-09 10:06:17 -070030 <number> specific DUTs to be recommended for repair.
31
Richard Barnette1ca30e62018-04-09 16:45:58 -070032--report-untestable
33 Scan the inventory for DUTs that can't test because they're stuck in
34 repair loops, or because the scheduler can't give them work.
Richard Barnettecf5d8342017-10-24 18:13:11 -070035
J. Richard Barnette96db3492015-03-27 17:23:52 -070036--logdir <directory>
37 Log progress and actions in a file under this directory. Text
38 of any e-mail sent will also be logged in a timestamped file in
39 this directory.
40
J. Richard Barnette02e82432015-10-13 16:02:47 -070041--debug
Richard Barnettecf5d8342017-10-24 18:13:11 -070042 Suppress all logging, metrics reporting, and sending e-mail.
43 Instead, write the output that would be generated onto stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -070044
Richard Barnette5de01eb2017-12-15 09:53:42 -080045<model> arguments:
46 With no arguments, gathers the status for all models in the lab.
47 With one or more named models on the command line, restricts
48 reporting to just those models.
J. Richard Barnette96db3492015-03-27 17:23:52 -070049"""
50
51
52import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080053import collections
J. Richard Barnette96db3492015-03-27 17:23:52 -070054import logging
55import logging.handlers
56import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070057import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070058import sys
59import time
60
61import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070062from autotest_lib.client.bin import utils
Richard Barnette6f6ce322018-09-07 16:23:20 +000063from autotest_lib.client.common_lib import time_utils
Jacob Kopczynski7c4c9542018-08-13 17:24:41 -070064from autotest_lib.frontend.afe.json_rpc import proxy
Xixuan Wu93e646c2017-12-07 18:36:10 -080065from autotest_lib.server import constants
Richard Barnettecf5d8342017-10-24 18:13:11 -070066from autotest_lib.server import site_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070067from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070068from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070069from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070070from autotest_lib.site_utils import gmail_lib
Richard Barnettecf5d8342017-10-24 18:13:11 -070071from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070072
73
# Pool groupings, taken from the shared server-side constants.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS:
#     Labels that disqualify a DUT from monitoring by this script.
#     A host carrying any one of them is skipped:
#       + 'adb' - We're not ready to monitor Android or Brillo hosts.
#       + 'board:guado_moblab' - These are maintained by a separate
#         process that doesn't use this script.
#       + 'board:veyron_rialto' - Excluded due to crbug.com/854404.

_EXCLUDED_LABELS = {
    'adb',
    'board:guado_moblab',
    'board:veyron_rialto',
}
J. Richard Barnetteb8bc570c2016-03-17 17:03:57 -070087
# _DEFAULT_DURATION:
#     Default for the --duration command line option: how far back
#     in time (in hours) to search job history in order to determine
#     DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used to calculate the default setting of the
#     --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of the file that receives general log information.
# _LOG_FORMAT:
#     Format string applied to log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# _HOSTNAME_PATTERN:
#     Pattern for the location-based host names used in the Chrome OS
#     test labs.  The four groups of a DUT hostname give its location:
#       1. The lab (room) that's physically separated from other labs
#          (i.e. there's a door).
#       2. The row (or aisle) of DUTs within the lab.
#       3. The vertical rack of shelves on the row.
#       4. The specific host on one shelf of the rack.

_HOSTNAME_PATTERN = re.compile(r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#     The number of repeated Repair tasks that must be seen to declare
#     that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4
125
J. Richard Barnette96db3492015-03-27 17:23:52 -0700126
# _METRICS_PREFIX:
#     Common leading path for the names of the metrics defined here.

_METRICS_PREFIX = 'chromeos/autotest/inventory'

_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    _METRICS_PREFIX + '/untestable',
    'DUTs that cannot be scheduled for testing')

# Incremented by `_get_diagnosis()` for each diagnosis lookup; the
# 'presence' field records whether the lookup succeeded.
_MISSING_DUT_METRIC = metrics.Counter(
    _METRICS_PREFIX + '/missing',
    'DUTs which cannot be found by lookup queries'
    ' because they are invalid or deleted')

# _Diagnosis:
#     namedtuple corresponding to the return value from
#     `HostHistory.last_diagnosis()`: a status paired with a task.

_Diagnosis = collections.namedtuple('_Diagnosis', ('status', 'task'))
139
def _get_diagnosis(history):
    """Return the diagnosis of a DUT for the history's time range.

    Wraps `history.last_diagnosis()` in a `_Diagnosis`.  A DUT that
    is diagnosed as broken, but whose diagnosing task ended before
    the start of the history's time range, is reported as unused
    instead:  nothing happened to it during the period of interest.

    If the RPC lookup fails, the failure is logged and a
    `_Diagnosis(None, None)` is returned, so that callers can always
    read `.status` from the result.
    NOTE(review): this assumes that no `status_history` status
    constant is `None`, so that such a DUT is counted as neither
    working, broken, nor idle - confirm.

    Every call increments `_MISSING_DUT_METRIC`; the 'presence'
    field records whether the lookup succeeded.

    @param history  The `HostJobHistory` object of the DUT to be
                    diagnosed.
    @return A `_Diagnosis` tuple for the DUT.
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            return _Diagnosis(status_history.UNUSED, diagnosis.task)
        return diagnosis
    except proxy.JSONRPCException as e:
        logging.warning(e)
        dut_present = False
        # Falling off the end here would return `None`, and every
        # caller would then fail on `.status`.
        return _Diagnosis(None, None)
    finally:
        # Runs on every exit path, including the returns above.
        _MISSING_DUT_METRIC.increment(
            fields={'host': history.hostname, 'presence': dut_present})
Richard Barnette1ca30e62018-04-09 16:45:58 -0700155
def _host_is_working(history):
    """Return whether the DUT for `history` is diagnosed as working.

    @param history  The `HostJobHistory` object of the DUT.
    """
    diagnosis = _get_diagnosis(history)
    return diagnosis.status == status_history.WORKING
Richard Barnettee8eee312018-04-27 13:12:04 -0400158
159
def _host_is_broken(history):
    """Return whether the DUT for `history` is diagnosed as broken.

    @param history  The `HostJobHistory` object of the DUT.
    """
    diagnosis = _get_diagnosis(history)
    return diagnosis.status == status_history.BROKEN
Richard Barnettee8eee312018-04-27 13:12:04 -0400162
163
def _host_is_idle(history):
    """Return whether the DUT for `history` is diagnosed as idle.

    A DUT is idle if its diagnosed status is either unused or
    unknown.

    @param history  The `HostJobHistory` object of the DUT.
    """
    idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
    diagnosis = _get_diagnosis(history)
    return diagnosis.status in idle_statuses
Richard Barnettee8eee312018-04-27 13:12:04 -0400167
168
class _HostSetInventory(object):
    """A collection of related `HostJobHistory` objects.

    As currently used, all the DUTs in one collection belong to a
    single scheduling pool for a single model; however, this class
    makes no assumptions about the actual relationship among the
    DUTs.

    The DUTs are classified as "working", "broken", or "idle".
    Accessor methods provide both the list of DUTs in each category
    and the count of each category.

    Performance note:  The following methods are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first call to any of them triggers multiple RPC calls with a
    relatively expensive set of database queries.  The query results
    are cached in the individual `HostJobHistory` objects, so only
    the first call pays the full cost.

    Separately from that caching of RPC results, the three
    `get_*_list()` methods remember their return values, so that the
    lists aren't recalculated at every call.

    The RPC cost is deliberately deferred until the accessors are
    called (rather than being paid in `record_host()`), so that a
    complete `_LabInventory` can be constructed without making the
    expensive queries at creation time.
    `_populate_model_counts()`, below, assumes this behavior.
    """

    def __init__(self):
        self._histories = []
        self._working_list = None
        self._broken_list = None
        self._idle_list = None

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Any cached category lists are discarded, since the new host
        may belong in one of them.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        self._working_list = self._broken_list = self._idle_list = None
        self._histories.append(host_history)

    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the histories whose DUT is diagnosed as working.
        The result is cached, so it is calculated only once between
        calls to `record_host()`.

        @return A list of HostJobHistory objects.
        """
        if self._working_list is not None:
            return self._working_list
        self._working_list = [history for history in self._histories
                              if _host_is_working(history)]
        return self._working_list

    def get_working(self):
        """Return the number of working DUTs in the pool."""
        working = self.get_working_list()
        return len(working)

    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the histories whose DUT is diagnosed as broken.
        The result is cached, so it is calculated only once between
        calls to `record_host()`.

        @return A list of HostJobHistory objects.
        """
        if self._broken_list is not None:
            return self._broken_list
        self._broken_list = [history for history in self._histories
                             if _host_is_broken(history)]
        return self._broken_list

    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        broken = self.get_broken_list()
        return len(broken)

    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the histories whose DUT is diagnosed as idle.
        The result is cached, so it is calculated only once between
        calls to `record_host()`.

        @return A list of HostJobHistory objects.
        """
        if self._idle_list is not None:
            return self._idle_list
        self._idle_list = [history for history in self._histories
                           if _host_is_idle(history)]
        return self._idle_list

    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        idle = self.get_idle_list()
        return len(idle)

    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)

    def get_all_histories(self):
        """Return the list of all recorded `HostJobHistory` objects.

        This is the collection's own internal list, not a copy.
        """
        return self._histories
286
J. Richard Barnette96db3492015-03-27 17:23:52 -0700287
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty inventory covering the given pools.

        @param pools  Iterable of the names of the pools to be
                      tracked.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The object is filed under its `host_pool`, which must be one
        of the pools named at construction; otherwise the lookup
        raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return the combined list across all
                              pools.
        """
        if pool is not None:
            return get_pool_list(self._histories_by_pool[pool])
        combined = []
        for host_set in self._histories_by_pool.values():
            combined.extend(get_pool_list(host_set))
        return combined

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the selected
        pool(s), selecting all DUTs identified as working.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the selected
        pool(s), selecting all DUTs identified as broken.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the selected
        pool(s), selecting all DUTs identified as idle.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool to be treated as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Iterate over all `HostJobHistory` objects in a pool.

        This is a generator; no lookup happens until iteration
        starts.

        @param pool  The pool to be iterated.  If `None`, iterate
                     over the histories of all pools.
        """
        if pool is None:
            host_sets = self._histories_by_pool.values()
        else:
            host_sets = [self._histories_by_pool[pool]]
        for host_set in host_sets:
            for history in host_set.get_all_histories():
                yield history
441
J. Richard Barnette96db3492015-03-27 17:23:52 -0700442
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must have exactly one 'model' label and
    exactly one 'pool' label, and must carry none of the labels in
    the `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.
    """
    # A DUT lacking a unique 'model' or 'pool' label isn't meant to
    # exist in the managed inventory; finding one generally means
    # there's an error in the database.  Unfortunately, such errors
    # have been seen to occur from time to time.
    #
    # The _LabInventory constructor requires hosts to conform to the
    # label restrictions, and may fail if they don't.  Failing a
    # whole inventory run over a single bad entry is the wrong thing,
    # so the problem children are filtered out here, to keep them out
    # of the inventory.
    labels = afehost.labels
    model_count = sum(
        1 for label in labels
        if label.startswith(constants.Labels.MODEL_PREFIX))
    pool_count = sum(
        1 for label in labels
        if label.startswith(constants.Labels.POOL_PREFIX))
    if model_count != 1 or pool_count != 1:
        return False
    return _EXCLUDED_LABELS.isdisjoint(labels)
468
469
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.
        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(h for h in afehosts
                                  if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)

    def __init__(self, histories, pools):
        """Build the inventory from a list of histories.

        Creates one `_PoolSetInventory` per distinct `host_model`
        found in `histories`, and records each history under its
        model.

        @param histories  List of `HostJobHistory` objects to be
                          included in the inventory.
        @param pools      The pools to be tracked for every model.
        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}

    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)

    def __len__(self):
        return self._modeldata.__len__()

    def __iter__(self):
        return self._modeldata.__iter__()

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)

    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.iteritems() if h.get_total(pool)}

    def get_boards(self):
        """Return the set of `host_board` values in the inventory."""
        return self._boards
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800549
550
def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Generates the `(model, poolset)` pairs of `inventory.iteritems()`
    for reportable models only.  A model is reportable when some,
    but not all, of its DUTs are in `spare_pool`; that is, when it
    has DUTs both in `spare_pool` and in at least one other pool.

    @param inventory   The inventory whose models are to be filtered.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.iteritems():
        spares = poolset.get_total(spare_pool)
        total = poolset.get_total()
        if spares == 0 or spares == total:
            continue
        yield model, poolset
565
566
def _all_dut_histories(inventory):
    """Iterate over every DUT history in an inventory.

    Generates, in turn, each item of `get_all_histories()` for each
    of the values of `inventory`.

    @param inventory  The inventory to be traversed.
    """
    poolsets = inventory.itervalues()
    for poolset in poolsets:
        histories = poolset.get_all_histories()
        for history in histories:
            yield history
571
572
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.  The per-lab lists themselves
    are returned in no particular order.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are left out of the result.

    Implementation note: locations are compared as tuples of
    integers `(row, rack, host)`, so the sort is correct for numbers
    of any size.

    @param inventory_list  List of `HostJobHistory` objects to be
                           sorted.
    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not location:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key alone; the history objects
        # themselves are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list
607
608
def _score_repair_set(buffer_counts, repair_list):
    """Rate a proposed set of DUT repairs with a single number.

    The score reflects the buffer counts that would result if every
    DUT in `repair_list` were repaired.  Two quantities go into it:
      * The worst (smallest) resulting buffer count for any model;
        higher is better.  This is the more significant component.
      * The number of models sitting at that worst count; lower is
        better.  This is the less significant component.

    Implementation note:  the two components are combined as
    `1000 * worst - number_at_worst`, so the score could fail to
    reflect the intended criteria if there are more than 1000 models
    in the inventory.

    @param buffer_counts  A dictionary mapping model names to the size
                          of the model's spares buffer.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    _NMODELS = 1000
    repaired = _LabInventory(
            repair_list, {dut.host_pool for dut in repair_list})
    # Only the counts matter for scoring; model names are dropped.
    counts_after_repair = []
    for model, spares in buffer_counts.iteritems():
        gained = repaired[model].get_total() if model in repaired else 0
        counts_after_repair.append(spares + gained)
    # Single pass to find the smallest count and how often it occurs.
    worst_count = counts_after_repair[0]
    num_worst = 0
    for count in counts_after_repair:
        if count < worst_count:
            worst_count, num_worst = count, 1
        elif count == worst_count:
            num_worst += 1
    return _NMODELS * worst_count - num_worst
J. Richard Barnettef6839282015-06-01 16:00:35 -0700660
661
def _generate_repair_recommendation(inventory, num_recommend):
    """Build the text recommending specific broken DUTs for repair.

    The DUTs to list are chosen as follows:
      * Only broken DUTs of reportable models are candidates.
      * Candidates are grouped by lab and sorted by location, so that
        a recommendation is always a run of neighboring DUTs from a
        single lab.
      * Every window of `num_recommend` consecutive DUTs in each lab
        is scored with `_score_repair_set()`; the best scoring window
        across all labs is the recommendation.  On equal scores the
        earlier window is kept.

    Because of how windows are scored, repairs that help models with
    a low spares buffer are preferred over repairs for models with a
    larger buffer.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() == 0:
            continue
        model_buffer_counts[model] = counts.get_spares_buffer()
        broken_list.extend(counts.get_broken_list())
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # The first window is scored unconditionally; it's the only
        # window when the lab has no more than `num_recommend` DUTs.
        lab_slice = lab_duts[:num_recommend]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide the window one DUT at a time through the rest of the lab.
        for start in range(1, len(lab_duts) - num_recommend + 1):
            candidate = lab_duts[start:start + num_recommend]
            candidate_score = _score_repair_set(
                    model_buffer_counts, candidate)
            if candidate_score > lab_score:
                lab_slice, lab_score = candidate, candidate_score
        if recommendation is None or lab_score > best_score:
            recommendation, best_score = lab_slice, lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for dut in recommendation:
        servo_name = servo_host.make_servo_hostname(dut.host.hostname)
        has_servo = utils.host_is_in_lab_zone(servo_name)
        event = _get_diagnosis(dut).task
        servo_text = 'Yes' if has_servo else 'No'
        message.append(line_fmt % (
                dut.host.hostname, dut.host_model,
                servo_text, event.job_url))
    return '\n'.join(message)
735
736
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Only reportable models (see `_reportable_models()`) contribute to
    the totals, and only models with at least one broken DUT get a
    line in the per-model table.

    If no DUTs are counted at all (for instance, because no model is
    reportable), all percentages are reported as 0 rather than
    failing with a division by zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail  Bad   Idle  Good  Spare Total
        #     e[0]   e[1]   e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; among equals, most broken first.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Computed by subtraction so the three values add up to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing was counted; avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
        ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
805
806
# Fixed preamble used as the first element of the "pool inventory"
# e-mail message built by `_generate_pool_inventory_message()`.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
815
816
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    For every pool in `CRITICAL_POOLS`, the message has a section
    listing by model the number of broken, idle, and working DUTs in
    that pool, plus the pool's total for the model.  A model appears
    in a section only if it has at least one broken or idle DUT in
    that pool; rows are ordered with the most broken DUTs first.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section but the first is set off by an extra newline.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            if broken or idle:
                # Models at full strength are not reported.
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            message.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        for row in rows:
            message.append('%-20s %5d %5d %5d %5d' % row)
    return '\n'.join(message)
859
860
# Fixed preamble used as the first element of the "idle inventory"
# e-mail message built by `_generate_idle_inventory_message()`.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
867
868
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists, for every pool in `MANAGED_POOLS`, the
    hostname of each idle DUT along with its model and pool.  Hosts
    are listed pool by pool, in the order of `MANAGED_POOLS`.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    message = [_IDLE_INVENTORY_HEADER,
               'Idle Host List:',
               row_fmt % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append(
                    row_fmt % (dut.host.hostname, model, pool))
    if idle_rows:
        message.extend(idle_rows)
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)
897
898
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures are reported with `logging.error()` and otherwise
    ignored:  a failure to write the log file doesn't prevent the
    attempt to send the e-mail.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
        'To: %s' % all_recipients,
        'Subject: %s' % subject,
        '', body, ''])
    if arguments.debug:
        # A single parenthesized operand prints identically whether
        # `print` is a statement or a function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` guarantees the file is closed even if write() fails.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Deliberately broad:  sending is best-effort, and any failure
        # is logged rather than propagated.
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
936
937
def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time.  Normally, that cost is paid by querying as we
    go.  With the `--debug` option, though, a human being is expected
    to be watching in real time, so the first (expensive) queries are
    forced to happen up front, with a simple ASCII progress bar
    written to sys.stdout.

    One character is written per model:  '+' for the 5th, 15th, ...
    model; the tens digit of the running count for the 10th, 20th, ...
    model; and '.' otherwise.  A final line gives the total number of
    broken DUTs found.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for position, counts in enumerate(inventory.itervalues(), 1):
        remainder = position % 10
        if remainder == 5:
            mark = '+'
        elif remainder == 0:
            mark = '%d' % ((position // 10) % 10)
        else:
            mark = '.'
        sys.stdout.write(mark)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of
        # a model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)
970
971
def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The e-mail body is made up of the following, in order:
      * A list of DUTs that are recommended to be repaired.  This
        part is optional, and only appears if the `--recommend`
        option is present.
      * The model inventory message, summarizing counts of working,
        broken, and spare DUTs, among others.

    The result goes to the `arguments.model_notify` recipients by way
    of `_send_email()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommend_message = '%s\n\n\n' % _generate_repair_recommendation(
                inventory, arguments.recommend)
    model_message = _generate_model_inventory_message(inventory)
    full_body = recommend_message + model_message
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                full_body)
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700999
1000
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The e-mail body is made up of the following, in order:
      * The pool inventory message, with counts of working, broken,
        and idle DUTs for the critical pools.
      * The idle inventory message, listing idle DUTs by hostname
        including the model and pool.

    The result goes to the `arguments.pool_notify` recipients by way
    of `_send_email()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [_generate_pool_inventory_message(inventory),
                _generate_idle_inventory_message(inventory)]
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                '\n\n\n'.join(sections))
1023
1024
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.  Specifically, the history is scanned
    task by task, and the DUT is declared looping once
    `_REPAIR_LOOP_THRESHOLD` repair tasks have been seen without
    first encountering a test, a task diagnosed as broken, or a
    successful non-repair task.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns A true value if the DUT's most recent history indicates a
             repair loop; a false value (`False` or `None`) otherwise.
    """
    # The caller only passes histories for working DUTs, so the
    # diagnosis task has already been fetched and is known to have
    # succeeded.  That task is among those the scan below would
    # examine; if it isn't a repair, the history contains a
    # successful non-repair task, and the DUT can't be looping.
    #
    # Checking this first matters for cost:  iterating over `history`
    # must fetch the full history no matter how few tasks end up
    # being examined, which is very expensive.
    if _get_diagnosis(history).task.name != 'Repair':
        return False
    repairs_seen = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        outcome = task.diagnosis
        if outcome == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        is_repair = task.name == 'Repair'
        if outcome == status_history.WORKING and not is_repair:
            # Non-repair task succeeded, so we're not looping.
            return False
        # What's left is either a failed non-repair task, or a
        # successful repair; only the latter counts toward a loop.
        if is_repair:
            repairs_seen += 1
            if repairs_seen >= _REPAIR_LOOP_THRESHOLD:
                return True
    # History ran out before the threshold was reached.
    return None
1069
1070
def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    The DUT's hostname, model and pool are taken from `history`, and
    together with `state` make up the fields that are both logged and
    passed to `_UNTESTABLE_PRESENCE_METRIC.set()`.

    @param history  Object providing `hostname`, `host_model` and
                    `host_pool` attributes for the DUT.
    @param state    Value recorded in the 'state' field.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    logging.info('DUT in state %(state)s: %(dut_hostname)s, '
                 'model: %(model)s, pool: %(pool)s', fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001081
Richard Barnettecf5d8342017-10-24 18:13:11 -07001082
def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    A DUT is judged "untestable" if it meets one of two criteria:
      * The DUT is working, but stuck in a repair loop; it's reported
        with state 'repair_loop'.
      * The DUT is idle, but is not locked; it's reported with state
        'idle_unlocked'.

    Every DUT history in the inventory is checked, and each DUT found
    in either state is handed to `_report_untestable_dut()`.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match _HOSTNAME_PATTERN
        # shouldn't be possible.  Still, arbitrary strings mustn't end
        # up in the 'dut_hostname' field, so anomalies are excluded.
        if not _HOSTNAME_PATTERN.match(history.hostname):
            continue
        if _host_is_working(history):
            state = 'repair_loop' if _dut_in_repair_loop(history) else None
        elif _host_is_idle(history) and not history.host.locked:
            state = 'idle_unlocked'
        else:
            state = None
        if state is not None:
            _report_untestable_dut(history, state)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001118
1119
def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emits debug log messages announcing the run and which reports the
    command-line options call for.  Returns a string derived from
    `startup_time` (local time, formatted as year-month-day.hour) that
    will be used to identify this run in log files and e-mail
    messages.

    @param arguments     Parsed command-line options; `model_notify`,
                         `recommend` and `pool_notify` are consulted.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string that will be used to identify this run
             in logs and email output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', timestamp)
    if arguments.model_notify:
        # Recommendations are only ever part of the model report.
        if arguments.recommend:
            logging.debug('Will include repair recommendations')
        logging.debug('Will include model inventory')
    if arguments.pool_notify:
        logging.debug('Will include pool inventory')
    return timestamp
1142
1143
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The time range searched ends at `end_time` and reaches back
    `arguments.duration` hours from there.  The inventory is limited
    to the models given in `arguments.modelnames`.

    @param arguments  Parsed command-line options; `duration` and
                      `modelnames` are consulted.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The newly created `_LabInventory` object.
    """
    seconds_back = arguments.duration * 60 * 60
    start_time = end_time - seconds_back
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return inventory
1158
1159
def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Creates the initial inventory, and then runs through the
    inventory reports as called for by the parsed command-line
    arguments:  first the model inventory, then the pool inventory,
    and last the scan for untestable DUTs.  With `--debug`, model
    counts are gathered up front, before any report is generated.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    started = time.time()
    timestamp = _log_startup(arguments, started)
    inventory = _create_inventory(arguments, started)
    if arguments.debug:
        _populate_model_counts(inventory)
    # The two e-mail reports share a calling convention; run whichever
    # of them has recipients, in this order.
    email_reports = (
        (arguments.model_notify, _perform_model_inventory),
        (arguments.pool_notify, _perform_pool_inventory),
    )
    for recipients, perform_report in email_reports:
        if recipients:
            perform_report(arguments, inventory, timestamp)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(inventory)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001180
1181
def _separate_email_addresses(address_list):
    """Flatten a list of comma-separated lists of e-mail addresses.

    Every string in `address_list` is split at its commas, and each
    resulting piece is stripped of surrounding whitespace.  Pieces
    are kept in their original order; empty pieces are not removed.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.
    """
    return [piece.strip()
            for group in address_list
            for piece in group.split(',')]
1193
1194
def _verify_arguments(arguments):
    """Validate command-line arguments.

    The `--model-notify` and `--pool-notify` options may each be given
    several times, and each occurrence may hold several comma
    separated addresses; both are flattened in place into a single
    list of addresses.

    Outside of debug mode, at least one report must have been
    requested.  In debug mode, requesting no report means "run all the
    e-mail reports".

    On failure, an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True

    if arguments.debug:
        # Run all the e-mail reports.  A report is skipped when its
        # notify list is empty, so give each list a placeholder entry.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001231
1232
def _get_default_logdir(script):
    """Compute the default value for the `--logdir` option.

    The result is `_LOGDIR` joined onto the parent of the directory
    that holds this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    return os.path.join(os.path.dirname(script_dir), _LOGDIR)
1245
1246
def _parse_command(argv):
    """Parse the command line arguments.

    Builds the argument parser for this command, parses `argv[1:]`
    with it, and then validates the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or None
            if the arguments fail validation.
    """
    cmd_parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = cmd_parser.add_argument

    duration_help = ('number of hours back to search for status '
                     '(default: %d)' % _DEFAULT_DURATION)
    add_option('-d', '--duration', type=int, default=_DEFAULT_DURATION,
               metavar='HOURS', help=duration_help)

    # The notify options may be repeated; every occurrence is
    # appended to the option's list.
    add_option('--model-notify', action='append', default=[],
               metavar='ADDRESS',
               help='Generate model inventory message, and send it '
                    'to the given e-mail address(es)')
    add_option('--pool-notify', action='append', default=[],
               metavar='ADDRESS',
               help='Generate pool inventory message, and send it '
                    'to the given address(es)')

    add_option('-r', '--recommend', type=int, default=None,
               help='Specify how many DUTs should be recommended '
                    'for repair (default: no recommendation)')
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug', action='store_true',
               help='Print e-mail, metrics messages on stdout without '
                    'sending them.')
    # `use_metrics` is true unless --no-metrics is given.
    add_option('--no-metrics', action='store_false', dest='use_metrics',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*', metavar='MODEL',
               help='names of models to report on (default: all models)')

    arguments = cmd_parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None
1295
1296
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events (DEBUG level) in a log file under
        `arguments.logdir`, creating that directory if necessary.  The
        log file is configured to rotate once a week (`when='W4'`),
        keeping 13 backups, i.e. ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    Every handler already attached to the root logger is removed, so
    that on return the handler installed here is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing from a list while iterating
    # over it skips every other entry, leaving stray handlers behind.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001340
1341
def main(argv):
    """Standard main routine.

    Parses the command line, exiting with status 1 if the arguments
    are rejected, then sets up logging and runs the requested
    inventory reports.  Unless `--no-metrics` was given, the reports
    run inside a ts_mon session that records a duration timer and a
    success/failure tick counter.

    A `KeyboardInterrupt` ends the run quietly.  Any other exception
    is logged and then re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if not arguments.use_metrics:
            _perform_inventory_reports(arguments)
            return

        metrics_file = None
        if arguments.debug:
            logging.info('Debug mode: Will not report metrics to monarch.')
            metrics_file = '/dev/null'

        with site_utils.SetupTsMonGlobalState(
                'lab_inventory', debug_file=metrics_file,
                auto_flush=False):
            success = False
            try:
                with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                    _perform_inventory_reports(arguments)
                success = True
            finally:
                # Record the tick and flush whether or not the
                # reports completed.
                tick = metrics.Counter('%s/tick' % _METRICS_PREFIX)
                tick.increment(fields={'success': success})
                metrics.Flush()
    except KeyboardInterrupt:
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise
J. Richard Barnette96db3492015-03-27 17:23:52 -07001379
1380
def get_inventory(afe):
    """Create an inventory for the 24 hours ending now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by `_LabInventory.create_inventory()`.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - seconds_per_day, now)
1385
1386
def get_managed_boards(afe):
    """Return the boards reported by a freshly created inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return The result of `get_boards()` on the new inventory.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001389
1390
# Run the command only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)