blob: 31a29a50fe22f66b1370d024a58a4d2159211b88 [file] [log] [blame]
Mike Frysingerd03e6b52019-08-03 12:49:01 -04001#!/usr/bin/env python2
J. Richard Barnette96db3492015-03-27 17:23:52 -07002# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
Gathers a list of all DUTs of interest in the Lab, segregated by
model and pool, and determines whether each DUT is working or
broken.  Then sends one or more e-mail reports summarizing the
status to the e-mail addresses provided on the command line.
12
Richard Barnette5de01eb2017-12-15 09:53:42 -080013usage: lab_inventory.py [ options ] [ model ... ]
J. Richard Barnette96db3492015-03-27 17:23:52 -070014
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
Richard Barnette5de01eb2017-12-15 09:53:42 -080020--model-notify <address>[,<address>]
21 Send the "model status" e-mail to all the specified e-mail
J. Richard Barnette96db3492015-03-27 17:23:52 -070022 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
J. Richard Barnette1df6a562015-06-09 10:06:17 -070028--recommend <number>
Richard Barnette5de01eb2017-12-15 09:53:42 -080029 When generating the "model status" e-mail, include a list of
J. Richard Barnette1df6a562015-06-09 10:06:17 -070030 <number> specific DUTs to be recommended for repair.
31
Richard Barnette1ca30e62018-04-09 16:45:58 -070032--report-untestable
33 Scan the inventory for DUTs that can't test because they're stuck in
34 repair loops, or because the scheduler can't give them work.
Richard Barnettecf5d8342017-10-24 18:13:11 -070035
J. Richard Barnette96db3492015-03-27 17:23:52 -070036--logdir <directory>
37 Log progress and actions in a file under this directory. Text
38 of any e-mail sent will also be logged in a timestamped file in
39 this directory.
40
J. Richard Barnette02e82432015-10-13 16:02:47 -070041--debug
Richard Barnettecf5d8342017-10-24 18:13:11 -070042 Suppress all logging, metrics reporting, and sending e-mail.
43 Instead, write the output that would be generated onto stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -070044
Richard Barnette5de01eb2017-12-15 09:53:42 -080045<model> arguments:
46 With no arguments, gathers the status for all models in the lab.
47 With one or more named models on the command line, restricts
48 reporting to just those models.
J. Richard Barnette96db3492015-03-27 17:23:52 -070049"""
50
51
52import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080053import collections
J. Richard Barnette96db3492015-03-27 17:23:52 -070054import logging
55import logging.handlers
56import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070057import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070058import sys
59import time
60
61import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070062from autotest_lib.client.bin import utils
Richard Barnette6f6ce322018-09-07 16:23:20 +000063from autotest_lib.client.common_lib import time_utils
Jacob Kopczynski7c4c9542018-08-13 17:24:41 -070064from autotest_lib.frontend.afe.json_rpc import proxy
Xixuan Wu93e646c2017-12-07 18:36:10 -080065from autotest_lib.server import constants
Richard Barnettecf5d8342017-10-24 18:13:11 -070066from autotest_lib.server import site_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070067from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070068from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070069from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070070from autotest_lib.site_utils import gmail_lib
Richard Barnettecf5d8342017-10-24 18:13:11 -070071from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070072
73
# Pool names used throughout this script, taken from the shared
# `constants.Pools` definitions.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.
#   + 'board:veyron_rialto' due to crbug.com/854404

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',
                    'board:veyron_rialto'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time, in hours, to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack, host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#     The number of repeated Repair tasks that must be seen to declare
#     that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4


# Metrics reported by this script; all names share `_METRICS_PREFIX`.
_METRICS_PREFIX = 'chromeos/autotest/inventory'
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    _METRICS_PREFIX + '/untestable',
    'DUTs that cannot be scheduled for testing')

_MISSING_DUT_METRIC = metrics.Counter(
    _METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'
    ' because they are invalid or deleted')

# _Diagnosis - namedtuple corresponding to the return value from
#     `HostHistory.last_diagnosis()`
_Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])
139
def _get_diagnosis(history):
    """Return the effective diagnosis for one DUT.

    Wraps `history.last_diagnosis()`, with two adjustments:
      * A DUT diagnosed as broken whose diagnosing task ended before
        the start of the history's time range is reported with a
        status of `status_history.UNUSED` instead.
      * If the lookup raises `proxy.JSONRPCException`, the error is
        logged as a warning and an empty diagnosis is returned.

    Every call increments `_MISSING_DUT_METRIC` once, tagged with the
    hostname and with whether the lookup succeeded (`presence`).

    @param history  The `HostJobHistory` object to be diagnosed.
    @return A `_Diagnosis` tuple of `(status, task)`; both fields are
            `None` when the lookup failed.
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        # The comparison stays inside the `try` so that any RPC failure
        # it might trigger is handled the same way as a failed lookup.
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            diagnosis = _Diagnosis(status_history.UNUSED, diagnosis.task)
    except proxy.JSONRPCException as e:
        # `logging.warn` is a deprecated alias; use the documented name.
        logging.warning(e)
        dut_present = False
        diagnosis = _Diagnosis(None, None)
    finally:
        _MISSING_DUT_METRIC.increment(
            fields={'host': history.hostname, 'presence': dut_present})
    return diagnosis
156
Richard Barnette1ca30e62018-04-09 16:45:58 -0700157
def _host_is_working(history):
    """Return whether the DUT for `history` is diagnosed as working.

    @param history  The `HostJobHistory` object to be tested.
    """
    current_status = _get_diagnosis(history).status
    return current_status == status_history.WORKING
Richard Barnettee8eee312018-04-27 13:12:04 -0400160
161
def _host_is_broken(history):
    """Return whether the DUT for `history` is diagnosed as broken.

    @param history  The `HostJobHistory` object to be tested.
    """
    current_status = _get_diagnosis(history).status
    return current_status == status_history.BROKEN
Richard Barnettee8eee312018-04-27 13:12:04 -0400164
165
def _host_is_idle(history):
    """Return whether the DUT for `history` is diagnosed as idle.

    A DUT counts as idle when its diagnosed status is either
    `status_history.UNUSED` or `status_history.UNKNOWN`.

    @param history  The `HostJobHistory` object to be tested.
    """
    current_status = _get_diagnosis(history).status
    return current_status in (status_history.UNUSED,
                              status_history.UNKNOWN)
Richard Barnettee8eee312018-04-27 13:12:04 -0400169
170
Richard Barnette5de01eb2017-12-15 09:53:42 -0800171class _HostSetInventory(object):
172 """Maintains a set of related `HostJobHistory` objects.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700173
Richard Barnettee8eee312018-04-27 13:12:04 -0400174 Current usage of this class is that all DUTs are part of a single
175 scheduling pool of DUTs for a single model; however, this class make
176 no assumptions about the actual relationship among the DUTs.
177
Richard Barnette5de01eb2017-12-15 09:53:42 -0800178 The collection is segregated into disjoint categories of "working",
179 "broken", and "idle" DUTs. Accessor methods allow finding both the
180 list of DUTs in each category, as well as counts of each category.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700181
J. Richard Barnettef6839282015-06-01 16:00:35 -0700182 Performance note: Certain methods in this class are potentially
183 expensive:
184 * `get_working()`
185 * `get_working_list()`
186 * `get_broken()`
187 * `get_broken_list()`
xixuan12ce04f2016-03-10 13:16:30 -0800188 * `get_idle()`
189 * `get_idle_list()`
J. Richard Barnettef6839282015-06-01 16:00:35 -0700190 The first time any one of these methods is called, it causes
191 multiple RPC calls with a relatively expensive set of database
192 queries. However, the results of the queries are cached in the
193 individual `HostJobHistory` objects, so only the first call
194 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700195
xixuan12ce04f2016-03-10 13:16:30 -0800196 Additionally, `get_working_list()`, `get_broken_list()` and
197 `get_idle_list()` cache their return values to avoid recalculating
Richard Barnette5de01eb2017-12-15 09:53:42 -0800198 lists at every call; this caching is separate from the caching of
199 RPC results described above.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700200
Richard Barnette5de01eb2017-12-15 09:53:42 -0800201 This class is deliberately constructed to delay the RPC cost until
202 the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700203 `record_host()`) so that it's possible to construct a complete
204 `_LabInventory` without making the expensive queries at creation
Richard Barnette5de01eb2017-12-15 09:53:42 -0800205 time. `_populate_model_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700206 """
207
208 def __init__(self):
209 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700210 self._working_list = None
211 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800212 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700213
J. Richard Barnette96db3492015-03-27 17:23:52 -0700214 def record_host(self, host_history):
215 """Add one `HostJobHistory` object to the collection.
216
217 @param host_history The `HostJobHistory` object to be
218 remembered.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700219 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700220 self._working_list = None
221 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800222 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700223 self._histories.append(host_history)
224
J. Richard Barnettef6839282015-06-01 16:00:35 -0700225 def get_working_list(self):
226 """Return a list of all working DUTs in the pool.
227
Richard Barnettee8eee312018-04-27 13:12:04 -0400228 Filter `self._histories` for histories where the DUT is
229 diagnosed as working.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700230
231 Cache the result so that we only cacluate it once.
232
233 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700234 """
235 if self._working_list is None:
236 self._working_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400237 if _host_is_working(h)]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700238 return self._working_list
239
J. Richard Barnette96db3492015-03-27 17:23:52 -0700240 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700241 """Return the number of working DUTs in the pool."""
242 return len(self.get_working_list())
243
J. Richard Barnettef6839282015-06-01 16:00:35 -0700244 def get_broken_list(self):
245 """Return a list of all broken DUTs in the pool.
246
Richard Barnettee8eee312018-04-27 13:12:04 -0400247 Filter `self._histories` for histories where the DUT is
248 diagnosed as broken.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700249
250 Cache the result so that we only cacluate it once.
251
252 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700253 """
254 if self._broken_list is None:
255 self._broken_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400256 if _host_is_broken(h)]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700257 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700258
J. Richard Barnette96db3492015-03-27 17:23:52 -0700259 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700260 """Return the number of broken DUTs in the pool."""
261 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700262
xixuan12ce04f2016-03-10 13:16:30 -0800263 def get_idle_list(self):
264 """Return a list of all idle DUTs in the pool.
265
Richard Barnettee8eee312018-04-27 13:12:04 -0400266 Filter `self._histories` for histories where the DUT is
267 diagnosed as idle.
xixuan12ce04f2016-03-10 13:16:30 -0800268
269 Cache the result so that we only cacluate it once.
270
271 @return A list of HostJobHistory objects.
xixuan12ce04f2016-03-10 13:16:30 -0800272 """
xixuan12ce04f2016-03-10 13:16:30 -0800273 if self._idle_list is None:
274 self._idle_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400275 if _host_is_idle(h)]
xixuan12ce04f2016-03-10 13:16:30 -0800276 return self._idle_list
277
xixuan12ce04f2016-03-10 13:16:30 -0800278 def get_idle(self):
279 """Return the number of idle DUTs in the pool."""
280 return len(self.get_idle_list())
281
J. Richard Barnette96db3492015-03-27 17:23:52 -0700282 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700283 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700284 return len(self._histories)
285
Richard Barnettee8eee312018-04-27 13:12:04 -0400286 def get_all_histories(self):
287 return self._histories
288
J. Richard Barnette96db3492015-03-27 17:23:52 -0700289
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty inventory covering the given pools.

        @param pools  Iterable of the pool names to be tracked.  One
                      empty `_HostSetInventory` is created per pool.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a pool that wasn't
        passed to the constructor raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is not None:
            return get_pool_count(self._histories_by_pool[pool])
        return sum(get_pool_count(host_set)
                   for host_set in self._histories_by_pool.values())

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of
                              `HostJobHistory` objects from a
                              `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list combining all pools.
                              Otherwise the selected pool's own list
                              is returned as-is (not copied).
        """
        if pool is not None:
            return get_pool_list(self._histories_by_pool[pool])
        combined = []
        for host_set in self._histories_by_pool.values():
            combined.extend(get_pool_list(host_set))
        return combined

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as working.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as broken.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool holding the spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Iterate over every `HostJobHistory` in the selected pool(s).

        This is a generator: nothing is looked up (and no `KeyError`
        for an unknown pool is raised) until iteration starts.

        @param pool  The pool to be iterated.  If `None`, iterate over
                     the histories of all pools.
        """
        if pool is None:
            host_sets = self._histories_by_pool.values()
        else:
            host_sets = [self._histories_by_pool[pool]]
        for host_set in host_sets:
            for h in host_set.get_all_histories():
                yield h
443
J. Richard Barnette96db3492015-03-27 17:23:52 -0700444
Prathmesh Prabhu2637cd42018-11-08 08:18:21 -0800445def _is_migrated_to_skylab(afehost):
446 """Return True if the provided frontend.Host has been migrated to skylab."""
447 return afehost.hostname.endswith('-migrated-do-not-use')
448
449
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    A host is eligible when it has not been migrated to skylab, has
    exactly one 'model' label and exactly one 'pool' label, and
    carries none of the labels in `_EXCLUDED_LABELS`.

    @param afehost  The host to be tested for eligibility.
    """
    if _is_migrated_to_skylab(afehost):
        return False

    # DUTs without an existing, unique 'model' or 'pool' label aren't
    # meant to exist in the managed inventory; their presence generally
    # indicates an error in the database.  The _LabInventory constructor
    # requires hosts to conform to the label restrictions.  Failing an
    # inventory run for a single bad entry is wrong, so such hosts are
    # simply ignored.
    host_labels = afehost.labels

    def _count_with_prefix(prefix):
        return len([label for label in host_labels
                    if label.startswith(prefix)])

    model_count = _count_with_prefix(constants.Labels.MODEL_PREFIX)
    pool_count = _count_with_prefix(constants.Labels.POOL_PREFIX)
    has_excluded = bool(_EXCLUDED_LABELS.intersection(host_labels))
    if model_count != 1 or pool_count != 1:
        return False
    return not has_excluded
469
470
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.
        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Filter in a single pass against the set of requested
            # labels, so that a host is recorded only once even when a
            # model is named twice or a host matches several models.
            model_labels = {constants.Labels.MODEL_PREFIX + model
                            for model in modellist}
            afehosts = [h for h in afehosts
                        if model_labels.intersection(h.labels)]
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)

    def __init__(self, histories, pools):
        """Build the inventory from already constructed histories.

        @param histories  List of `HostJobHistory` objects to record.
        @param pools      Pool names handed to each per-model
                          `_PoolSetInventory`.
        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}

    def __getitem__(self, key):
        """Return the `_PoolSetInventory` for the model `key`."""
        return self._modeldata.__getitem__(key)

    def __len__(self):
        """Return the number of models in the inventory."""
        return self._modeldata.__len__()

    def __iter__(self):
        """Iterate over the model names in the inventory."""
        return self._modeldata.__iter__()

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)

    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.iteritems() if h.get_total(pool)}

    def get_boards(self):
        """Return the set of `host_board` values of all recorded DUTs."""
        return self._boards
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800550
551
def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Yields the `(model, poolset)` pairs of `inventory.iteritems()`,
    keeping only reportable models.  A model is reportable if it has
    DUTs in both `spare_pool` and at least one other pool.

    @param inventory  The inventory whose models are to be filtered.
    @param spare_pool The spare pool to be tested for reporting.
    """
    for entry in inventory.iteritems():
        model_poolset = entry[1]
        spare_count = model_poolset.get_total(spare_pool)
        overall_count = model_poolset.get_total()
        # Skip models with no spares, and models that are all spares.
        if spare_count == 0 or spare_count == overall_count:
            continue
        yield entry[0], model_poolset
566
567
568def _all_dut_histories(inventory):
569 for poolset in inventory.itervalues():
570 for h in poolset.get_all_histories():
571 yield h
572
573
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname does not match `_HOSTNAME_PATTERN` have no
    known location, and are left out of the result.

    Implementation note: locations are compared as `(row, rack, host)`
    tuples of integers, so the ordering holds for numbers of any size.

    @param inventory_list  List of `HostJobHistory` objects to sort.
    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        # Tuples compare element by element, which gives the row, rack,
        # host precedence without packing the numbers into one integer.
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; histories are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list
608
609
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
                          Must not be empty; `min()` raises
                          `ValueError` on an empty dictionary.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    _NMODELS = 1000
    pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, pools)
    # Record the buffer count each model would have after the repairs.
    # Model names are discarded, as they don't contribute to the score.
    new_counts = []
    for model, count in buffer_counts.iteritems():
        if model in repair_inventory:
            count += repair_inventory[model].get_total()
        new_counts.append(count)
    # Find the worst available spares count, and how often it occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst
J. Richard Barnettef6839282015-06-01 16:00:35 -0700661
662
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Builds a message recommending a list of broken DUTs to be repaired.
    The DUTs are chosen according to these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    Candidates are windows of `num_recommend` consecutive DUTs taken
    from each lab's location-sorted list of broken DUTs.  Every window
    is scored with `_score_repair_set()`, and the highest scoring
    window across all labs is the one recommended.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    buffers_by_model = {}
    broken_duts = []
    for model, poolset in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if poolset.get_broken() == 0:
            continue
        buffers_by_model[model] = poolset.get_spares_buffer()
        broken_duts.extend(poolset.get_broken_list())
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_duts):
        # The first window must be scored unconditionally, in case the
        # lab has fewer than `num_recommend` broken DUTs and there are
        # no further windows to slide over.
        lab_slice = lab_duts[0 : num_recommend]
        lab_score = _score_repair_set(buffers_by_model, lab_slice)
        for start in range(1, len(lab_duts) - num_recommend + 1):
            candidate = lab_duts[start : start + num_recommend]
            candidate_score = _score_repair_set(buffers_by_model, candidate)
            if candidate_score > lab_score:
                lab_slice, lab_score = candidate, candidate_score
        if recommendation is None or lab_score > best_score:
            recommendation, best_score = lab_slice, lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        message.append('(No DUTs to repair)')
    else:
        for history in recommendation:
            hostname = history.host.hostname
            servo_name = servo_host.make_servo_hostname(hostname)
            has_servo = utils.host_is_in_lab_zone(servo_name)
            task = _get_diagnosis(history).task
            servo_text = 'Yes' if has_servo else 'No'
            message.append(line_fmt % (
                history.host.hostname, history.host_model,
                servo_text, task.job_url))
    return '\n'.join(message)
736
737
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Only models selected by `_reportable_models()` are counted, and
    only models with at least one broken DUT get a line in the
    per-model table.

    If no DUTs are counted at all, the percentages in the summary are
    all reported as 0 rather than dividing by zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model Avail   Bad  Idle  Good  Spare Total
        #     e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        broken, idle, working = element[2:5]
        if broken:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += broken
        nidle += idle
        nworking += working
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; ties go to the model with more
    # broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing was counted, so there's nothing to take a
        # percentage of.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
        ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
806
807
# Preamble placed at the top of the "pool inventory" e-mail message.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisfied that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
815'''
816
817
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory has one section for each pool in
    `CRITICAL_POOLS`.  A section lists, by model, the number of broken,
    idle, and working DUTs in that pool.  A model is listed only if it
    has at least one broken or idle DUT in the pool.  Within a section,
    models with more broken DUTs are listed first.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Models at full strength are not reported.
            if broken or idle:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((model, broken, idle, working, total))
        if not rows:
            message.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        for row in rows:
            message.append('%-20s %5d %5d %5d %5d' % row)
    return '\n'.join(message)
860
861
# Preamble placed at the top of the "idle inventory" message.
_IDLE_INVENTORY_HEADER = (
    "Notice to Infrastructure deputies: The hosts shown below haven't\n"
    "run any jobs for at least 24 hours. Please check each host; locked\n"
    "hosts should normally be unlocked; stuck jobs should normally be\n"
    "aborted.\n"
)
867'''
868
869
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists each idle host by hostname, together with
    the host's model and pool.  Hosts are gathered pool by pool over
    `MANAGED_POOLS`, using `get_idle_list()` for each model.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    message = [
        _IDLE_INVENTORY_HEADER,
        'Idle Host List:',
        row_fmt % ('Hostname', 'Model', 'Pool'),
    ]
    rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                rows.append(row_fmt % (dut.host.hostname, model, pool))
    if not rows:
        rows = ['(No idle DUTs)']
    message.extend(rows)
    return '\n'.join(message)
898
899
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are reported
    with `logging.error()` and otherwise ignored; a failure to write
    the log file does not prevent the attempt to send the e-mail.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
        'To: %s' % all_recipients,
        'Subject: %s' % subject,
        '', body, ''])
    if arguments.debug:
        # Parenthesized single argument: prints the same text whether
        # `print` is a statement or a function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` closes the file even if the write fails part way.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Deliberately broad:  sending is best-effort, and a failure
        # here must not abort the rest of the inventory run.
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
937
938
def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time.  Normally, that cost is paid by querying as we
    go.  With the `--debug` option, a human being is expected to be
    watching in real time, so this function forces the first
    (expensive) queries to happen up front, and writes a simple ASCII
    progress bar and a final tally to sys.stdout.

    The progress bar shows one character per model:  '+' for models
    5, 15, 25, ...; the tens digit for models 10, 20, 30, ...; and '.'
    for all others.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for position, counts in enumerate(inventory.itervalues(), 1):
        tens, ones = divmod(position, 10)
        if ones == 5:
            marker = '+'
        elif ones == 0:
            marker = '%d' % (tens % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\nFound %d broken DUTs\n' % total_broken)
971
972
def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The model inventory report is sent as a single e-mail to the
    `arguments.model_notify` addresses, made up of these parts:
      * A list of DUTs that are recommended to be repaired.  This part
        is optional, and only appears if the `--recommend` option is
        present.
      * The message from `_generate_model_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
            inventory, arguments.recommend)
        recommend_message = recommendation + '\n\n\n'
    model_message = _generate_model_inventory_message(inventory)
    tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    body = recommend_message + model_message
    _send_email(arguments, tag, subject, arguments.model_notify, body)
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001000
1001
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The pool inventory report is sent as a single e-mail to the
    `arguments.pool_notify` addresses, made up of these parts:
      * The message from `_generate_pool_inventory_message()`.
      * The message from `_generate_idle_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    body = '\n\n\n'.join(sections)
    _send_email(arguments, tag, subject, arguments.pool_notify, body)
1024
1025
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns `True` if the DUT's most recent history indicates a repair
             loop, otherwise `False`.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  Checking
    # the diagnosis task first avoids that cost for most DUTs.
    if _get_diagnosis(history).task.name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before enough repairs were seen.
    return False
1070
1071
def _report_untestable_dut(history, state):
    """Log one untestable DUT, and record it in the presence metric.

    @param history  History object for the DUT; its `hostname`,
                    `host_model`, and `host_pool` are reported.
    @param state    String naming the untestable state to report.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    log_format = ('DUT in state %(state)s: %(dut_hostname)s, '
                  'model: %(model)s, pool: %(pool)s')
    logging.info(log_format, fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001082
Richard Barnettecf5d8342017-10-24 18:13:11 -07001083
def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    A DUT in the inventory is judged "untestable" if it meets one of
    two criteria:
      * The DUT is working, but `_dut_in_repair_loop()` finds it to be
        stuck in a repair loop.  This is reported as state
        'repair_loop'.
      * The DUT is idle, and is not locked.  This is reported as state
        'idle_unlocked'.

    Each untestable DUT found is passed to `_report_untestable_dut()`.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match _HOSTNAME_PATTERN
        # shouldn't be possible.  However, we don't want arbitrary
        # strings being attached to the 'dut_hostname' field, so for
        # safety, we exclude all anomalies.
        if _HOSTNAME_PATTERN.match(history.hostname):
            state = None
            if _host_is_working(history):
                if _dut_in_repair_loop(history):
                    state = 'repair_loop'
            elif _host_is_idle(history) and not history.host.locked:
                state = 'idle_unlocked'
            if state is not None:
                _report_untestable_dut(history, state)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001119
1120
def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Writes debug log messages announcing the start of the run and
    which reports will be included.  Returns a string based on
    `startup_time`, formatted in local time as '%Y-%m-%d.%H', that
    will be used to identify this run in log files and e-mail
    messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string that will be used to identify this run
             in logs and email output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', timestamp)
    notes = []
    if arguments.model_notify:
        if arguments.recommend:
            notes.append('Will include repair recommendations')
        notes.append('Will include model inventory')
    if arguments.pool_notify:
        notes.append('Will include pool inventory')
    for note in notes:
        logging.debug(note)
    return timestamp
1143
1144
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours leading up to
    `end_time`, for the models named in `arguments.modelnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns The object returned by `_LabInventory.create_inventory()`.
    """
    # `arguments.duration` is in hours; the time range is in seconds.
    search_seconds = arguments.duration * 60 * 60
    start_time = end_time - search_seconds
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
        afe, start_time, end_time, arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return inventory
1159
1160
def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Creates the inventory, using the current time both as the end of
    the search range and as the source of the run's timestamp.  Then
    runs, in order, each step selected by the parsed command-line
    arguments:
      * With `debug`, gather all model counts up front.
      * With `model_notify`, the model inventory report.
      * With `pool_notify`, the pool inventory report.
      * With `report_untestable`, the scan for untestable DUTs.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    now = time.time()
    run_id = _log_startup(arguments, now)
    lab_inventory = _create_inventory(arguments, now)
    if arguments.debug:
        _populate_model_counts(lab_inventory)
    if arguments.model_notify:
        _perform_model_inventory(arguments, lab_inventory, run_id)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab_inventory, run_id)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab_inventory)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001181
1182
def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Each address has surrounding whitespace stripped.  Empty entries,
    such as those produced by a trailing or doubled comma, are dropped
    so that they can't be mistaken for a recipient.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.
    """
    newlist = []
    for arg in address_list:
        stripped = [email.strip() for email in arg.split(',')]
        newlist.extend([email for email in stripped if email])
    return newlist
1194
1195
def _verify_arguments(arguments):
    """Check the parsed command line for semantic validity.

    The `--model-notify` and `--pool-notify` values, which may each be
    given several times and may each hold comma separated addresses,
    are flattened in place into single lists of addresses.

    At least one report must be requested unless `--debug` is on.
    With `--debug` and no report requested, both notify lists are set
    to a one-element placeholder list so that neither e-mail report is
    skipped for having an empty list.

    On failure an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; updated in place.
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True

    if arguments.debug:
        # Debug with nothing requested means "run all the e-mail
        # reports".  An empty notify list causes a report to be
        # skipped, so give each list a placeholder entry.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001232
1233
def _get_default_logdir(script):
    """Compute the default value for the `--logdir` option.

    The result is `_LOGDIR` joined onto the parent of the directory
    that holds the script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
1246
1247
def _parse_command(argv):
    """Build the argument parser, parse `argv`, and validate the result.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return The namespace returned by `ArgumentParser.parse_args()`,
            or `None` if `_verify_arguments()` rejects it.
    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = parser.add_argument

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    add_option('-d', '--duration', type=int, metavar='HOURS',
               default=_DEFAULT_DURATION, help=duration_help)

    # The notify options may be repeated; each use appends to a list.
    add_option('--model-notify', action='append',
               metavar='ADDRESS', default=[],
               help='Generate model inventory message, '
                    'and send it to the given e-mail address(es)')
    add_option('--pool-notify', action='append',
               metavar='ADDRESS', default=[],
               help='Generate pool inventory message, '
                    'and send it to the given address(es)')

    add_option('-r', '--recommend', type=int, default=None,
               help=('Specify how many DUTs should be '
                     'recommended for repair (default: no '
                     'recommendation)'))
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug', action='store_true',
               help='Print e-mail, metrics messages on stdout '
                    'without sending them.')
    # `store_false` leaves `use_metrics` True unless the flag is given.
    add_option('--no-metrics', dest='use_metrics',
               action='store_false',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*', metavar='MODEL',
               help='names of models to report on '
                    '(default: all models)')

    arguments = parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None
1296
1297
1298def _configure_logging(arguments):
1299 """Configure the `logging` module for our needs.
1300
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001301 How we log depends on whether the `--debug` option was provided on
1302 the command line.
1303 * Without the option, we configure the logging to capture all
1304 potentially relevant events in a log file. The log file is
1305 configured to rotate once a week on Friday evening, preserving
1306 ~3 months worth of history.
1307 * With the option, we expect stdout to contain other
1308 human-readable output (including the contents of the e-mail
Richard Barnettecf5d8342017-10-24 18:13:11 -07001309 messages), so we restrict the output to INFO level.
1310
1311 For convenience, when `--debug` is on, the logging format has
1312 no adornments, so that a call like `logging.info(msg)` simply writes
1313 `msg` to stdout, plus a trailing newline.
J. Richard Barnette96db3492015-03-27 17:23:52 -07001314
1315 @param arguments Command-line arguments as returned by
1316 `ArgumentParser`
J. Richard Barnette96db3492015-03-27 17:23:52 -07001317 """
J. Richard Barnettef6839282015-06-01 16:00:35 -07001318 root_logger = logging.getLogger()
J. Richard Barnette02e82432015-10-13 16:02:47 -07001319 if arguments.debug:
J. Richard Barnettef6839282015-06-01 16:00:35 -07001320 root_logger.setLevel(logging.INFO)
Alex Zamorzaev8e5317e52018-09-25 00:42:30 +00001321 handler = logging.StreamHandler(sys.stdout)
1322 handler.setFormatter(logging.Formatter())
J. Richard Barnette96db3492015-03-27 17:23:52 -07001323 else:
Richard Barnette5af97402016-04-18 11:00:26 -07001324 if not os.path.exists(arguments.logdir):
1325 os.mkdir(arguments.logdir)
Alex Zamorzaev8e5317e52018-09-25 00:42:30 +00001326 root_logger.setLevel(logging.DEBUG)
1327 logfile = os.path.join(arguments.logdir, _LOGFILE)
1328 handler = logging.handlers.TimedRotatingFileHandler(
1329 logfile, when='W4', backupCount=13)
1330 formatter = logging.Formatter(_LOG_FORMAT,
1331 time_utils.TIME_FMT)
1332 handler.setFormatter(formatter)
J. Richard Barnettef6839282015-06-01 16:00:35 -07001333 # TODO(jrbarnette) This is gross. Importing client.bin.utils
1334 # implicitly imported logging_config, which calls
1335 # logging.basicConfig() *at module level*. That gives us an
1336 # extra logging handler that we don't want. So, clear out all
1337 # the handlers here.
1338 for h in root_logger.handlers:
1339 root_logger.removeHandler(h)
1340 root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001341
1342
def main(argv):
    """Standard main routine.

    Parses and validates the command line (exiting with status 1 if it
    is rejected), configures logging, and runs the requested inventory
    reports.  Unless `--no-metrics` was given, the run happens inside a
    ts_mon context that records a duration timer and a success/failure
    tick counter.

    A `KeyboardInterrupt` ends the run quietly; any other exception is
    logged and re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if not arguments.use_metrics:
            _perform_inventory_reports(arguments)
        else:
            # In debug mode, metrics go to /dev/null rather than to
            # monarch.
            metrics_file = None
            if arguments.debug:
                logging.info('Debug mode: Will not report metrics to monarch.')
                metrics_file = '/dev/null'
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False):
                success = False
                try:
                    with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                        _perform_inventory_reports(arguments)
                    success = True
                finally:
                    # Always record the tick, flagged with whether the
                    # reports ran to completion, then flush.
                    tick = metrics.Counter('%s/tick' % _METRICS_PREFIX)
                    tick.increment(fields={'success': success})
                    metrics.Flush()
    except KeyboardInterrupt:
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise
J. Richard Barnette96db3492015-03-27 17:23:52 -07001380
1381
def get_inventory(afe):
    """Create a `_LabInventory` covering the 24 hours ending now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The result of `_LabInventory.create_inventory()`.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - seconds_per_day, now)
1386
1387
def get_managed_boards(afe):
    """Return `get_boards()` of a freshly created inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return Whatever the inventory's `get_boards()` method returns.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001390
1391
if __name__ == '__main__':
    # Run as a script: main() expects the full argument vector,
    # including the command name in argv[0].
    main(sys.argv)