blob: 8b9b646d02e0f3342fd150e75a38d53b04647b28 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
Richard Barnette5de01eb2017-12-15 09:53:42 -08009model and pool, and determines whether each DUT is working or
J. Richard Barnette96db3492015-03-27 17:23:52 -070010broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
Richard Barnette5de01eb2017-12-15 09:53:42 -080013usage: lab_inventory.py [ options ] [ model ... ]
J. Richard Barnette96db3492015-03-27 17:23:52 -070014
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
Richard Barnette5de01eb2017-12-15 09:53:42 -080020--model-notify <address>[,<address>]
21 Send the "model status" e-mail to all the specified e-mail
J. Richard Barnette96db3492015-03-27 17:23:52 -070022 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
J. Richard Barnette1df6a562015-06-09 10:06:17 -070028--recommend <number>
Richard Barnette5de01eb2017-12-15 09:53:42 -080029 When generating the "model status" e-mail, include a list of
J. Richard Barnette1df6a562015-06-09 10:06:17 -070030 <number> specific DUTs to be recommended for repair.
31
Richard Barnette1ca30e62018-04-09 16:45:58 -070032--report-untestable
33 Scan the inventory for DUTs that can't test because they're stuck in
34 repair loops, or because the scheduler can't give them work.
Richard Barnettecf5d8342017-10-24 18:13:11 -070035
J. Richard Barnette96db3492015-03-27 17:23:52 -070036--logdir <directory>
37 Log progress and actions in a file under this directory. Text
38 of any e-mail sent will also be logged in a timestamped file in
39 this directory.
40
J. Richard Barnette02e82432015-10-13 16:02:47 -070041--debug
Richard Barnettecf5d8342017-10-24 18:13:11 -070042 Suppress all logging, metrics reporting, and sending e-mail.
43 Instead, write the output that would be generated onto stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -070044
Richard Barnette5de01eb2017-12-15 09:53:42 -080045<model> arguments:
46 With no arguments, gathers the status for all models in the lab.
47 With one or more named models on the command line, restricts
48 reporting to just those models.
J. Richard Barnette96db3492015-03-27 17:23:52 -070049"""
50
51
52import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080053import collections
J. Richard Barnette96db3492015-03-27 17:23:52 -070054import logging
55import logging.handlers
56import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070057import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070058import sys
59import time
60
61import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070062from autotest_lib.client.bin import utils
Richard Barnette6f6ce322018-09-07 16:23:20 +000063from autotest_lib.client.common_lib import time_utils
Jacob Kopczynski7c4c9542018-08-13 17:24:41 -070064from autotest_lib.frontend.afe.json_rpc import proxy
Xixuan Wu93e646c2017-12-07 18:36:10 -080065from autotest_lib.server import constants
Richard Barnettecf5d8342017-10-24 18:13:11 -070066from autotest_lib.server import site_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070067from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070068from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070069from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070070from autotest_lib.site_utils import gmail_lib
Richard Barnettecf5d8342017-10-24 18:13:11 -070071from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070072
73
# Local aliases for the pool definitions in `constants.Pools`.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script. Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.
#   + 'board:veyron_rialto' due to crbug.com/854404

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',
                    'board:veyron_rialto'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time to search in order to determine
#     DUT status. Per the --duration usage text, the unit is hours.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option. The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs. Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
#
# The four capture groups are, in order: lab, row, rack, host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#     The number of repeated Repair tasks that must be seen to declare
#     that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4


# Common path prefix shared by the metrics declared below.
_METRICS_PREFIX = 'chromeos/autotest/inventory'
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    _METRICS_PREFIX + '/untestable',
    'DUTs that cannot be scheduled for testing')

_MISSING_DUT_METRIC = metrics.Counter(
    _METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'
    ' because they are invalid or deleted')

# _Diagnosis - namedtuple corresponding to the return value from
#     `HostHistory.last_diagnosis()`
_Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])
139
def _get_diagnosis(history):
    """Return the diagnosis to be used for one DUT's history.

    Wraps the result of `history.last_diagnosis()` in a `_Diagnosis`
    tuple. If the DUT is diagnosed as broken, but the diagnosing task
    ended before `history.start_time`, the status is reported as
    `status_history.UNUSED` instead (the task is kept as is).

    If the lookup raises `proxy.JSONRPCException`, the exception is
    logged as a warning and `_Diagnosis(None, None)` is returned.

    On every call, `_MISSING_DUT_METRIC` is incremented once; its
    'presence' field is `False` only when the RPC exception was caught.

    @param history  The `HostJobHistory` object to be diagnosed.
    @return A `_Diagnosis` namedtuple of `(status, task)`.
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            return _Diagnosis(status_history.UNUSED, diagnosis.task)
        else:
            return diagnosis
    except proxy.JSONRPCException as e:
        # `logging.warn` is a deprecated alias; use `logging.warning`.
        logging.warning(e)
        dut_present = False
    finally:
        # Runs on both the normal return paths and the error path.
        _MISSING_DUT_METRIC.increment(
            fields={'host': history.hostname, 'presence': dut_present})
    # Reached only when the RPC exception was caught above.
    return _Diagnosis(None, None)
156
Richard Barnette1ca30e62018-04-09 16:45:58 -0700157
def _host_is_working(history):
    """Return whether the diagnosed status of `history` is WORKING.

    @param history  The `HostJobHistory` object to be tested.
    """
    status = _get_diagnosis(history).status
    return status == status_history.WORKING
Richard Barnettee8eee312018-04-27 13:12:04 -0400160
161
def _host_is_broken(history):
    """Return whether the diagnosed status of `history` is BROKEN.

    @param history  The `HostJobHistory` object to be tested.
    """
    status = _get_diagnosis(history).status
    return status == status_history.BROKEN
Richard Barnettee8eee312018-04-27 13:12:04 -0400164
165
def _host_is_idle(history):
    """Return whether the diagnosed status of `history` counts as idle.

    A DUT is idle when its status is either UNUSED or UNKNOWN.

    @param history  The `HostJobHistory` object to be tested.
    """
    unused = status_history.UNUSED
    unknown = status_history.UNKNOWN
    status = _get_diagnosis(history).status
    return status in (unused, unknown)
Richard Barnettee8eee312018-04-27 13:12:04 -0400169
170
Richard Barnette5de01eb2017-12-15 09:53:42 -0800171class _HostSetInventory(object):
172 """Maintains a set of related `HostJobHistory` objects.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700173
Richard Barnettee8eee312018-04-27 13:12:04 -0400174 Current usage of this class is that all DUTs are part of a single
175 scheduling pool of DUTs for a single model; however, this class make
176 no assumptions about the actual relationship among the DUTs.
177
Richard Barnette5de01eb2017-12-15 09:53:42 -0800178 The collection is segregated into disjoint categories of "working",
179 "broken", and "idle" DUTs. Accessor methods allow finding both the
180 list of DUTs in each category, as well as counts of each category.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700181
J. Richard Barnettef6839282015-06-01 16:00:35 -0700182 Performance note: Certain methods in this class are potentially
183 expensive:
184 * `get_working()`
185 * `get_working_list()`
186 * `get_broken()`
187 * `get_broken_list()`
xixuan12ce04f2016-03-10 13:16:30 -0800188 * `get_idle()`
189 * `get_idle_list()`
J. Richard Barnettef6839282015-06-01 16:00:35 -0700190 The first time any one of these methods is called, it causes
191 multiple RPC calls with a relatively expensive set of database
192 queries. However, the results of the queries are cached in the
193 individual `HostJobHistory` objects, so only the first call
194 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700195
xixuan12ce04f2016-03-10 13:16:30 -0800196 Additionally, `get_working_list()`, `get_broken_list()` and
197 `get_idle_list()` cache their return values to avoid recalculating
Richard Barnette5de01eb2017-12-15 09:53:42 -0800198 lists at every call; this caching is separate from the caching of
199 RPC results described above.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700200
Richard Barnette5de01eb2017-12-15 09:53:42 -0800201 This class is deliberately constructed to delay the RPC cost until
202 the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700203 `record_host()`) so that it's possible to construct a complete
204 `_LabInventory` without making the expensive queries at creation
Richard Barnette5de01eb2017-12-15 09:53:42 -0800205 time. `_populate_model_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700206 """
207
208 def __init__(self):
209 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700210 self._working_list = None
211 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800212 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700213
J. Richard Barnette96db3492015-03-27 17:23:52 -0700214 def record_host(self, host_history):
215 """Add one `HostJobHistory` object to the collection.
216
217 @param host_history The `HostJobHistory` object to be
218 remembered.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700219 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700220 self._working_list = None
221 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800222 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700223 self._histories.append(host_history)
224
J. Richard Barnettef6839282015-06-01 16:00:35 -0700225 def get_working_list(self):
226 """Return a list of all working DUTs in the pool.
227
Richard Barnettee8eee312018-04-27 13:12:04 -0400228 Filter `self._histories` for histories where the DUT is
229 diagnosed as working.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700230
231 Cache the result so that we only cacluate it once.
232
233 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700234 """
235 if self._working_list is None:
236 self._working_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400237 if _host_is_working(h)]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700238 return self._working_list
239
J. Richard Barnette96db3492015-03-27 17:23:52 -0700240 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700241 """Return the number of working DUTs in the pool."""
242 return len(self.get_working_list())
243
J. Richard Barnettef6839282015-06-01 16:00:35 -0700244 def get_broken_list(self):
245 """Return a list of all broken DUTs in the pool.
246
Richard Barnettee8eee312018-04-27 13:12:04 -0400247 Filter `self._histories` for histories where the DUT is
248 diagnosed as broken.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700249
250 Cache the result so that we only cacluate it once.
251
252 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700253 """
254 if self._broken_list is None:
255 self._broken_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400256 if _host_is_broken(h)]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700257 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700258
J. Richard Barnette96db3492015-03-27 17:23:52 -0700259 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700260 """Return the number of broken DUTs in the pool."""
261 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700262
xixuan12ce04f2016-03-10 13:16:30 -0800263 def get_idle_list(self):
264 """Return a list of all idle DUTs in the pool.
265
Richard Barnettee8eee312018-04-27 13:12:04 -0400266 Filter `self._histories` for histories where the DUT is
267 diagnosed as idle.
xixuan12ce04f2016-03-10 13:16:30 -0800268
269 Cache the result so that we only cacluate it once.
270
271 @return A list of HostJobHistory objects.
xixuan12ce04f2016-03-10 13:16:30 -0800272 """
xixuan12ce04f2016-03-10 13:16:30 -0800273 if self._idle_list is None:
274 self._idle_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400275 if _host_is_idle(h)]
xixuan12ce04f2016-03-10 13:16:30 -0800276 return self._idle_list
277
xixuan12ce04f2016-03-10 13:16:30 -0800278 def get_idle(self):
279 """Return the number of idle DUTs in the pool."""
280 return len(self.get_idle_list())
281
J. Richard Barnette96db3492015-03-27 17:23:52 -0700282 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700283 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700284 return len(self._histories)
285
Richard Barnettee8eee312018-04-27 13:12:04 -0400286 def get_all_histories(self):
287 return self._histories
288
J. Richard Barnette96db3492015-03-27 17:23:52 -0700289
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs. Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note: This class relies on `_HostSetInventory`. Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create one empty `_HostSetInventory` per pool.

        @param pools  Iterable of the pool names to be tracked.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host is filed under `host_history.host_pool`; a `KeyError`
        is raised if that pool wasn't passed to the constructor.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted. If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed. If `None`,
                              return a new list combining all pools;
                              otherwise return whatever
                              `get_pool_list` returns for that pool.
        """
        if pool is None:
            combined = []
            for host_set in self._histories_by_pool.values():
                combined.extend(get_pool_list(host_set))
            return combined
        return get_pool_list(self._histories_by_pool[pool])

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as working.

        @param pool  The pool to be listed. If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as broken.

        @param pool  The pool to be listed. If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool  The pool to be listed. If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool. This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool to be treated as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Iterate over every recorded `HostJobHistory` object.

        This is a generator; nothing is looked up until iteration
        starts.

        @param pool  The pool to be traversed. If `None`, traverse
                     all pools.
        """
        if pool is None:
            host_sets = self._histories_by_pool.values()
        else:
            host_sets = [self._histories_by_pool[pool]]
        for host_set in host_sets:
            for h in host_set.get_all_histories():
                yield h
443
J. Richard Barnette96db3492015-03-27 17:23:52 -0700444
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of its labels may be in the
    `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.
    @return `True` if the host is eligible, `False` otherwise.
    """
    # A DUT lacking a unique 'model' or 'pool' label isn't supposed to
    # be in the managed inventory at all; finding one generally means
    # there's an error in the database. Unfortunately, such errors
    # have been seen to occur from time to time.
    #
    # The _LabInventory constructor requires hosts to satisfy the
    # label restrictions, and may fail if they don't. One bad entry
    # shouldn't fail the whole inventory run, so the offenders are
    # filtered out here to keep them out of the inventory.
    model_prefix = constants.Labels.MODEL_PREFIX
    pool_prefix = constants.Labels.POOL_PREFIX
    model_count = len([name for name in afehost.labels
                       if name.startswith(model_prefix)])
    pool_count = len([name for name in afehost.labels
                      if name.startswith(pool_prefix)])
    has_excluded = bool(_EXCLUDED_LABELS.intersection(afehost.labels))
    if model_count != 1 or pool_count != 1:
        return False
    return not has_excluded
470
471
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model. Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list. If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory` objects.
        @param end_time    End time for the `HostJobHistory` objects.
        @param modellist   List of models to include. If empty or
                           `None`, include all available models.
        @return A `_LabInventory` object for the specified models.
        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path. This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Filtering in one pass against a set of labels keeps each
            # host at most once, even if `modellist` repeats a model or
            # a host carries more than one of the requested labels.
            model_labels = {constants.Labels.MODEL_PREFIX + model
                            for model in modellist}
            afehosts = [h for h in afehosts
                        if model_labels.intersection(h.labels)]
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)

    def __init__(self, histories, pools):
        """Build the per-model inventory from a list of histories.

        @param histories  List of `HostJobHistory` objects to record.
        @param pools      Pool names passed to each model's
                          `_PoolSetInventory`.
        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}

    def __getitem__(self, key):
        return self._modeldata[key]

    def __len__(self):
        return len(self._modeldata)

    def __iter__(self):
        return iter(self._modeldata)

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)

    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool  The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.iteritems() if h.get_total(pool)}

    def get_boards(self):
        """Return the set of `host_board` values seen in the inventory."""
        return self._boards
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800551
552
def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Generates the `(model, poolset)` pairs from `inventory.iteritems()`
    that are reportable. A model is reportable when it has DUTs in
    `spare_pool` and also has DUTs in at least one other pool.

    @param inventory   The inventory to be filtered.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.iteritems():
        spare_count = poolset.get_total(spare_pool)
        overall_count = poolset.get_total()
        # Skip models with no spares, and models with only spares.
        if spare_count in (0, overall_count):
            continue
        yield model, poolset
567
568
569def _all_dut_histories(inventory):
570 for poolset in inventory.itervalues():
571 for h in poolset.get_all_histories():
572 yield h
573
574
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location. The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` are left
    out of the result.

    Implementation note: each location is sorted as a
    `(row, rack, host)` tuple of integers, so the ordering is correct
    for numbers of any size.

    @param inventory_list  A list of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location:
            lab = location.group(1)
            # Tuples compare element by element, which gives the
            # row/rack/host ordering without packing into one number.
            key = tuple(int(idx) for idx in location.group(2, 3, 4))
            lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the key alone, so that history objects are never
        # compared with each other when keys are equal.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list
609
610
def _score_repair_set(buffer_counts, repair_list):
    """Rate a proposed set of DUT repairs with a single number.

    The score is based on the buffer counts that would result if every
    DUT in `repair_list` were repaired.  Two quantities go into it:
      * The worst (smallest) resulting buffer count of any model;
        higher is better.  This is the more significant part.
      * The number of models sitting at that worst count; lower is
        better.  This is the less significant part.

    Implementation note:  The two parts are combined by scaling the
    worst count by 1000, so the score could fail to reflect the
    intended criteria if there are more than 1000 models in the
    inventory.

    N.B. `buffer_counts` must not be empty; indexing the list of
    resulting counts raises `IndexError` otherwise.

    @param buffer_counts  A dictionary mapping model names to the size
                          of the model's spares buffer.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score; higher means a better repair set.
    """
    _NMODELS = 1000
    repair_pools = set(h.host_pool for h in repair_list)
    repaired = _LabInventory(repair_list, repair_pools)
    # Only the counts matter for scoring, so the model names are
    # dropped once the post-repair count has been calculated.
    counts_after_repair = []
    for model, buffer_count in buffer_counts.iteritems():
        gained = repaired[model].get_total() if model in repaired else 0
        counts_after_repair.append(buffer_count + gained)
    # Sorting puts the worst case first; then count how many models
    # share that worst case.
    ordered = sorted(counts_after_repair)
    worst_count = ordered[0]
    num_worst = ordered.count(worst_count)
    return _NMODELS * worst_count - num_worst
J. Richard Barnettef6839282015-06-01 16:00:35 -0700662
663
def _generate_repair_recommendation(inventory, num_recommend):
    """Build the "repair recommendations" section of the model e-mail.

    The message lists broken DUTs recommended for repair, chosen
    according to these rules:
      * At most `num_recommend` DUTs are listed.
      * All listed DUTs are in the same lab.
      * Listed DUTs are adjacent in location order, to give some
        degree of physical proximity.
      * Repairing DUTs of models with a low spares buffer matters
        more than repairing DUTs of models with larger buffers.

    Candidates are every run of `num_recommend` consecutive broken
    DUTs (in location order) within each lab; the run with the best
    `_score_repair_set()` score wins, with earlier runs winning ties.
    This guarantees that at least one DUT from a model with the lowest
    spares buffer is recommended, and when several models share that
    lowest buffer, it favors runs covering more of those models.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    buffers_by_model = {}
    broken_duts = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() == 0:
            continue
        buffers_by_model[model] = counts.get_spares_buffer()
        broken_duts.extend(counts.get_broken_list())
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_duts):
        # The first window is scored unconditionally, so that a lab
        # with fewer than `num_recommend` broken DUTs still produces
        # a candidate.
        lab_slice = lab_duts[0 : num_recommend]
        lab_score = _score_repair_set(buffers_by_model, lab_slice)
        # Slide the window one DUT at a time across the rest of the
        # lab; only a strictly better score displaces the current pick.
        for start in range(1, len(lab_duts) - num_recommend + 1):
            candidate = lab_duts[start : start + num_recommend]
            candidate_score = _score_repair_set(buffers_by_model, candidate)
            if candidate_score > lab_score:
                lab_slice, lab_score = candidate, candidate_score
        if recommendation is None or lab_score > best_score:
            recommendation, best_score = lab_slice, lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for dut in recommendation:
        servo_name = servo_host.make_servo_hostname(dut.host.hostname)
        has_servo = utils.host_is_in_lab_zone(servo_name)
        diagnosis_task = _get_diagnosis(dut).task
        message.append(line_fmt % (
                dut.host.hostname, dut.host_model,
                'Yes' if has_servo else 'No', diagnosis_task.job_url))
    return '\n'.join(message)
737
738
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.  The overall totals cover every reportable
    model, but the per-model table lists only models that have at least
    one broken DUT.

    If the reportable models contribute no DUTs at all, every
    percentage in the summary is reported as 0 rather than dividing
    by zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail  Bad   Idle  Good  Spare  Total
        #     e[0]   e[1]   e[2]  e[3]  e[4]  e[5]   e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only models with broken DUTs appear in the per-model table.
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; among equal buffers, the model
    # with the most broken DUTs first.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derive the last percentage from the other two, so that the
        # three always add up to exactly 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # No reportable DUTs: there's nothing to take a percentage of.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
807
808
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700809_POOL_INVENTORY_HEADER = '''\
Richard Barnette5de01eb2017-12-15 09:53:42 -0800810Notice to Infrastructure deputies: All models shown below are at
J. Richard Barnettec9a143c2015-06-04 11:11:19 -0700811less than full strength, please take action to resolve the issues.
812Once you're satisified that failures won't recur, failed DUTs can
813be replaced with spares by running `balance_pool`. Detailed
814instructions can be found here:
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700815 http://go/cros-manage-duts
816'''
817
818
def _generate_pool_inventory_message(inventory):
    """Build the body of the "pool inventory" e-mail.

    For each pool in `CRITICAL_POOLS`, the message has a table by
    model giving the number of broken, idle, and working DUTs in that
    pool, along with the pool's total for the model.  A model appears
    in a pool's table only if it has at least one broken or idle DUT
    in that pool; rows are ordered with the most broken DUTs first.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is set off by an extra
        # blank line.
        separator = '\n' if index else ''
        message.append(
                '%sStatus for pool:%s, by model:' % (separator, pool))
        message.append(
                '%-20s %5s %5s %5s %5s' % (
                    'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if broken or idle:
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            message.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        message.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
    return '\n'.join(message)
861
862
# Preamble placed at the top of the "idle inventory" message, ahead
# of the list of idle hosts.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
869
870
def _generate_idle_inventory_message(inventory):
    """Build the body of the "idle inventory" e-mail.

    The idle inventory lists, for each pool in `MANAGED_POOLS`, the
    hostname of every DUT identified as idle, together with the DUT's
    model and pool.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    message = [_IDLE_INVENTORY_HEADER,
               'Idle Host List:',
               '%-30s %-20s %s' % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        message.append('(No idle DUTs)')
    else:
        message.extend('%-30s %-20s %s' % row for row in idle_rows)
    return '\n'.join(message)
899
900
J. Richard Barnette96db3492015-03-27 17:23:52 -0700901def _send_email(arguments, tag, subject, recipients, body):
902 """Send an inventory e-mail message.
903
Richard Barnette5de01eb2017-12-15 09:53:42 -0800904 The message is logged in the selected log directory using `tag` for
905 the file name.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700906
Richard Barnette5de01eb2017-12-15 09:53:42 -0800907 If the --debug option was requested, the message is neither logged
908 nor sent, but merely printed on stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700909
910 @param arguments Parsed command-line options.
911 @param tag Tag identifying the inventory for logging
912 purposes.
913 @param subject E-mail Subject: header line.
914 @param recipients E-mail addresses for the To: header line.
915 @param body E-mail message body.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700916 """
917 logging.debug('Generating email: "%s"', subject)
918 all_recipients = ', '.join(recipients)
919 report_body = '\n'.join([
920 'To: %s' % all_recipients,
921 'Subject: %s' % subject,
922 '', body, ''])
J. Richard Barnette02e82432015-10-13 16:02:47 -0700923 if arguments.debug:
J. Richard Barnette96db3492015-03-27 17:23:52 -0700924 print report_body
925 else:
926 filename = os.path.join(arguments.logdir, tag)
927 try:
928 report_file = open(filename, 'w')
929 report_file.write(report_body)
930 report_file.close()
931 except EnvironmentError as e:
932 logging.error('Failed to write %s: %s', filename, e)
933 try:
934 gmail_lib.send_email(all_recipients, subject, body)
935 except Exception as e:
936 logging.error('Failed to send e-mail to %s: %s',
937 all_recipients, e)
938
939
Richard Barnette5de01eb2017-12-15 09:53:42 -0800940def _populate_model_counts(inventory):
941 """Gather model counts while providing interactive feedback.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700942
943 Gathering the status of all individual DUTs in the lab can take
944 considerable time (~30 minutes at the time of this writing).
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700945 Normally, we pay that cost by querying as we go. However, with
946 the `--debug` option, we expect a human being to be watching the
Richard Barnette5de01eb2017-12-15 09:53:42 -0800947 progress in real time. So, we force the first (expensive) queries
948 to happen up front, and provide simple ASCII output on sys.stdout
949 to show a progress bar and results.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700950
Richard Barnette5de01eb2017-12-15 09:53:42 -0800951 @param inventory `_LabInventory` object from which to gather
952 counts.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700953 """
954 n = 0
955 total_broken = 0
Richard Barnette5de01eb2017-12-15 09:53:42 -0800956 for counts in inventory.itervalues():
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700957 n += 1
958 if n % 10 == 5:
959 c = '+'
960 elif n % 10 == 0:
961 c = '%d' % ((n / 10) % 10)
962 else:
963 c = '.'
964 sys.stdout.write(c)
965 sys.stdout.flush()
Richard Barnette5de01eb2017-12-15 09:53:42 -0800966 # This next call is where all the time goes - it forces all of a
967 # model's `HostJobHistory` objects to query the database and
968 # cache their results.
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700969 total_broken += counts.get_broken()
970 sys.stdout.write('\n')
971 sys.stdout.write('Found %d broken DUTs\n' % total_broken)
972
973
def _perform_model_inventory(arguments, inventory, timestamp):
    """Generate and send the model inventory report.

    The report is made up of these parts, in order:
      * A list of DUTs that are recommended to be repaired.  This part
        is optional, and is only included if the `--recommend` option
        is present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    The report goes to the addresses in `arguments.model_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommend_message = _generate_repair_recommendation(
                inventory, arguments.recommend) + '\n\n\n'
    report = recommend_message + _generate_model_inventory_message(inventory)
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                report)
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001001
1002
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Generate and send the pool inventory report.

    The report is made up of these parts, in order:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    The report goes to the addresses in `arguments.pool_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                '\n\n\n'.join(sections))
1025
1026
def _dut_in_repair_loop(history):
    """Determine whether a DUT's history shows it stuck in a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.  The history is scanned until either some
    task rules out a loop, or `_REPAIR_LOOP_THRESHOLD` successful
    repairs have been seen.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns  Return a true value if the DUT's most recent history
              indicates a repair loop.  Otherwise the result is false:
              `False` when a task rules out a loop, or `None` when the
              history runs out before the threshold is reached.
    """
    # Cheap pre-check, before paying for the full history.
    #
    # Because the caller only hands us working DUTs, the diagnosis
    # task has already been fetched, and is known to have succeeded.
    # That task is among the tasks the scan below would look at.  So,
    # if it isn't a repair task, the history holds a successful
    # non-repair task, and the DUT can't be looping.
    #
    # Iterating over `history` is very expensive, because the full
    # history must be fetched no matter how few tasks get examined.
    # At the time of this writing, this pre-check reduces the cost of
    # finding loops in the full inventory from hours to minutes.
    if _get_diagnosis(history).task.name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        # Any one of these rules out a loop:
        #   * The task is a test, not a special task.
        #   * The task failed a repair.
        #   * A non-repair task succeeded.
        if (not task.is_special
                or task.diagnosis == status_history.BROKEN
                or (task.diagnosis == status_history.WORKING
                    and task.name != 'Repair')):
            return False
        # What's left is either a failed non-repair task, or a
        # successful repair; only the repairs count toward the
        # threshold.
        if task.name != 'Repair':
            continue
        successful_repairs += 1
        if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
            return True
1071
1072
def _report_untestable_dut(history, state):
    """Log and record one DUT found to be untestable.

    Writes an info-level log message naming the DUT, and sets
    `_UNTESTABLE_PRESENCE_METRIC` to true for the DUT's hostname,
    model, pool, and state.

    @param history  History object for the DUT; supplies `hostname`,
                    `host_model`, and `host_pool`.
    @param state    The untestable state being reported for the DUT.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    logging.info('DUT in state %(state)s: %(dut_hostname)s, '
                 'model: %(model)s, pool: %(pool)s', fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001083
Richard Barnettecf5d8342017-10-24 18:13:11 -07001084
def _report_untestable_dut_metrics(inventory):
    """Find and report the DUTs that are unable to run tests.

    A DUT in the inventory counts as "untestable" in either of these
    cases:
      * The DUT is working, but stuck in a repair loop; that is, it
        regularly passes repair, but never passes other operations.
        These are reported with state 'repair_loop'.
      * The DUT is idle (it runs no tasks at all), but is not locked.
        These are reported with state 'idle_unlocked'.

    Every DUT history in the given inventory is checked, and each
    untestable DUT is reported via a Monarch presence metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for dut_history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match
        # _HOSTNAME_PATTERN shouldn't be possible.  However, we
        # don't want arbitrary strings being attached to the
        # 'dut_hostname' field, so for safety, we exclude all
        # anomalies.
        if _HOSTNAME_PATTERN.match(dut_history.hostname) is None:
            continue
        if _host_is_working(dut_history):
            if _dut_in_repair_loop(dut_history):
                _report_untestable_dut(dut_history, 'repair_loop')
        elif _host_is_idle(dut_history) and not dut_history.host.locked:
            _report_untestable_dut(dut_history, 'idle_unlocked')
Richard Barnettecf5d8342017-10-24 18:13:11 -07001120
1121
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001122def _log_startup(arguments, startup_time):
1123 """Log the start of this inventory run.
1124
1125 Print various log messages indicating the start of the run. Return
1126 a string based on `startup_time` that will be used to identify this
1127 run in log files and e-mail messages.
1128
1129 @param startup_time A UNIX timestamp marking the moment when
1130 this inventory run began.
1131 @returns A timestamp string that will be used to identify this run
1132 in logs and email output.
1133 """
Alex Zamorzaev8e5317e52018-09-25 00:42:30 +00001134 timestamp = time.strftime('%Y-%m-%d.%H',
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001135 time.localtime(startup_time))
1136 logging.debug('Starting lab inventory for %s', timestamp)
Richard Barnette5de01eb2017-12-15 09:53:42 -08001137 if arguments.model_notify:
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001138 if arguments.recommend:
1139 logging.debug('Will include repair recommendations')
Richard Barnette5de01eb2017-12-15 09:53:42 -08001140 logging.debug('Will include model inventory')
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001141 if arguments.pool_notify:
1142 logging.debug('Will include pool inventory')
1143 return timestamp
1144
1145
def _create_inventory(arguments, end_time):
    """Build the `_LabInventory` instance that the reports will use.

    The inventory covers the models named in `arguments.modelnames`,
    over the `arguments.duration` hours leading up to `end_time`.
    The number of hosts and models found is logged.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The newly created `_LabInventory` object.
    """
    # `arguments.duration` is in hours; timestamps are in seconds.
    search_seconds = arguments.duration * 60 * 60
    start_time = end_time - search_seconds
    inventory = _LabInventory.create_inventory(
            frontend_wrappers.RetryingAFE(server=None),
            start_time, end_time, arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return inventory
1160
1161
def _perform_inventory_reports(arguments):
    """Run every inventory check that the command line asked for.

    Creates the initial inventory, then goes through the reports in a
    fixed order, performing each one whose command-line option is set:
      * `debug`:  gather model counts with interactive feedback.
      * `model_notify`:  the model inventory report.
      * `pool_notify`:  the pool inventory report.
      * `report_untestable`:  metrics for untestable DUTs.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    startup_time = time.time()
    timestamp = _log_startup(arguments, startup_time)
    inventory = _create_inventory(arguments, startup_time)
    # Each option is looked up only when its turn comes, immediately
    # before the corresponding step would run.
    steps = (
        ('debug',
         lambda: _populate_model_counts(inventory)),
        ('model_notify',
         lambda: _perform_model_inventory(arguments, inventory, timestamp)),
        ('pool_notify',
         lambda: _perform_pool_inventory(arguments, inventory, timestamp)),
        ('report_untestable',
         lambda: _report_untestable_dut_metrics(inventory)),
    )
    for option, run_step in steps:
        if getattr(arguments, option):
            run_step()
Richard Barnettecf5d8342017-10-24 18:13:11 -07001182
1183
J. Richard Barnette96db3492015-03-27 17:23:52 -07001184def _separate_email_addresses(address_list):
1185 """Parse a list of comma-separated lists of e-mail addresses.
1186
1187 @param address_list A list of strings containing comma
1188 separate e-mail addresses.
1189 @return A list of the individual e-mail addresses.
J. Richard Barnette96db3492015-03-27 17:23:52 -07001190 """
1191 newlist = []
1192 for arg in address_list:
1193 newlist.extend([email.strip() for email in arg.split(',')])
1194 return newlist
1195
1196
def _verify_arguments(arguments):
    """Validate command-line arguments.

    The `--model-notify` and `--pool-notify` options may each be given
    several times, and each occurrence may hold comma separated
    addresses; flatten each option into a single list of addresses,
    stored back on `arguments`.

    At least one report must be requested (`--model-notify`,
    `--pool-notify`, or `--report-untestable`) unless `--debug` is
    set.  With `--debug` and no report requested, both e-mail reports
    are enabled by making their notify lists non-empty;
    `report_untestable` is left as given.

    The return value indicates success or failure; in the case of
    failure, an error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True

    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001233
1234
def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` entry under the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
1247
1248
def _parse_command(argv):
    """Parse the command line arguments.

    Build the argument parser for this command's syntax, parse the
    command line, and then validate the parsed result with
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return The result of ArgumentParser.parse_args(), or `None` if
            the parsed arguments fail validation.
    """
    cmd_parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = cmd_parser.add_argument

    add_option('-d', '--duration', type=int,
               default=_DEFAULT_DURATION, metavar='HOURS',
               help=('number of hours back to search for status '
                     '(default: %d)' % _DEFAULT_DURATION))
    add_option('--model-notify', action='append',
               default=[], metavar='ADDRESS',
               help=('Generate model inventory message, and send it '
                     'to the given e-mail address(es)'))
    add_option('--pool-notify', action='append',
               default=[], metavar='ADDRESS',
               help=('Generate pool inventory message, and send it '
                     'to the given address(es)'))
    add_option('-r', '--recommend', type=int, default=None,
               help=('Specify how many DUTs should be recommended '
                     'for repair (default: no recommendation)'))
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug', action='store_true',
               help=('Print e-mail, metrics messages on stdout '
                     'without sending them.'))
    # `--no-metrics` clears `use_metrics`, which is otherwise true.
    add_option('--no-metrics', action='store_false',
               dest='use_metrics',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*', metavar='MODEL',
               help=('names of models to report on '
                     '(default: all models)'))

    parsed = cmd_parser.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None
1297
1298
1299def _configure_logging(arguments):
1300 """Configure the `logging` module for our needs.
1301
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001302 How we log depends on whether the `--debug` option was provided on
1303 the command line.
1304 * Without the option, we configure the logging to capture all
1305 potentially relevant events in a log file. The log file is
1306 configured to rotate once a week on Friday evening, preserving
1307 ~3 months worth of history.
1308 * With the option, we expect stdout to contain other
1309 human-readable output (including the contents of the e-mail
Richard Barnettecf5d8342017-10-24 18:13:11 -07001310 messages), so we restrict the output to INFO level.
1311
1312 For convenience, when `--debug` is on, the logging format has
1313 no adornments, so that a call like `logging.info(msg)` simply writes
1314 `msg` to stdout, plus a trailing newline.
J. Richard Barnette96db3492015-03-27 17:23:52 -07001315
1316 @param arguments Command-line arguments as returned by
1317 `ArgumentParser`
J. Richard Barnette96db3492015-03-27 17:23:52 -07001318 """
J. Richard Barnettef6839282015-06-01 16:00:35 -07001319 root_logger = logging.getLogger()
J. Richard Barnette02e82432015-10-13 16:02:47 -07001320 if arguments.debug:
J. Richard Barnettef6839282015-06-01 16:00:35 -07001321 root_logger.setLevel(logging.INFO)
Alex Zamorzaev8e5317e52018-09-25 00:42:30 +00001322 handler = logging.StreamHandler(sys.stdout)
1323 handler.setFormatter(logging.Formatter())
J. Richard Barnette96db3492015-03-27 17:23:52 -07001324 else:
Richard Barnette5af97402016-04-18 11:00:26 -07001325 if not os.path.exists(arguments.logdir):
1326 os.mkdir(arguments.logdir)
Alex Zamorzaev8e5317e52018-09-25 00:42:30 +00001327 root_logger.setLevel(logging.DEBUG)
1328 logfile = os.path.join(arguments.logdir, _LOGFILE)
1329 handler = logging.handlers.TimedRotatingFileHandler(
1330 logfile, when='W4', backupCount=13)
1331 formatter = logging.Formatter(_LOG_FORMAT,
1332 time_utils.TIME_FMT)
1333 handler.setFormatter(formatter)
J. Richard Barnettef6839282015-06-01 16:00:35 -07001334 # TODO(jrbarnette) This is gross. Importing client.bin.utils
1335 # implicitly imported logging_config, which calls
1336 # logging.basicConfig() *at module level*. That gives us an
1337 # extra logging handler that we don't want. So, clear out all
1338 # the handlers here.
1339 for h in root_logger.handlers:
1340 root_logger.removeHandler(h)
1341 root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001342
1343
def main(argv):
    """Standard main routine.

    Parses and validates the command line (exiting with status 1 if
    that fails), configures logging, and runs the requested inventory
    reports.

    Unless `--no-metrics` was given, the reports run inside a ts_mon
    setup that times the run and increments a "tick" counter tagged
    with whether the run succeeded; the metrics are flushed whether or
    not the reports raise.  In debug mode the metrics setup is given
    '/dev/null' as its debug file.

    A `KeyboardInterrupt` ends the run quietly.  Any other exception
    is logged and then re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if not arguments.use_metrics:
            _perform_inventory_reports(arguments)
        else:
            if arguments.debug:
                logging.info(
                        'Debug mode: Will not report metrics to monarch.')
            debug_file = '/dev/null' if arguments.debug else None
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=debug_file,
                    auto_flush=False):
                succeeded = False
                try:
                    duration_metric = '%s/duration' % _METRICS_PREFIX
                    with metrics.SecondsTimer(duration_metric):
                        _perform_inventory_reports(arguments)
                    succeeded = True
                finally:
                    tick_metric = '%s/tick' % _METRICS_PREFIX
                    metrics.Counter(tick_metric).increment(
                            fields={'success': succeeded})
                    metrics.Flush()
    except KeyboardInterrupt:
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise
J. Richard Barnette96db3492015-03-27 17:23:52 -07001381
1382
def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The result of `_LabInventory.create_inventory()` for the
            interval ending now and starting 24 hours earlier.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(
            afe, now - seconds_per_day, now)
1387
1388
def get_managed_boards(afe):
    """Return the boards reported by a fresh 24-hour lab inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return Whatever `get_boards()` returns for that inventory.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001391
1392
# Run the command only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)