blob: 79fbb5becbcd58ba72e2fd66add3b9d28cbef646 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
Richard Barnette5de01eb2017-12-15 09:53:42 -08009model and pool, and determines whether each DUT is working or
J. Richard Barnette96db3492015-03-27 17:23:52 -070010broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
Richard Barnette5de01eb2017-12-15 09:53:42 -080013usage: lab_inventory.py [ options ] [ model ... ]
J. Richard Barnette96db3492015-03-27 17:23:52 -070014
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
Richard Barnette5de01eb2017-12-15 09:53:42 -080020--model-notify <address>[,<address>]
21 Send the "model status" e-mail to all the specified e-mail
J. Richard Barnette96db3492015-03-27 17:23:52 -070022 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
J. Richard Barnette1df6a562015-06-09 10:06:17 -070028--recommend <number>
Richard Barnette5de01eb2017-12-15 09:53:42 -080029 When generating the "model status" e-mail, include a list of
J. Richard Barnette1df6a562015-06-09 10:06:17 -070030 <number> specific DUTs to be recommended for repair.
31
Richard Barnette1ca30e62018-04-09 16:45:58 -070032--report-untestable
33 Scan the inventory for DUTs that can't test because they're stuck in
34 repair loops, or because the scheduler can't give them work.
Richard Barnettecf5d8342017-10-24 18:13:11 -070035
J. Richard Barnette96db3492015-03-27 17:23:52 -070036--logdir <directory>
37 Log progress and actions in a file under this directory. Text
38 of any e-mail sent will also be logged in a timestamped file in
39 this directory.
40
J. Richard Barnette02e82432015-10-13 16:02:47 -070041--debug
Richard Barnettecf5d8342017-10-24 18:13:11 -070042 Suppress all logging, metrics reporting, and sending e-mail.
43 Instead, write the output that would be generated onto stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -070044
Richard Barnette5de01eb2017-12-15 09:53:42 -080045<model> arguments:
46 With no arguments, gathers the status for all models in the lab.
47 With one or more named models on the command line, restricts
48 reporting to just those models.
J. Richard Barnette96db3492015-03-27 17:23:52 -070049"""
50
51
52import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080053import collections
J. Richard Barnette96db3492015-03-27 17:23:52 -070054import logging
55import logging.handlers
56import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070057import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070058import sys
59import time
60
61import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070062from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070063from autotest_lib.client.common_lib import time_utils
Xixuan Wu93e646c2017-12-07 18:36:10 -080064from autotest_lib.server import constants
Richard Barnettecf5d8342017-10-24 18:13:11 -070065from autotest_lib.server import site_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070066from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070067from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070068from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070069from autotest_lib.site_utils import gmail_lib
Richard Barnettecf5d8342017-10-24 18:13:11 -070070from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070071
72
# Pool names, taken from the shared `constants.Pools` definitions.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS:
#     Labels that disqualify a DUT from monitoring by this script.
#     The current exclusions are:
#       + 'adb' - We're not ready to monitor Android or Brillo hosts.
#       + 'board:guado_moblab' - These hosts are maintained by a
#         separate process that doesn't use this script.
#       + 'board:scarlet' - Excluded due to crbug.com/846012 and other
#         issues discussed at
#         https://bugs.chromium.org/p/chromium/issues/detail?id=861806#c2
#       + 'board:veyron_rialto' - Excluded due to crbug.com/854404

_EXCLUDED_LABELS = {
    'adb',
    'board:guado_moblab',
    'board:scarlet',
    'board:veyron_rialto',
}

# _DEFAULT_DURATION:
#     Value used for the --duration command line option when none is
#     given.  Determines how far back in time to search in order to
#     decide DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used to calculate the default setting of the
#     --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of the file that receives general log information.
# _LOG_FORMAT:
#     Format string applied to log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# _HOSTNAME_PATTERN:
#     Pattern for location-based host names in the Chrome OS test
#     labs.  A DUT's hostname designates the DUT's location:
#       * A lab (room) that's physically separated from other labs
#         (i.e. there's a door).
#       * A row (or aisle) of DUTs within the lab.
#       * A vertical rack of shelves on the row.
#       * A specific host on one shelf of the rack.

_HOSTNAME_PATTERN = re.compile(
    r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#     How many repeated Repair tasks must be seen before declaring
#     that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4

# _METRICS_PREFIX:
#     Common prefix for the names of metrics reported by this script.
# _UNTESTABLE_PRESENCE_METRIC:
#     Boolean metric describing DUTs that cannot be scheduled for
#     testing.

_METRICS_PREFIX = 'chromeos/autotest/inventory'
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    '%s/untestable' % _METRICS_PREFIX,
    'DUTs that cannot be scheduled for testing')
132
133
def _host_is_working(history):
    """Return whether the last diagnosis for `history` is "working".

    @param history  Object providing `last_diagnosis()`; the first
                    element of its result is compared against
                    `status_history.WORKING`.
    """
    diagnosis = history.last_diagnosis()
    return diagnosis[0] == status_history.WORKING
136
137
def _host_is_broken(history):
    """Return whether the last diagnosis for `history` is "broken".

    @param history  Object providing `last_diagnosis()`; the first
                    element of its result is compared against
                    `status_history.BROKEN`.
    """
    diagnosis = history.last_diagnosis()
    return diagnosis[0] == status_history.BROKEN
140
141
def _host_is_idle(history):
    """Return whether the last diagnosis for `history` counts as idle.

    A DUT is treated as idle when the first element of
    `history.last_diagnosis()` is either `status_history.UNUSED` or
    `status_history.UNKNOWN`.

    @param history  Object providing `last_diagnosis()`.
    """
    return history.last_diagnosis()[0] in {
        status_history.UNUSED,
        status_history.UNKNOWN,
    }
145
146
Richard Barnette5de01eb2017-12-15 09:53:42 -0800147class _HostSetInventory(object):
148 """Maintains a set of related `HostJobHistory` objects.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700149
Richard Barnettee8eee312018-04-27 13:12:04 -0400150 Current usage of this class is that all DUTs are part of a single
151 scheduling pool of DUTs for a single model; however, this class make
152 no assumptions about the actual relationship among the DUTs.
153
Richard Barnette5de01eb2017-12-15 09:53:42 -0800154 The collection is segregated into disjoint categories of "working",
155 "broken", and "idle" DUTs. Accessor methods allow finding both the
156 list of DUTs in each category, as well as counts of each category.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700157
J. Richard Barnettef6839282015-06-01 16:00:35 -0700158 Performance note: Certain methods in this class are potentially
159 expensive:
160 * `get_working()`
161 * `get_working_list()`
162 * `get_broken()`
163 * `get_broken_list()`
xixuan12ce04f2016-03-10 13:16:30 -0800164 * `get_idle()`
165 * `get_idle_list()`
J. Richard Barnettef6839282015-06-01 16:00:35 -0700166 The first time any one of these methods is called, it causes
167 multiple RPC calls with a relatively expensive set of database
168 queries. However, the results of the queries are cached in the
169 individual `HostJobHistory` objects, so only the first call
170 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700171
xixuan12ce04f2016-03-10 13:16:30 -0800172 Additionally, `get_working_list()`, `get_broken_list()` and
173 `get_idle_list()` cache their return values to avoid recalculating
Richard Barnette5de01eb2017-12-15 09:53:42 -0800174 lists at every call; this caching is separate from the caching of
175 RPC results described above.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700176
Richard Barnette5de01eb2017-12-15 09:53:42 -0800177 This class is deliberately constructed to delay the RPC cost until
178 the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700179 `record_host()`) so that it's possible to construct a complete
180 `_LabInventory` without making the expensive queries at creation
Richard Barnette5de01eb2017-12-15 09:53:42 -0800181 time. `_populate_model_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700182 """
183
184 def __init__(self):
185 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700186 self._working_list = None
187 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800188 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700189
J. Richard Barnette96db3492015-03-27 17:23:52 -0700190 def record_host(self, host_history):
191 """Add one `HostJobHistory` object to the collection.
192
193 @param host_history The `HostJobHistory` object to be
194 remembered.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700195 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700196 self._working_list = None
197 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800198 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700199 self._histories.append(host_history)
200
J. Richard Barnettef6839282015-06-01 16:00:35 -0700201 def get_working_list(self):
202 """Return a list of all working DUTs in the pool.
203
Richard Barnettee8eee312018-04-27 13:12:04 -0400204 Filter `self._histories` for histories where the DUT is
205 diagnosed as working.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700206
207 Cache the result so that we only cacluate it once.
208
209 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700210 """
211 if self._working_list is None:
212 self._working_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400213 if _host_is_working(h)]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700214 return self._working_list
215
J. Richard Barnette96db3492015-03-27 17:23:52 -0700216 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700217 """Return the number of working DUTs in the pool."""
218 return len(self.get_working_list())
219
J. Richard Barnettef6839282015-06-01 16:00:35 -0700220 def get_broken_list(self):
221 """Return a list of all broken DUTs in the pool.
222
Richard Barnettee8eee312018-04-27 13:12:04 -0400223 Filter `self._histories` for histories where the DUT is
224 diagnosed as broken.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700225
226 Cache the result so that we only cacluate it once.
227
228 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700229 """
230 if self._broken_list is None:
231 self._broken_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400232 if _host_is_broken(h)]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700233 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700234
J. Richard Barnette96db3492015-03-27 17:23:52 -0700235 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700236 """Return the number of broken DUTs in the pool."""
237 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700238
xixuan12ce04f2016-03-10 13:16:30 -0800239 def get_idle_list(self):
240 """Return a list of all idle DUTs in the pool.
241
Richard Barnettee8eee312018-04-27 13:12:04 -0400242 Filter `self._histories` for histories where the DUT is
243 diagnosed as idle.
xixuan12ce04f2016-03-10 13:16:30 -0800244
245 Cache the result so that we only cacluate it once.
246
247 @return A list of HostJobHistory objects.
xixuan12ce04f2016-03-10 13:16:30 -0800248 """
xixuan12ce04f2016-03-10 13:16:30 -0800249 if self._idle_list is None:
250 self._idle_list = [h for h in self._histories
Richard Barnettee8eee312018-04-27 13:12:04 -0400251 if _host_is_idle(h)]
xixuan12ce04f2016-03-10 13:16:30 -0800252 return self._idle_list
253
xixuan12ce04f2016-03-10 13:16:30 -0800254 def get_idle(self):
255 """Return the number of idle DUTs in the pool."""
256 return len(self.get_idle_list())
257
J. Richard Barnette96db3492015-03-27 17:23:52 -0700258 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700259 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700260 return len(self._histories)
261
Richard Barnettee8eee312018-04-27 13:12:04 -0400262 def get_all_histories(self):
263 return self._histories
264
J. Richard Barnette96db3492015-03-27 17:23:52 -0700265
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty inventory covering `pools`.

        @param pools  Iterable of pool names; one `_HostSetInventory`
                      is created for each.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a pool that was
        not passed to the constructor raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return the combined list across all
                              pools.
        """
        if pool is None:
            combined = []
            for host_set in self._histories_by_pool.values():
                combined.extend(get_pool_list(host_set))
            return combined
        return get_pool_list(self._histories_by_pool[pool])

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through the HostJobHistory objects of the selected pool(s),
        selecting all DUTs identified as working.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through the HostJobHistory objects of the selected pool(s),
        selecting all DUTs identified as broken.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through the HostJobHistory objects of the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool holding the spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Iterate over every `HostJobHistory` in the selected pool(s).

        This is a generator; nothing (including the lookup of `pool`)
        happens until iteration starts.

        @param pool  The pool to be traversed.  If `None`, traverse
                     all pools.
        """
        if pool is None:
            host_sets = self._histories_by_pool.values()
        else:
            host_sets = [self._histories_by_pool[pool]]
        for host_set in host_sets:
            for h in host_set.get_all_histories():
                yield h
419
J. Richard Barnette96db3492015-03-27 17:23:52 -0700420
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of its labels may appear in
    the `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.
    """
    # A DUT lacking an existing, unique 'model' or 'pool' label isn't
    # meant to exist in the managed inventory; finding one generally
    # indicates an error in the database.  Unfortunately, such errors
    # have been seen to occur from time to time.
    #
    # The _LabInventory constructor requires hosts to conform to the
    # label restrictions, and may fail if they don't.  Failing a whole
    # inventory run because of one bad entry is the wrong thing, so
    # the problem children are filtered out here to keep them out of
    # the inventory.
    def labels_with(prefix):
        return [name for name in afehost.labels
                if name.startswith(prefix)]

    model_labels = labels_with(constants.Labels.MODEL_PREFIX)
    pool_labels = labels_with(constants.Labels.POOL_PREFIX)
    if _EXCLUDED_LABELS.intersection(afehost.labels):
        return False
    return len(model_labels) == 1 and len(pool_labels) == 1
446
447
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.

    NOTE(review): `collections.Mapping` is the Python 2 spelling of
    the ABC; on Python 3 it lives in `collections.abc`.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.
        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(h for h in afehosts
                                  if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)

    def __init__(self, histories, pools):
        """Build the inventory from already-created histories.

        @param histories  Sequence of `HostJobHistory` objects.  It is
                          traversed more than once and `len()` is
                          applied to it, so a one-shot iterator won't
                          do.
        @param pools      Pool names passed to each model's
                          `_PoolSetInventory`.
        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}

    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)

    def __len__(self):
        return self._modeldata.__len__()

    def __iter__(self):
        return self._modeldata.__iter__()

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)

    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        # `items()` is provided by the Mapping ABC on both Python 2
        # and Python 3; `iteritems()` exists on Python 2 only.
        return {m for m, h in self.items() if h.get_total(pool)}

    def get_boards(self):
        """Return the set of `host_board` values seen in the inventory."""
        return self._boards
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800527
528
def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Yields the `(model, poolset)` pairs of `inventory.items()`,
    filtered to include only reportable models.  A model is
    reportable if it has DUTs in both `spare_pool` and at least one
    other pool.

    @param inventory  Mapping from model name to an object providing
                      `get_total(pool=None)`.
    @param spare_pool The spare pool to be tested for reporting.
    """
    # `items()` rather than `iteritems()`:  the latter exists only on
    # Python 2 mappings.
    for model, poolset in inventory.items():
        spares = poolset.get_total(spare_pool)
        total = poolset.get_total()
        # No spares at all, or nothing but spares, means there's
        # nothing to report for this model.
        if spares != 0 and spares != total:
            yield model, poolset
543
544
def _all_dut_histories(inventory):
    """Iterate over every DUT history in `inventory`.

    @param inventory  Mapping whose values provide
                      `get_all_histories()`.
    """
    # `values()` rather than `itervalues()`:  the latter exists only
    # on Python 2 mappings.
    for poolset in inventory.values():
        for h in poolset.get_all_histories():
            yield h
549
550
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are left out of the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct regardless of how large the individual numbers are.

    @param inventory_list  List of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location only; DUTs with identical locations
        # keep their original relative order.
        dut_list.sort(key=lambda entry: entry[0])
        return_list.append([history for _, history in dut_list])
    return return_list
585
586
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
                          Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    @raises IndexError if `buffer_counts` is empty.
    """
    _NMODELS = 1000
    # Count the proposed repairs per model.  Models that appear only
    # in `repair_list` don't matter; only the models in
    # `buffer_counts` contribute to the score.
    repairs_by_model = collections.Counter(
        h.host_model for h in repair_list)
    # The new counts discard the model names, as they don't
    # contribute to the final score.
    new_counts = [count + repairs_by_model[model]
                  for model, count in buffer_counts.items()]
    if not new_counts:
        # Kept as IndexError so the failure mode for an empty
        # `buffer_counts` is the same as it has always been.
        raise IndexError('buffer_counts must not be empty')
    # Find the worst available spares count, and count how many
    # times that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst
J. Richard Barnettef6839282015-06-01 16:00:35 -0700638
639
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    If no broken DUTs are found, the returned message contains only the
    heading lines.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start:end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide a window of `num_recommend` DUTs across the lab's
        # list, keeping the best-scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start:end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    # `recommendation` is still None when no lab had any broken DUTs;
    # iterate over an empty list in that case rather than failing.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_model,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
710
711
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail   Bad  Idle  Good  Spare Total
        #      e[0]   e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only models with at least one broken DUT get a detail line.
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Lowest spares buffer first; ties broken by most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # No DUTs were counted; report zeros instead of dividing by
        # zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
780
781
# Introductory text placed at the top of the "pool inventory" e-mail
# built by `_generate_pool_inventory_message()`.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
790
791
def _generate_pool_inventory_message(inventory):
    """Build the body of the "pool inventory" e-mail.

    For every pool in `CRITICAL_POOLS`, the message lists each model
    that has at least one broken or idle DUT in that pool, along with
    its broken, idle, working, and total counts.  Rows are ordered
    with the largest number of broken DUTs first.  A pool where no
    model qualifies gets a one-line "full strength" note instead.

    N.B. For sample output text as users can expect to see it in
    e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    report = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is preceded by a blank
        # line.
        separator = '\n' if index else ''
        report.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        report.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Models at full strength are not reported.
            if broken or idle:
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            report.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        report.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
    return '\n'.join(report)
834
835
# Introductory text placed at the top of the "idle inventory" message
# built by `_generate_idle_inventory_message()`.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
842
843
def _generate_idle_inventory_message(inventory):
    """Build the body of the "idle inventory" e-mail.

    The message lists every idle host found in any pool named in
    `MANAGED_POOLS`, one host per line, together with the host's
    model and pool.

    N.B. For sample output text as users can expect to see it in
    e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    report = [
        _IDLE_INVENTORY_HEADER,
        'Idle Host List:',
        '%-30s %-20s %s' % ('Hostname', 'Model', 'Pool'),
    ]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        report.append('(No idle DUTs)')
    else:
        report.extend('%-30s %-20s %s' % row for row in idle_rows)
    return '\n'.join(report)
872
873
J. Richard Barnette96db3492015-03-27 17:23:52 -0700874def _send_email(arguments, tag, subject, recipients, body):
875 """Send an inventory e-mail message.
876
Richard Barnette5de01eb2017-12-15 09:53:42 -0800877 The message is logged in the selected log directory using `tag` for
878 the file name.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700879
Richard Barnette5de01eb2017-12-15 09:53:42 -0800880 If the --debug option was requested, the message is neither logged
881 nor sent, but merely printed on stdout.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700882
883 @param arguments Parsed command-line options.
884 @param tag Tag identifying the inventory for logging
885 purposes.
886 @param subject E-mail Subject: header line.
887 @param recipients E-mail addresses for the To: header line.
888 @param body E-mail message body.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700889 """
890 logging.debug('Generating email: "%s"', subject)
891 all_recipients = ', '.join(recipients)
892 report_body = '\n'.join([
893 'To: %s' % all_recipients,
894 'Subject: %s' % subject,
895 '', body, ''])
J. Richard Barnette02e82432015-10-13 16:02:47 -0700896 if arguments.debug:
J. Richard Barnette96db3492015-03-27 17:23:52 -0700897 print report_body
898 else:
899 filename = os.path.join(arguments.logdir, tag)
900 try:
901 report_file = open(filename, 'w')
902 report_file.write(report_body)
903 report_file.close()
904 except EnvironmentError as e:
905 logging.error('Failed to write %s: %s', filename, e)
906 try:
907 gmail_lib.send_email(all_recipients, subject, body)
908 except Exception as e:
909 logging.error('Failed to send e-mail to %s: %s',
910 all_recipients, e)
911
912
def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally that cost is paid by querying as we go.  With the
    `--debug` option, a human being is expected to be watching in real
    time, so the first (expensive) queries are forced to happen up
    front, with simple ASCII output on sys.stdout showing a progress
    bar and the number of broken DUTs found.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for position, counts in enumerate(inventory.itervalues(), 1):
        # Progress bar:  '.' per model, '+' at 5, 15, 25, ..., and the
        # tens digit at every multiple of 10.
        remainder = position % 10
        if remainder == 5:
            marker = '+'
        elif remainder == 0:
            marker = '%d' % ((position / 10) % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)
945
946
def _perform_model_inventory(arguments, inventory, timestamp):
    """Generate and send the model inventory report.

    The report is made of two parts, in this order:
      * An optional list of DUTs recommended for repair, included
        only when the `--recommend` option is present.
      * The model inventory message for the given inventory.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        # Three newlines separate the recommendations from the
        # inventory that follows.
        sections.append(
            _generate_repair_recommendation(
                inventory, arguments.recommend) + '\n\n\n')
    sections.append(_generate_model_inventory_message(inventory))
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                ''.join(sections))
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700974
975
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Generate and send the pool inventory report.

    The report is made of two parts, in this order:
      * The pool inventory message for the given inventory.
      * The idle inventory message for the given inventory.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                '\n\n\n'.join(sections))
998
999
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns Return a true value if the DUT's most recent history
             indicates a repair loop.
    """
    # The caller hands us only histories for working DUTs, so the cost
    # of fetching the diagnosis task is already paid, and that task is
    # known to have succeeded.  The diagnosis task is among the tasks
    # scanned below, so if it isn't a repair task, the history holds a
    # successful non-repair task, and the DUT isn't looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        # A test ran, or a repair failed:  either way, not looping.
        if not task.is_special or task.diagnosis == status_history.BROKEN:
            return False
        if task.name != 'Repair':
            if task.diagnosis == status_history.WORKING:
                # Non-repair task succeeded, so we're not looping.
                return False
            # A failed non-repair task says nothing; keep scanning.
            continue
        # A repair task that did not fail.
        successful_repairs += 1
        if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
            return True
1044
1045
def _report_untestable_dut(history, state):
    """Log an untestable DUT and record it in the presence metric.

    @param history  History object for the DUT; its `hostname`,
                    `host_model`, and `host_pool` attributes are
                    reported.
    @param state    String naming the untestable state to report.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    logging.info('DUT in state %(state)s: %(dut_hostname)s, '
                 'model: %(model)s, pool: %(pool)s', fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001056
Richard Barnettecf5d8342017-10-24 18:13:11 -07001057
def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is stuck in a repair loop; that is, it regularly passes
        repair, but never passes other operations.
      * The DUT runs no tasks at all, but is not locked.

    This routine walks through the given inventory looking for DUTs in
    either of these states.  Results are reported via a Monarch presence
    metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match _HOSTNAME_PATTERN
        # shouldn't be possible.  However, we don't want arbitrary
        # strings being attached to the 'dut_hostname' field, so for
        # safety, we exclude all anomalies.
        if _HOSTNAME_PATTERN.match(history.hostname) is None:
            continue
        if _host_is_working(history):
            # Working DUTs are only untestable when looping on repair.
            if _dut_in_repair_loop(history):
                _report_untestable_dut(history, 'repair_loop')
        elif _host_is_idle(history) and not history.host.locked:
            _report_untestable_dut(history, 'idle_unlocked')
Richard Barnettecf5d8342017-10-24 18:13:11 -07001093
1094
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001095def _log_startup(arguments, startup_time):
1096 """Log the start of this inventory run.
1097
1098 Print various log messages indicating the start of the run. Return
1099 a string based on `startup_time` that will be used to identify this
1100 run in log files and e-mail messages.
1101
1102 @param startup_time A UNIX timestamp marking the moment when
1103 this inventory run began.
1104 @returns A timestamp string that will be used to identify this run
1105 in logs and email output.
1106 """
1107 timestamp = time.strftime('%Y-%m-%d.%H',
1108 time.localtime(startup_time))
1109 logging.debug('Starting lab inventory for %s', timestamp)
Richard Barnette5de01eb2017-12-15 09:53:42 -08001110 if arguments.model_notify:
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001111 if arguments.recommend:
1112 logging.debug('Will include repair recommendations')
Richard Barnette5de01eb2017-12-15 09:53:42 -08001113 logging.debug('Will include model inventory')
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001114 if arguments.pool_notify:
1115 logging.debug('Will include pool inventory')
1116 return timestamp
1117
1118
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours leading up to
    `end_time`, for the models named in `arguments.modelnames`.

    @param arguments  Parsed command-line options.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The newly created `_LabInventory` object.
    """
    # `--duration` is given in hours; timestamps are in seconds.
    seconds_back = arguments.duration * 60 * 60
    start_time = end_time - seconds_back
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return inventory
1133
1134
def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Build the inventory once, then run each report that the parsed
    command-line arguments call for:  the model report, the pool
    report, and the untestable-DUT scan.  With `--debug`, the model
    counts are gathered up front with progress output before any
    report runs.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    now = time.time()
    run_id = _log_startup(arguments, now)
    lab_inventory = _create_inventory(arguments, now)
    if arguments.debug:
        _populate_model_counts(lab_inventory)
    if arguments.model_notify:
        _perform_model_inventory(arguments, lab_inventory, run_id)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab_inventory, run_id)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab_inventory)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001155
1156
J. Richard Barnette96db3492015-03-27 17:23:52 -07001157def _separate_email_addresses(address_list):
1158 """Parse a list of comma-separated lists of e-mail addresses.
1159
1160 @param address_list A list of strings containing comma
1161 separate e-mail addresses.
1162 @return A list of the individual e-mail addresses.
J. Richard Barnette96db3492015-03-27 17:23:52 -07001163 """
1164 newlist = []
1165 for arg in address_list:
1166 newlist.extend([email.strip() for email in arg.split(',')])
1167 return newlist
1168
1169
def _verify_arguments(arguments):
    """Validate command-line arguments.

    The comma separated e-mail addresses given to `--model-notify` and
    `--pool-notify` are first flattened into single lists, which
    replace the original option values.

    For non-debug uses, at least one inventory report must be
    requested.  For debug, requesting no report is treated as "run
    all the reports."

    The return value indicates success or failure; in the case of
    failure, an error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    requested = (arguments.model_notify,
                 arguments.pool_notify,
                 arguments.report_untestable)
    if any(requested):
        return True
    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001206
1207
def _get_default_logdir(script):
    """Compute the default value for the `--logdir` option.

    The result is the `_LOGDIR` entry located in the parent of the
    directory that holds the script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
1220
1221
def _parse_command(argv):
    """Build the argument parser and parse the command line.

    After parsing, the result is checked with `_verify_arguments()`;
    if that check fails, no arguments are returned.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if the parsed arguments fail verification.
    """
    duration_help = (
        'number of hours back to search for status (default: %d)'
        % _DEFAULT_DURATION)

    parser = argparse.ArgumentParser(
        prog=argv[0],
        description='Gather and report lab inventory statistics')
    add_option = parser.add_argument

    add_option('-d', '--duration',
               type=int,
               default=_DEFAULT_DURATION,
               metavar='HOURS',
               help=duration_help)
    add_option('--model-notify',
               action='append',
               default=[],
               metavar='ADDRESS',
               help='Generate model inventory message, and send it '
                    'to the given e-mail address(es)')
    add_option('--pool-notify',
               action='append',
               default=[],
               metavar='ADDRESS',
               help='Generate pool inventory message, and send it '
                    'to the given address(es)')
    add_option('-r', '--recommend',
               type=int,
               default=None,
               help='Specify how many DUTs should be recommended '
                    'for repair (default: no recommendation)')
    add_option('--report-untestable',
               action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug',
               action='store_true',
               help='Print e-mail, metrics messages on stdout '
                    'without sending them.')
    add_option('--logdir',
               default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames',
               nargs='*',
               metavar='MODEL',
               help='names of models to report on (default: all models)')

    parsed = parser.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None
1267
1268
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening, preserving
        ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the root logger has exactly one handler:  the one
    created here.  Any handlers installed earlier are removed.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        # 'W4' rotates weekly on Friday; 13 weekly backups is about
        # three months of history.
        handler = logging.handlers.TimedRotatingFileHandler(
            logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing from the list while
    # iterating over it directly skips every other handler.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001312
1313
def main(argv):
    """Standard main routine.

    Parses the command line, sets up logging, and then runs the
    inventory reports inside a ts_mon metrics context.  The run
    duration is timed, and a "tick" counter is incremented with a
    `success` field whether or not the reports completed.

    Exits with status 1 if the command line cannot be used.  A
    `KeyboardInterrupt` ends the run quietly; any other `Exception`
    is logged and then re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    metrics_file = None
    if arguments.debug:
        logging.info('--debug mode: Will not report metrics to monarch')
        metrics_file = '/dev/null'

    with site_utils.SetupTsMonGlobalState('lab_inventory',
                                          debug_file=metrics_file,
                                          auto_flush=False):
        success = False
        try:
            with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                _perform_inventory_reports(arguments)
        except KeyboardInterrupt:
            pass
        except Exception:
            # Our cron setup doesn't preserve stderr, so drop extra
            # breadcrumbs.
            logging.exception('Error escaped main')
            raise
        else:
            success = True
        finally:
            tick = metrics.Counter('%s/tick' % _METRICS_PREFIX)
            tick.increment(fields={'success': success})
            metrics.Flush()
J. Richard Barnette96db3492015-03-27 17:23:52 -07001348
1349
def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    The time window ends at the current time, truncated to whole
    seconds, and starts exactly one day earlier.

    @param afe  Passed through as the first argument to
                `_LabInventory.create_inventory()`.
    @return The value returned by `_LabInventory.create_inventory()`.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - seconds_per_day, now)
1354
1355
def get_managed_boards(afe):
    """Return the boards reported by a freshly created inventory.

    @param afe  Passed through to `get_inventory()`.
    @return The value returned by the inventory's `get_boards()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001358
1359
# Run the command only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main(sys.argv)