blob: 58b50b9ffc670b2102e771df40266e8edc190b62 [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Create e-mail reports of the Lab's DUT inventory.

Gathers a list of all DUTs of interest in the Lab, segregated by
model and pool, and determines whether each DUT is working or
broken. Then, send one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.

usage: lab_inventory.py [ options ] [ model ... ]

Options:
--duration / -d <hours>
    How far back in time to search job history to determine DUT
    status.

--model-notify <address>[,<address>]
    Send the "model status" e-mail to all the specified e-mail
    addresses.

--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend <number>
    When generating the "model status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.

--report-untestable
    Scan the inventory for DUTs that can't test because they're stuck in
    repair loops, or because the scheduler can't give them work.

--logdir <directory>
    Log progress and actions in a file under this directory. Text
    of any e-mail sent will also be logged in a timestamped file in
    this directory.

--debug
    Suppress all logging, metrics reporting, and sending e-mail.
    Instead, write the output that would be generated onto stdout.

<model> arguments:
    With no arguments, gathers the status for all models in the lab.
    With one or more named models on the command line, restricts
    reporting to just those models.
"""
50
51
52import argparse
Prathmesh Prabhu021e7842017-11-08 18:05:45 -080053import collections
J. Richard Barnette96db3492015-03-27 17:23:52 -070054import logging
55import logging.handlers
56import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070057import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070058import sys
59import time
60
61import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070062from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070063from autotest_lib.client.common_lib import time_utils
Xixuan Wu93e646c2017-12-07 18:36:10 -080064from autotest_lib.server import constants
Richard Barnettecf5d8342017-10-24 18:13:11 -070065from autotest_lib.server import site_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070066from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070067from autotest_lib.server.hosts import servo_host
Aviv Keshet7ee95862016-08-30 15:18:27 -070068from autotest_lib.server.lib import status_history
J. Richard Barnette96db3492015-03-27 17:23:52 -070069from autotest_lib.site_utils import gmail_lib
Richard Barnettecf5d8342017-10-24 18:13:11 -070070from chromite.lib import metrics
J. Richard Barnette96db3492015-03-27 17:23:52 -070071
72
# Local aliases for the pool definitions in `constants.Pools`.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time (in hours) to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack, and host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#     The number of repeated Repair tasks that must be seen to declare
#     that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4


# Boolean metric used to report DUTs that cannot be scheduled for
# testing.
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
        'chromeos/autotest/inventory/untestable',
        'DUTs that cannot be scheduled for testing')
127
128
Richard Barnette5de01eb2017-12-15 09:53:42 -0800129class _HostSetInventory(object):
130 """Maintains a set of related `HostJobHistory` objects.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700131
Richard Barnette5de01eb2017-12-15 09:53:42 -0800132 The collection is segregated into disjoint categories of "working",
133 "broken", and "idle" DUTs. Accessor methods allow finding both the
134 list of DUTs in each category, as well as counts of each category.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700135
J. Richard Barnettef6839282015-06-01 16:00:35 -0700136 Performance note: Certain methods in this class are potentially
137 expensive:
138 * `get_working()`
139 * `get_working_list()`
140 * `get_broken()`
141 * `get_broken_list()`
xixuan12ce04f2016-03-10 13:16:30 -0800142 * `get_idle()`
143 * `get_idle_list()`
J. Richard Barnettef6839282015-06-01 16:00:35 -0700144 The first time any one of these methods is called, it causes
145 multiple RPC calls with a relatively expensive set of database
146 queries. However, the results of the queries are cached in the
147 individual `HostJobHistory` objects, so only the first call
148 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700149
xixuan12ce04f2016-03-10 13:16:30 -0800150 Additionally, `get_working_list()`, `get_broken_list()` and
151 `get_idle_list()` cache their return values to avoid recalculating
Richard Barnette5de01eb2017-12-15 09:53:42 -0800152 lists at every call; this caching is separate from the caching of
153 RPC results described above.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700154
Richard Barnette5de01eb2017-12-15 09:53:42 -0800155 This class is deliberately constructed to delay the RPC cost until
156 the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700157 `record_host()`) so that it's possible to construct a complete
158 `_LabInventory` without making the expensive queries at creation
Richard Barnette5de01eb2017-12-15 09:53:42 -0800159 time. `_populate_model_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700160
Richard Barnette5de01eb2017-12-15 09:53:42 -0800161 Current usage of this class is that all DUTs are part of a single
162 scheduling pool of DUTs; however, this class make no assumptions
163 about the actual relationship among the DUTs.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700164 """
165
166 def __init__(self):
167 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700168 self._working_list = None
169 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800170 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700171
J. Richard Barnette96db3492015-03-27 17:23:52 -0700172 def record_host(self, host_history):
173 """Add one `HostJobHistory` object to the collection.
174
175 @param host_history The `HostJobHistory` object to be
176 remembered.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700177 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700178 self._working_list = None
179 self._broken_list = None
xixuan12ce04f2016-03-10 13:16:30 -0800180 self._idle_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700181 self._histories.append(host_history)
182
J. Richard Barnettef6839282015-06-01 16:00:35 -0700183 def get_working_list(self):
184 """Return a list of all working DUTs in the pool.
185
186 Filter `self._histories` for histories where the last
187 diagnosis is `WORKING`.
188
189 Cache the result so that we only cacluate it once.
190
191 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700192 """
193 if self._working_list is None:
194 self._working_list = [h for h in self._histories
195 if h.last_diagnosis()[0] == status_history.WORKING]
196 return self._working_list
197
J. Richard Barnette96db3492015-03-27 17:23:52 -0700198 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700199 """Return the number of working DUTs in the pool."""
200 return len(self.get_working_list())
201
J. Richard Barnettef6839282015-06-01 16:00:35 -0700202 def get_broken_list(self):
203 """Return a list of all broken DUTs in the pool.
204
205 Filter `self._histories` for histories where the last
xixuan12ce04f2016-03-10 13:16:30 -0800206 diagnosis is `BROKEN`.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700207
208 Cache the result so that we only cacluate it once.
209
210 @return A list of HostJobHistory objects.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700211 """
212 if self._broken_list is None:
213 self._broken_list = [h for h in self._histories
xixuan12ce04f2016-03-10 13:16:30 -0800214 if h.last_diagnosis()[0] == status_history.BROKEN]
J. Richard Barnettef6839282015-06-01 16:00:35 -0700215 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700216
J. Richard Barnette96db3492015-03-27 17:23:52 -0700217 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700218 """Return the number of broken DUTs in the pool."""
219 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700220
xixuan12ce04f2016-03-10 13:16:30 -0800221 def get_idle_list(self):
222 """Return a list of all idle DUTs in the pool.
223
224 Filter `self._histories` for histories where the last
225 diagnosis is `UNUSED` or `UNKNOWN`.
226
227 Cache the result so that we only cacluate it once.
228
229 @return A list of HostJobHistory objects.
xixuan12ce04f2016-03-10 13:16:30 -0800230 """
Richard Barnette5de01eb2017-12-15 09:53:42 -0800231 idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
xixuan12ce04f2016-03-10 13:16:30 -0800232 if self._idle_list is None:
233 self._idle_list = [h for h in self._histories
Richard Barnette5de01eb2017-12-15 09:53:42 -0800234 if h.last_diagnosis()[0] in idle_statuses]
xixuan12ce04f2016-03-10 13:16:30 -0800235 return self._idle_list
236
xixuan12ce04f2016-03-10 13:16:30 -0800237 def get_idle(self):
238 """Return the number of idle DUTs in the pool."""
239 return len(self.get_idle_list())
240
J. Richard Barnette96db3492015-03-27 17:23:52 -0700241 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700242 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700243 return len(self._histories)
244
245
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty inventory covering the given pools.

        @param pools  Iterable of pool names; one `_HostSetInventory`
                      is created for each of them.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`.  That pool must be
        one of the pools given to the constructor; otherwise the lookup
        raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        This is the list-returning counterpart of `_count_pool()`.

        @param get_pool_list  Function to return a list of
                              `HostJobHistory` objects from a
                              `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return the combined list across all
                              pools.
        """
        if pool is None:
            hosts = []
            for host_set in self._histories_by_pool.values():
                hosts.extend(get_pool_list(host_set))
            return hosts
        return get_pool_list(self._histories_by_pool[pool])

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool to be treated as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700390
391
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of its labels may appear in
    `_EXCLUDED_LABELS`.

    @param afehost  The host to be tested for eligibility.
    @return `True` if the host should be monitored, else `False`.
    """
    # DUTs without an existing, unique 'model' or 'pool' label
    # aren't meant to exist in the managed inventory; their presence
    # generally indicates an error in the database.  Unfortunately
    # such errors have been seen to occur from time to time.
    #
    # The _LabInventory constructor requires hosts to conform to the
    # label restrictions, and may fail if they don't.  Failing an
    # inventory run for a single bad entry is the wrong thing, so we
    # ignore the problem children here, to keep them out of the
    # inventory.
    model_labels = []
    for label in afehost.labels:
        if label.startswith(constants.Labels.MODEL_PREFIX):
            model_labels.append(label)
    pool_labels = []
    for label in afehost.labels:
        if label.startswith(constants.Labels.POOL_PREFIX):
            pool_labels.append(label)
    disqualifying = _EXCLUDED_LABELS.intersection(afehost.labels)
    if len(model_labels) != 1:
        return False
    if len(pool_labels) != 1:
        return False
    return not disqualifying
417
418
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.
        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + pool
                      for pool in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Hosts are gathered model by model, so the result follows
            # the order of `modellist`.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(
                        h for h in afehosts if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)

    def __init__(self, histories, pools):
        """Build the inventory from a list of host histories.

        @param histories  List of `HostJobHistory` objects to be
                          recorded.  Each history is filed under its
                          `host_model`.
        @param pools      The pools to be tracked for every model; each
                          history's `host_pool` must be one of these.
        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}

    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)

    def __len__(self):
        return self._modeldata.__len__()

    def __iter__(self):
        return self._modeldata.__iter__()

    def reportable_items(self, spare_pool=SPARE_POOL):
        """Iterate over all items subject to reporting.

        Yields the contents of `self.iteritems()` filtered to include
        only reportable models.  A model is reportable if it has DUTs in
        both `spare_pool` and at least one other pool.

        @param spare_pool  The spare pool to be tested for reporting.
        """
        for model, histories in self.iteritems():
            spares = histories.get_total(spare_pool)
            total = histories.get_total()
            # `spares == total` means every DUT of the model is in the
            # spare pool, i.e. there are none in any other pool.
            if spares != 0 and spares != total:
                yield model, histories

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)

    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool  The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.iteritems() if h.get_total(pool)}

    def get_boards(self):
        """Return the set of `host_board` values seen in the inventory."""
        return self._boards
Prathmesh Prabhua5a0e3d2017-11-09 08:53:53 -0800513
514
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are left out of the result.

    Implementation note: locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is correct
    regardless of how large the individual numbers are.

    @param inventory_list  A list of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; the sort is stable, so DUTs
        # with equal keys keep their original relative order.
        dut_list.sort(key=lambda entry: entry[0])
        return_list.append([history for _, history in dut_list])
    return return_list
549
550
551def _score_repair_set(buffer_counts, repair_list):
552 """Return a numeric score rating a set of DUTs to be repaired.
553
Richard Barnette5de01eb2017-12-15 09:53:42 -0800554 `buffer_counts` is a dictionary mapping model names to the size of
555 the model's spares buffer.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700556
Richard Barnette5de01eb2017-12-15 09:53:42 -0800557 `repair_list` is a list of `HostJobHistory` objects for the DUTs to
558 be repaired.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700559
560 This function calculates the new set of buffer counts that would
Richard Barnette5de01eb2017-12-15 09:53:42 -0800561 result from the proposed repairs, and scores the new set using two
562 numbers:
563 * Worst case buffer count for any model (higher is better). This
564 is the more significant number for comparison.
565 * Number of models at the worst case (lower is better). This is
566 the less significant number.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700567
Richard Barnette5de01eb2017-12-15 09:53:42 -0800568 Implementation note: The score could fail to reflect the intended
569 criteria if there are more than 1000 models in the inventory.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700570
Richard Barnette5de01eb2017-12-15 09:53:42 -0800571 @param spare_counts A dictionary mapping models to buffer counts.
572 @param repair_list A list of `HostJobHistory` objects for the
573 DUTs to be repaired.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700574 @return A numeric score.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700575 """
576 # Go through `buffer_counts`, and create a list of new counts
Richard Barnette5de01eb2017-12-15 09:53:42 -0800577 # that records the buffer count for each model after repair.
578 # The new list of counts discards the model names, as they don't
J. Richard Barnettef6839282015-06-01 16:00:35 -0700579 # contribute to the final score.
Richard Barnette5de01eb2017-12-15 09:53:42 -0800580 _NMODELS = 1000
581 pools = {h.host_pool for h in repair_list}
582 repair_inventory = _LabInventory(repair_list, pools)
J. Richard Barnettef6839282015-06-01 16:00:35 -0700583 new_counts = []
Richard Barnette5de01eb2017-12-15 09:53:42 -0800584 for m, c in buffer_counts.iteritems():
585 if m in repair_inventory:
586 newcount = repair_inventory[m].get_total()
J. Richard Barnettef6839282015-06-01 16:00:35 -0700587 else:
588 newcount = 0
589 new_counts.append(c + newcount)
590 # Go through the new list of counts. Find the worst available
591 # spares count, and count how many times that worst case occurs.
592 worst_count = new_counts[0]
593 num_worst = 1
594 for c in new_counts[1:]:
595 if c == worst_count:
596 num_worst += 1
597 elif c < worst_count:
598 worst_count = c
599 num_worst = 1
600 # Return the calculated score
Richard Barnette5de01eb2017-12-15 09:53:42 -0800601 return _NMODELS * worst_count - num_worst
J. Richard Barnettef6839282015-06-01 16:00:35 -0700602
603
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    If the inventory reports no broken DUTs at all, the message holds
    only the column header followed by a "(No DUTs to repair)" line.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in inventory.reportable_items():
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # With nothing broken there is nothing to sort or score; skipping
    # the search leaves `recommendation` as None, which is reported
    # below rather than iterated over.
    lab_groups = _sort_by_location(broken_list) if broken_list else []
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in lab_groups:
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide a window of `num_recommend` DUTs across this lab's
        # list, keeping the best-scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory: Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if recommendation is None:
        message.append('(No DUTs to repair)')
    else:
        for h in recommendation:
            servo_name = servo_host.make_servo_hostname(h.host.hostname)
            servo_present = utils.host_is_in_lab_zone(servo_name)
            _, event = h.last_diagnosis()
            line = line_fmt % (
                    h.host.hostname, h.host_model,
                    'Yes' if servo_present else 'No', event.job_url)
            message.append(line)
    return '\n'.join(message)
674
675
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Totals are accumulated over every item returned by
    `inventory.reportable_items()`; the per-model table lists only
    those models that have at least one broken DUT.  When there are no
    DUTs to count, all percentages are reported as 0.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in inventory.reportable_items():
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail   Bad  Idle  Good  Spare Total
        #     e[0]   e[1]    e[2] e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Lowest spares buffer first; ties broken by most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures always sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # An inventory with no reportable DUTs must not divide by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
744
745
# Preamble placed at the top of the "pool inventory" e-mail body.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisfied that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
754
755
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    For each pool in `CRITICAL_POOLS`, the message holds a table by
    model of broken, idle, working, and total DUT counts.  A model is
    listed only if it has at least one broken or idle DUT in that
    pool; rows are ordered with the most broken DUTs first.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is set off by a blank line.
        separator = '\n' if index else ''
        lines.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Models at full strength are not reported.
            if broken or idle:
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        lines.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
    return '\n'.join(lines)
798
799
# Preamble placed at the top of the "idle inventory" section of the
# pool e-mail.
_IDLE_INVENTORY_HEADER = (
    "Notice to Infrastructure deputies: The hosts shown below haven't\n"
    "run any jobs for at least 24 hours. Please check each host; locked\n"
    "hosts should normally be unlocked; stuck jobs should normally be\n"
    "aborted.\n")
806
807
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists, for every pool in `MANAGED_POOLS`, each
    host returned by `get_idle_list()` together with its model and
    pool.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             row_fmt % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if idle_rows:
        lines.extend(row_fmt % row for row in idle_rows)
    else:
        lines.append('(No idle DUTs)')
    return '\n'.join(lines)
836
837
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are logged as
    errors and otherwise ignored; a failed log write does not prevent
    the send attempt.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized single argument: prints the same text whether
        # `print` is the Python 2 statement or the Python 3 function.
        print(report_body)
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            # `with` closes the file even when write() raises.
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            # Best effort: a mail failure must not abort the run.
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)
875
876
def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, we expect a human being to be watching the
    progress in real time.  So, we force the first (expensive) queries
    to happen up front, and provide simple ASCII output on sys.stdout
    to show a progress bar and results.

    One character is written per model: a digit on every tenth model,
    '+' on models numbered 5, 15, 25, ..., and '.' otherwise.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    broken_so_far = 0
    for position, counts in enumerate(inventory.itervalues(), 1):
        ones_digit = position % 10
        if ones_digit == 0:
            marker = '%d' % ((position // 10) % 10)
        elif ones_digit == 5:
            marker = '+'
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        broken_so_far += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % broken_so_far)
909
910
def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    The assembled report is handed to `_send_email()` addressed to
    `arguments.model_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        sections.append(
            _generate_repair_recommendation(
                inventory, arguments.recommend) + '\n\n\n')
    sections.append(_generate_model_inventory_message(inventory))
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                ''.join(sections))
Richard Barnette3dcbb6a2017-10-23 17:57:50 -0700938
939
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    The two sections are joined by blank lines and handed to
    `_send_email()` addressed to `arguments.pool_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    report_parts = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                '\n\n\n'.join(report_parts))
962
963
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns True if the DUT's most recent history indicates a repair
             loop, False otherwise.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before enough repairs were seen.  Say so
    # explicitly instead of falling off the end and returning None.
    return False
1008
1009
def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    @param history  History object for the DUT; its `hostname`,
                    `host_model`, and `host_pool` attributes are read.
    @param state    Value stored in the metric's 'state' field,
                    identifying why the DUT is untestable.
    """
    metric_fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state)
    logging.info('Untestable DUT: %(dut_hostname)s, model: %(model)s, '
                 'pool: %(pool)s', metric_fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=metric_fields)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001020
Richard Barnettecf5d8342017-10-24 18:13:11 -07001021
def _report_repair_loop_metrics(inventory):
    """Find and report DUTs stuck in a repair loop.

    Walk the working DUTs of every entry in `inventory`, and report
    each one that `_dut_in_repair_loop()` identifies as looping.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for DUTs in repair loops.')
    for counts in inventory.itervalues():
        for dut_history in counts.get_working_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies.  The (expensive) loop check runs only for
            # hostnames that pass.
            if (_HOSTNAME_PATTERN.match(dut_history.hostname)
                    and _dut_in_repair_loop(dut_history)):
                _report_untestable_dut(dut_history, 'repair_loop')
1042
1043
def _report_idle_dut_metrics(inventory):
    """Find and report idle, unlocked DUTs.

    Walk the idle DUTs of every entry in `inventory`, and report each
    one whose host is not locked.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for idle, unlocked DUTs.')
    for counts in inventory.itervalues():
        for dut_history in counts.get_idle_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies.
            if (_HOSTNAME_PATTERN.match(dut_history.hostname)
                    and not dut_history.host.locked):
                _report_untestable_dut(dut_history, 'idle_unlocked')
1064
1065
def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is stuck in a repair loop; that is, it regularly passes
        repair, but never passes other operations.
      * The DUT runs no tasks at all, but is not locked.

    This routine walks through the given inventory looking for DUTs in
    either of these states.  Results are reported via a Monarch presence
    metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    # Repair loops are scanned first, then idle-but-unlocked DUTs.
    for scan in (_report_repair_loop_metrics, _report_idle_dut_metrics):
        scan(inventory)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001088
1089
def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Print various log messages indicating the start of the run.  Return
    a string based on `startup_time` that will be used to identify this
    run in log files and e-mail messages.

    @param arguments     Parsed command-line options; `model_notify`,
                         `recommend`, and `pool_notify` decide which
                         messages are logged.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string (local time, `YYYY-MM-DD.HH`) that will
             be used to identify this run in logs and email output.
    """
    run_label = time.strftime('%Y-%m-%d.%H',
                              time.localtime(startup_time))
    logging.debug('Starting lab inventory for %s', run_label)
    planned = []
    if arguments.model_notify:
        if arguments.recommend:
            planned.append('Will include repair recommendations')
        planned.append('Will include model inventory')
    if arguments.pool_notify:
        planned.append('Will include pool inventory')
    for note in planned:
        logging.debug(note)
    return run_label
1112
1113
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The search window ends at `end_time` and reaches back
    `arguments.duration` hours.

    @param arguments  Parsed command-line options; `duration` and
                      `modelnames` are read.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The inventory created by `_LabInventory.create_inventory()`.
    """
    seconds_back = arguments.duration * 60 * 60
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, end_time - seconds_back, end_time,
            arguments.modelnames)
    logging.info('Found %d hosts across %d models',
                 inventory.get_num_duts(),
                 inventory.get_num_models())
    return inventory
1128
1129
def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Create the initial inventory and run through the inventory reports
    as called for by the parsed command-line arguments.  The same
    start-up time is used both to label the run and as the end of the
    inventory's search window.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    now = time.time()
    run_label = _log_startup(arguments, now)
    lab = _create_inventory(arguments, now)
    if arguments.debug:
        # Pay the query cost up front, with progress output.
        _populate_model_counts(lab)
    if arguments.model_notify:
        _perform_model_inventory(arguments, lab, run_label)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab, run_label)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab)
Richard Barnettecf5d8342017-10-24 18:13:11 -07001150
1151
def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Each string is split on commas and every piece is stripped of
    surrounding whitespace.  Empty pieces are kept as empty strings.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.
    """
    return [address.strip()
            for group in address_list
            for address in group.split(',')]
1163
1164
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--model-notify` and
    `--pool-notify` in separate option arguments into a single list.

    For non-debug uses, require that at least one inventory report be
    requested.  For debug, if a report isn't specified, treat it as "run
    all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if (arguments.model_notify or arguments.pool_notify
            or arguments.report_untestable):
        return True
    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -07001201
1202
def _get_default_logdir(script):
    """Compute the default value of the `--logdir` option.

    The result is the `_LOGDIR` entry under the parent of the
    directory that contains `script`.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
1215
1216
def _parse_command(argv):
    """Parse the command line arguments.

    Build an argument parser for this command's syntax, parse the
    command line, and then validate the result with
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if the arguments fail validation.
    """
    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    model_notify_help = ('Generate model inventory message, '
                         'and send it to the given e-mail address(es)')
    pool_notify_help = ('Generate pool inventory message, '
                        'and send it to the given address(es)')
    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: no '
                      'recommendation)')
    debug_help = ('Print e-mail, metrics messages on stdout '
                  'without sending them.')
    modelnames_help = ('names of models to report on '
                       '(default: all models)')

    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument(
            '-d', '--duration', type=int, default=_DEFAULT_DURATION,
            metavar='HOURS', help=duration_help)
    parser.add_argument(
            '--model-notify', action='append', default=[],
            metavar='ADDRESS', help=model_notify_help)
    parser.add_argument(
            '--pool-notify', action='append', default=[],
            metavar='ADDRESS', help=pool_notify_help)
    parser.add_argument(
            '-r', '--recommend', type=int, default=None,
            help=recommend_help)
    parser.add_argument(
            '--report-untestable', action='store_true',
            help='Check for devices unable to run tests.')
    parser.add_argument(
            '--debug', action='store_true', help=debug_help)
    parser.add_argument(
            '--logdir', default=_get_default_logdir(argv[0]),
            help='Directory where logs will be written.')
    parser.add_argument(
            'modelnames', nargs='*', metavar='MODEL',
            help=modelnames_help)

    arguments = parser.parse_args(argv[1:])
    if _verify_arguments(arguments):
        return arguments
    return None
1262
1263
1264def _configure_logging(arguments):
1265 """Configure the `logging` module for our needs.
1266
Richard Barnette3dcbb6a2017-10-23 17:57:50 -07001267 How we log depends on whether the `--debug` option was provided on
1268 the command line.
1269 * Without the option, we configure the logging to capture all
1270 potentially relevant events in a log file. The log file is
1271 configured to rotate once a week on Friday evening, preserving
1272 ~3 months worth of history.
1273 * With the option, we expect stdout to contain other
1274 human-readable output (including the contents of the e-mail
Richard Barnettecf5d8342017-10-24 18:13:11 -07001275 messages), so we restrict the output to INFO level.
1276
1277 For convenience, when `--debug` is on, the logging format has
1278 no adornments, so that a call like `logging.info(msg)` simply writes
1279 `msg` to stdout, plus a trailing newline.
J. Richard Barnette96db3492015-03-27 17:23:52 -07001280
1281 @param arguments Command-line arguments as returned by
1282 `ArgumentParser`
J. Richard Barnette96db3492015-03-27 17:23:52 -07001283 """
J. Richard Barnettef6839282015-06-01 16:00:35 -07001284 root_logger = logging.getLogger()
J. Richard Barnette02e82432015-10-13 16:02:47 -07001285 if arguments.debug:
J. Richard Barnettef6839282015-06-01 16:00:35 -07001286 root_logger.setLevel(logging.INFO)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001287 handler = logging.StreamHandler(sys.stdout)
1288 handler.setFormatter(logging.Formatter())
1289 else:
Richard Barnette5af97402016-04-18 11:00:26 -07001290 if not os.path.exists(arguments.logdir):
1291 os.mkdir(arguments.logdir)
J. Richard Barnettef6839282015-06-01 16:00:35 -07001292 root_logger.setLevel(logging.DEBUG)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001293 logfile = os.path.join(arguments.logdir, _LOGFILE)
1294 handler = logging.handlers.TimedRotatingFileHandler(
1295 logfile, when='W4', backupCount=13)
1296 formatter = logging.Formatter(_LOG_FORMAT,
1297 time_utils.TIME_FMT)
1298 handler.setFormatter(formatter)
J. Richard Barnettef6839282015-06-01 16:00:35 -07001299 # TODO(jrbarnette) This is gross. Importing client.bin.utils
1300 # implicitly imported logging_config, which calls
1301 # logging.basicConfig() *at module level*. That gives us an
1302 # extra logging handler that we don't want. So, clear out all
1303 # the handlers here.
1304 for h in root_logger.handlers:
1305 root_logger.removeHandler(h)
1306 root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -07001307
1308
def main(argv):
    """Standard main routine.

    Parses the command line (exiting with status 1 if the arguments
    are rejected), configures logging, and then runs the inventory
    reports inside a `site_utils.SetupTsMonGlobalState` context.
    Metrics are flushed explicitly once the reports finish, whether
    or not they raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    # In --debug mode, point the metrics debug file at /dev/null
    # instead of reporting metrics.
    metrics_file = None
    if arguments.debug:
        logging.info('--debug mode: Will not report metrics to monarch')
        metrics_file = '/dev/null'

    ts_mon_state = site_utils.SetupTsMonGlobalState(
            'lab_inventory', debug_file=metrics_file, auto_flush=False)
    with ts_mon_state:
        try:
            _perform_inventory_reports(arguments)
        except KeyboardInterrupt:
            # An interrupt ends the run quietly.
            pass
        except Exception:
            # Our cron setup doesn't preserve stderr, so drop extra
            # breadcrumbs in the log before propagating the error.
            logging.exception('Error escaped main')
            raise
        finally:
            metrics.Flush()
J. Richard Barnette96db3492015-03-27 17:23:52 -07001338
1339
def get_inventory(afe):
    """Create a lab inventory covering the 24 hours ending now.

    @param afe  Passed through unchanged as the first argument to
                `_LabInventory.create_inventory()`.
    @return The result of `_LabInventory.create_inventory()`.
    """
    one_day_in_seconds = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(
            afe, now - one_day_in_seconds, now)
1344
1345
def get_managed_boards(afe):
    """Return the boards reported by a freshly created inventory.

    @param afe  Passed through unchanged to `get_inventory()`.
    @return Whatever `get_boards()` returns for the inventory
            produced by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
J. Richard Barnetteaa868932015-10-23 13:28:59 -07001348
1349
# Script entry point:  run main() with the full argument vector,
# including the command name in argv[0].
if __name__ == '__main__':
    main(sys.argv)