blob: ba34fe0df1ac1f06b7cf813d1d10c86a9989e1b5 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage: lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
20--board-notify <address>[,<address>]
21 Send the "board status" e-mail to all the specified e-mail
22 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
J. Richard Barnette1df6a562015-06-09 10:06:17 -070028--recommend <number>
    When generating the "board status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.
31
J. Richard Barnette96db3492015-03-27 17:23:52 -070032--logdir <directory>
33 Log progress and actions in a file under this directory. Text
34 of any e-mail sent will also be logged in a timestamped file in
35 this directory.
36
J. Richard Barnette02e82432015-10-13 16:02:47 -070037--debug
J. Richard Barnette96db3492015-03-27 17:23:52 -070038 Suppress all logging and sending e-mail. Instead, write the
39 output that would be generated onto stdout.
40
41<board> arguments:
42 With no arguments, gathers the status for all boards in the lab.
43 With one or more named boards on the command line, restricts
44 reporting to just those boards.
45
46"""
47
48
49import argparse
50import logging
51import logging.handlers
52import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070053import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070054import sys
55import time
56
57import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070058from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070059from autotest_lib.client.common_lib import time_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070060from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070061from autotest_lib.server.hosts import servo_host
J. Richard Barnette96db3492015-03-27 17:23:52 -070062from autotest_lib.site_utils import gmail_lib
63from autotest_lib.site_utils import status_history
64from autotest_lib.site_utils.suite_scheduler import constants
65
66
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing.  That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The list of all the general purpose pools
#     monitored by this script (the critical pools plus the spare
#     pool).

_CRITICAL_POOLS = ['bvt', 'cq', 'continuous']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding any
#     'adb' host, because we're not ready to monitor Android or
#     Brillo hosts.
_EXCLUDED_LABELS = set(['adb'])

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time (in hours) to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
#
# The four capture groups are, in order: lab name, row number, rack
# number, and host number.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
129
J. Richard Barnette96db3492015-03-27 17:23:52 -0700130
131class _PoolCounts(object):
132 """Maintains a set of `HostJobHistory` objects for a pool.
133
134 The collected history objects are nominally all part of a single
J. Richard Barnettef6839282015-06-01 16:00:35 -0700135 scheduling pool of DUTs. The collection maintains a list of
136 working DUTs, a list of broken DUTs, and a list of all DUTs.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700137
J. Richard Barnettef6839282015-06-01 16:00:35 -0700138 Performance note: Certain methods in this class are potentially
139 expensive:
140 * `get_working()`
141 * `get_working_list()`
142 * `get_broken()`
143 * `get_broken_list()`
144 The first time any one of these methods is called, it causes
145 multiple RPC calls with a relatively expensive set of database
146 queries. However, the results of the queries are cached in the
147 individual `HostJobHistory` objects, so only the first call
148 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700149
J. Richard Barnettef6839282015-06-01 16:00:35 -0700150 Additionally, `get_working_list()` and `get_broken_list()` both
151 cache their return values to avoid recalculating lists at every
152 call; this caching is separate from the caching of RPC results
153 described above.
154
155 This class is deliberately constructed to delay the RPC cost
156 until the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700157 `record_host()`) so that it's possible to construct a complete
158 `_LabInventory` without making the expensive queries at creation
J. Richard Barnettef6839282015-06-01 16:00:35 -0700159 time. `_populate_board_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700160
161 """
162
163 def __init__(self):
164 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700165 self._working_list = None
166 self._broken_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700167
168
169 def record_host(self, host_history):
170 """Add one `HostJobHistory` object to the collection.
171
172 @param host_history The `HostJobHistory` object to be
173 remembered.
174
175 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700176 self._working_list = None
177 self._broken_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700178 self._histories.append(host_history)
179
180
J. Richard Barnettef6839282015-06-01 16:00:35 -0700181 def get_working_list(self):
182 """Return a list of all working DUTs in the pool.
183
184 Filter `self._histories` for histories where the last
185 diagnosis is `WORKING`.
186
187 Cache the result so that we only cacluate it once.
188
189 @return A list of HostJobHistory objects.
190
191 """
192 if self._working_list is None:
193 self._working_list = [h for h in self._histories
194 if h.last_diagnosis()[0] == status_history.WORKING]
195 return self._working_list
196
197
J. Richard Barnette96db3492015-03-27 17:23:52 -0700198 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700199 """Return the number of working DUTs in the pool."""
200 return len(self.get_working_list())
201
202
203 def get_broken_list(self):
204 """Return a list of all broken DUTs in the pool.
205
206 Filter `self._histories` for histories where the last
207 diagnosis is not `WORKING`.
208
209 Cache the result so that we only cacluate it once.
210
211 @return A list of HostJobHistory objects.
212
213 """
214 if self._broken_list is None:
215 self._broken_list = [h for h in self._histories
216 if h.last_diagnosis()[0] != status_history.WORKING]
217 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700218
219
220 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700221 """Return the number of broken DUTs in the pool."""
222 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700223
224
225 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700226 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700227 return len(self._histories)
228
229
230class _BoardCounts(object):
231 """Maintains a set of `HostJobHistory` objects for a board.
232
233 The collected history objects are nominally all of the same
234 board. The collection maintains a count of working DUTs, a
235 count of broken DUTs, and a total count. The counts can be
236 obtained either for a single pool, or as a total across all
237 pools.
238
239 DUTs in the collection must be assigned to one of the pools
240 in `_MANAGED_POOLS`.
241
242 The `get_working()` and `get_broken()` methods rely on the
243 methods of the same name in _PoolCounts, so the performance
244 note in _PoolCounts applies here as well.
245
246 """
247
248 def __init__(self):
249 self._pools = {
250 pool: _PoolCounts() for pool in _MANAGED_POOLS
251 }
252
253 def record_host(self, host_history):
254 """Add one `HostJobHistory` object to the collection.
255
256 @param host_history The `HostJobHistory` object to be
257 remembered.
258
259 """
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700260 pool = host_history.host_pool
J. Richard Barnette96db3492015-03-27 17:23:52 -0700261 self._pools[pool].record_host(host_history)
262
263
264 def _count_pool(self, get_pool_count, pool=None):
265 """Internal helper to count hosts in a given pool.
266
267 The `get_pool_count` parameter is a function to calculate
268 the exact count of interest for the pool.
269
270 @param get_pool_count Function to return a count from a
271 _PoolCount object.
272 @param pool The pool to be counted. If `None`,
273 return the total across all pools.
274
275 """
276 if pool is None:
277 return sum([get_pool_count(counts)
278 for counts in self._pools.values()])
279 else:
280 return get_pool_count(self._pools[pool])
281
282
J. Richard Barnettef6839282015-06-01 16:00:35 -0700283 def get_working_list(self):
284 """Return a list of all working DUTs for the board.
285
286 Go through all HostJobHistory objects in the board's pools,
287 selecting the ones where the last diagnosis is `WORKING`.
288
289 @return A list of HostJobHistory objects.
290
291 """
292 l = []
293 for p in self._pools.values():
294 l.extend(p.get_working_list())
295 return l
296
297
J. Richard Barnette96db3492015-03-27 17:23:52 -0700298 def get_working(self, pool=None):
299 """Return the number of working DUTs in a pool.
300
301 @param pool The pool to be counted. If `None`, return the
302 total across all pools.
303
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700304 @return The total number of working DUTs in the selected
305 pool(s).
J. Richard Barnette96db3492015-03-27 17:23:52 -0700306 """
307 return self._count_pool(_PoolCounts.get_working, pool)
308
309
J. Richard Barnettef6839282015-06-01 16:00:35 -0700310 def get_broken_list(self):
311 """Return a list of all broken DUTs for the board.
312
313 Go through all HostJobHistory objects in the board's pools,
314 selecting the ones where the last diagnosis is not
315 `WORKING`.
316
317 @return A list of HostJobHistory objects.
318
319 """
320 l = []
321 for p in self._pools.values():
322 l.extend(p.get_broken_list())
323 return l
324
325
J. Richard Barnette96db3492015-03-27 17:23:52 -0700326 def get_broken(self, pool=None):
327 """Return the number of broken DUTs in a pool.
328
329 @param pool The pool to be counted. If `None`, return the
330 total across all pools.
331
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700332 @return The total number of broken DUTs in the selected pool(s).
J. Richard Barnette96db3492015-03-27 17:23:52 -0700333 """
334 return self._count_pool(_PoolCounts.get_broken, pool)
335
336
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700337 def get_spares_buffer(self):
338 """Return the the nominal number of working spares.
339
340 Calculates and returns how many working spares there would
341 be in the spares pool if all broken DUTs were in the spares
342 pool. This number may be negative, indicating a shortfall
343 in the critical pools.
344
345 @return The total number DUTs in the spares pool, less the total
346 number of broken DUTs in all pools.
347 """
348 return self.get_total(_SPARE_POOL) - self.get_broken()
349
350
J. Richard Barnette96db3492015-03-27 17:23:52 -0700351 def get_total(self, pool=None):
352 """Return the total number of DUTs in a pool.
353
354 @param pool The pool to be counted. If `None`, return the
355 total across all pools.
356
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700357 @return The total number of DUTs in the selected pool(s).
J. Richard Barnette96db3492015-03-27 17:23:52 -0700358 """
359 return self._count_pool(_PoolCounts.get_total, pool)
360
361
362class _LabInventory(dict):
363 """Collection of `HostJobHistory` objects for the Lab's inventory.
364
365 The collection is indexed by board. Indexing returns the
366 _BoardCounts object associated with the board.
367
368 The collection is also iterable. The iterator returns all the
369 boards in the inventory, in unspecified order.
370
371 """
372
J. Richard Barnetteb8bc570c2016-03-17 17:03:57 -0700373 @staticmethod
374 def _eligible_host(afehost):
375 """Return whether this host is eligible for monitoring.
376
377 Hosts with any label that's in `_EXCLUDED_LABELS` aren't
378 eligible.
379
380 @param afehost The host to be tested for eligibility.
381 """
382 return not len(_EXCLUDED_LABELS.intersection(afehost.labels))
383
384
J. Richard Barnette96db3492015-03-27 17:23:52 -0700385 @classmethod
386 def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
387 """Return a Lab inventory with specified parameters.
388
389 By default, gathers inventory from `HostJobHistory` objects
390 for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
391 is supplied, the inventory will be restricted to only the
392 given boards.
393
394 @param afe AFE object for constructing the
395 `HostJobHistory` objects.
396 @param start_time Start time for the `HostJobHistory`
397 objects.
398 @param end_time End time for the `HostJobHistory`
399 objects.
400 @param boardlist List of boards to include. If empty,
401 include all available boards.
402 @return A `_LabInventory` object for the specified boards.
403
404 """
405 label_list = [constants.Labels.POOL_PREFIX + l
406 for l in _MANAGED_POOLS]
407 afehosts = afe.get_hosts(labels__name__in=label_list)
408 if boardlist:
J. Richard Barnetteb8bc570c2016-03-17 17:03:57 -0700409 # We're deliberately not checking host eligibility in this
410 # code path. This is a debug path, not used in production;
411 # it may be useful to include ineligible hosts here.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700412 boardhosts = []
413 for board in boardlist:
414 board_label = constants.Labels.BOARD_PREFIX + board
415 host_list = [h for h in afehosts
416 if board_label in h.labels]
417 boardhosts.extend(host_list)
418 afehosts = boardhosts
J. Richard Barnetteb8bc570c2016-03-17 17:03:57 -0700419 else:
420 afehosts = [h for h in afehosts if cls._eligible_host(h)]
J. Richard Barnette96db3492015-03-27 17:23:52 -0700421 create = lambda host: (
422 status_history.HostJobHistory(afe, host,
423 start_time, end_time))
424 return cls([create(host) for host in afehosts])
425
426
427 def __init__(self, histories):
J. Richard Barnette6948ed32015-05-06 08:57:10 -0700428 # N.B. The query that finds our hosts is restricted to those
429 # with a valid pool: label, but doesn't check for a valid
430 # board: label. In some (insufficiently) rare cases, the
431 # AFE hosts table has been known to (incorrectly) have DUTs
432 # with a pool: but no board: label. We explicitly exclude
433 # those here.
434 histories = [h for h in histories
435 if h.host_board is not None]
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700436 boards = set([h.host_board for h in histories])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700437 initval = { board: _BoardCounts() for board in boards }
438 super(_LabInventory, self).__init__(initval)
439 self._dut_count = len(histories)
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700440 self._managed_boards = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700441 for h in histories:
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700442 self[h.host_board].record_host(h)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700443
444
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700445 def get_managed_boards(self):
446 """Return the set of "managed" boards.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700447
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700448 Operationally, saying a board is "managed" means that the
449 board will be included in the "board" and "repair
450 recommendations" reports. That is, if there are failures in
451 the board's inventory then lab techs will be asked to fix
452 them without a separate ticket.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700453
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700454 For purposes of implementation, a board is "managed" if it
455 has DUTs in both the spare and a non-spare (i.e. critical)
456 pool.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700457
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700458 @return A set of all the boards that have both spare and
459 non-spare pools.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700460 """
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700461 if self._managed_boards is None:
462 self._managed_boards = set()
J. Richard Barnettef6839282015-06-01 16:00:35 -0700463 for board, counts in self.items():
J. Richard Barnettef6839282015-06-01 16:00:35 -0700464 spares = counts.get_total(_SPARE_POOL)
465 total = counts.get_total()
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700466 if spares != 0 and spares != total:
467 self._managed_boards.add(board)
468 return self._managed_boards
J. Richard Barnettef6839282015-06-01 16:00:35 -0700469
470
J. Richard Barnette96db3492015-03-27 17:23:52 -0700471 def get_num_duts(self):
472 """Return the total number of DUTs in the inventory."""
473 return self._dut_count
474
475
476 def get_num_boards(self):
477 """Return the total number of boards in the inventory."""
478 return len(self)
479
480
J. Richard Barnettef6839282015-06-01 16:00:35 -0700481def _sort_by_location(inventory_list):
482 """Return a list of DUTs, organized by location.
483
484 Take the given list of `HostJobHistory` objects, separate it
485 into a list per lab, and sort each lab's list by location. The
486 order of sorting within a lab is
487 * By row number within the lab,
488 * then by rack number within the row,
489 * then by host shelf number within the rack.
490
491 Return a list of the sorted lists.
492
493 Implementation note: host locations are sorted by converting
494 each location into a base 100 number. If row, rack or
495 host numbers exceed the range [0..99], then sorting will
496 break down.
497
498 @return A list of sorted lists of DUTs.
499
500 """
501 BASE = 100
502 lab_lists = {}
503 for history in inventory_list:
504 location = _HOSTNAME_PATTERN.match(history.host.hostname)
505 if location:
506 lab = location.group(1)
507 key = 0
508 for idx in location.group(2, 3, 4):
509 key = BASE * key + int(idx)
510 lab_lists.setdefault(lab, []).append((key, history))
511 return_list = []
512 for dut_list in lab_lists.values():
513 dut_list.sort(key=lambda t: t[0])
514 return_list.append([t[1] for t in dut_list])
515 return return_list
516
517
518def _score_repair_set(buffer_counts, repair_list):
519 """Return a numeric score rating a set of DUTs to be repaired.
520
521 `buffer_counts` is a dictionary mapping board names to the
522 size of the board's spares buffer.
523
524 `repair_list` is a list of DUTs to be repaired.
525
526 This function calculates the new set of buffer counts that would
527 result from the proposed repairs, and scores the new set using
528 two numbers:
529 * Worst case buffer count for any board (higher is better).
530 This is the more siginficant number for comparison.
531 * Number of boards at the worst case (lower is better). This
532 is the less significant number.
533
534 Implementation note: The score could fail to reflect the
535 intended criteria if there are more than 1000 boards in the
536 inventory.
537
538 @param spare_counts A dictionary mapping boards to buffer counts.
539 @param repair_list A list of boards to be repaired.
540 @return A numeric score.
541
542 """
543 # Go through `buffer_counts`, and create a list of new counts
544 # that records the buffer count for each board after repair.
545 # The new list of counts discards the board names, as they don't
546 # contribute to the final score.
547 _NBOARDS = 1000
548 repair_inventory = _LabInventory(repair_list)
549 new_counts = []
550 for b, c in buffer_counts.items():
551 if b in repair_inventory:
552 newcount = repair_inventory[b].get_total()
553 else:
554 newcount = 0
555 new_counts.append(c + newcount)
556 # Go through the new list of counts. Find the worst available
557 # spares count, and count how many times that worst case occurs.
558 worst_count = new_counts[0]
559 num_worst = 1
560 for c in new_counts[1:]:
561 if c == worst_count:
562 num_worst += 1
563 elif c < worst_count:
564 worst_count = c
565 num_worst = 1
566 # Return the calculated score
567 return _NBOARDS * worst_count - num_worst
568
569
570def _generate_repair_recommendation(inventory, num_recommend):
571 """Return a summary of selected DUTs needing repair.
572
573 Returns a message recommending a list of broken DUTs to be
574 repaired. The list of DUTs is selected based on these
575 criteria:
576 * No more than `num_recommend` DUTs will be listed.
577 * All DUTs must be in the same lab.
578 * DUTs should be selected for some degree of physical
579 proximity.
580 * DUTs for boards with a low spares buffer are more important
581 than DUTs with larger buffers.
582
583 The algorithm used will guarantee that at least one DUT from a
584 board with the smallest spares buffer will be recommended. If
585 the worst spares buffer number is shared by more than one board,
586 the algorithm will tend to prefer repair sets that include more
587 of those boards over sets that cover fewer boards.
588
J. Richard Barnette1df6a562015-06-09 10:06:17 -0700589 @param inventory Inventory for generating recommendations.
590 @param num_recommend Number of DUTs to recommend for repair.
591
J. Richard Barnettef6839282015-06-01 16:00:35 -0700592 """
593 logging.debug('Creating DUT repair recommendations')
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700594 board_buffer_counts = {}
595 broken_list = []
596 for board in inventory.get_managed_boards():
597 logging.debug('Listing failed DUTs for %s', board)
598 counts = inventory[board]
599 if counts.get_broken() != 0:
600 board_buffer_counts[board] = counts.get_spares_buffer()
601 broken_list.extend(counts.get_broken_list())
J. Richard Barnette55127432015-10-13 17:01:56 -0700602 # N.B. The logic inside this loop may seem complicated, but
J. Richard Barnettef6839282015-06-01 16:00:35 -0700603 # simplification is hard:
604 # * Calculating an initial recommendation outside of
605 # the loop likely would make things more complicated,
606 # not less.
607 # * It's necessary to calculate an initial lab slice once per
608 # lab _before_ the while loop, in case the number of broken
609 # DUTs in a lab is less than `num_recommend`.
J. Richard Barnette55127432015-10-13 17:01:56 -0700610 recommendation = None
611 best_score = None
612 for lab_duts in _sort_by_location(broken_list):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700613 start = 0
614 end = num_recommend
615 lab_slice = lab_duts[start : end]
616 lab_score = _score_repair_set(board_buffer_counts,
617 lab_slice)
618 while end < len(lab_duts):
619 start += 1
620 end += 1
621 new_slice = lab_duts[start : end]
622 new_score = _score_repair_set(board_buffer_counts,
623 new_slice)
624 if new_score > lab_score:
625 lab_slice = new_slice
626 lab_score = new_score
627 if recommendation is None or lab_score > best_score:
628 recommendation = lab_slice
629 best_score = lab_score
J. Richard Barnette1df6a562015-06-09 10:06:17 -0700630 message = ['Repair recommendations:\n',
631 '%-30s %-16s %s' % (
J. Richard Barnettef6839282015-06-01 16:00:35 -0700632 'Hostname', 'Board', 'Servo instructions')]
633 for h in recommendation:
634 servo_name = servo_host.make_servo_hostname(h.host.hostname)
635 if utils.host_is_in_lab_zone(servo_name):
636 servo_message = 'Repair servo first'
637 else:
638 servo_message = 'No servo present'
639 line = '%-30s %-16s %s' % (
640 h.host.hostname, h.host_board, servo_message)
641 message.append(line)
642 return '\n'.join(message)
643
644
J. Richard Barnette96db3492015-03-27 17:23:52 -0700645def _generate_board_inventory_message(inventory):
646 """Generate the "board inventory" e-mail message.
647
648 The board inventory is a list by board summarizing the number
649 of working and broken DUTs, and the total shortfall or surplus
650 of working devices relative to the minimum critical pool
651 requirement.
652
653 The report omits boards with no DUTs in the spare pool or with
654 no DUTs in a critical pool.
655
656 N.B. For sample output text formattted as users can expect to
657 see it in e-mail and log files, refer to the unit tests.
658
659 @param inventory _LabInventory object with the inventory to
660 be reported on.
661 @return String with the inventory message to be sent.
662
663 """
664 logging.debug('Creating board inventory')
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700665 nworking = 0
666 nbroken = 0
667 nbroken_boards = 0
J. Richard Barnetteea5a4ba2016-02-18 16:34:50 -0800668 ntotal_boards = 0
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700669 summaries = []
670 for board in inventory.get_managed_boards():
671 logging.debug('Counting board inventory for %s', board)
672 counts = inventory[board]
673 # Summary elements laid out in the same order as the text
674 # headers:
675 # Board Avail Bad Good Spare Total
676 # e[0] e[1] e[2] e[3] e[4] e[5]
677 element = (board,
678 counts.get_spares_buffer(),
679 counts.get_broken(),
680 counts.get_working(),
681 counts.get_total(_SPARE_POOL),
682 counts.get_total())
J. Richard Barnetteea5a4ba2016-02-18 16:34:50 -0800683 if element[2]:
684 summaries.append(element)
685 nbroken_boards += 1
686 ntotal_boards += 1
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700687 nbroken += element[2]
688 nworking += element[3]
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700689 ntotal = nworking + nbroken
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700690 summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700691 broken_percent = int(round(100.0 * nbroken / ntotal))
692 working_percent = 100 - broken_percent
693 message = ['Summary of DUTs in inventory:',
694 '%10s %10s %6s' % ('Bad', 'Good', 'Total'),
695 '%5d %3d%% %5d %3d%% %6d' % (
696 nbroken, broken_percent,
697 nworking, working_percent,
698 ntotal),
699 '',
700 'Boards with failures: %d' % nbroken_boards,
J. Richard Barnetteea5a4ba2016-02-18 16:34:50 -0800701 'Boards in inventory: %d' % ntotal_boards,
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700702 '', '',
703 'Full board inventory:\n',
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700704 '%-22s %5s %5s %5s %5s %5s' % (
705 'Board', 'Avail', 'Bad', 'Good',
706 'Spare', 'Total')]
J. Richard Barnette96db3492015-03-27 17:23:52 -0700707 message.extend(
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700708 ['%-22s %5d %5d %5d %5d %5d' % e for e in summaries])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700709 return '\n'.join(message)
710
711
# Boilerplate text that `_generate_pool_inventory_message()` places
# at the top of the "pool inventory" e-mail message.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
720
721
J. Richard Barnette96db3492015-03-27 17:23:52 -0700722def _generate_pool_inventory_message(inventory):
723 """Generate the "pool inventory" e-mail message.
724
725 The pool inventory is a list by pool and board summarizing the
726 number of working and broken DUTs in the pool. Only boards with
727 at least one broken DUT are included in the list.
728
729 N.B. For sample output text formattted as users can expect to
730 see it in e-mail and log files, refer to the unit tests.
731
732 @param inventory _LabInventory object with the inventory to
733 be reported on.
734 @return String with the inventory message to be sent.
735
736 """
737 logging.debug('Creating pool inventory')
J. Richard Barnette4845fcf2015-04-20 14:26:25 -0700738 message = [_POOL_INVENTORY_HEADER]
J. Richard Barnette96db3492015-03-27 17:23:52 -0700739 newline = ''
740 for pool in _CRITICAL_POOLS:
741 message.append(
742 '%sStatus for pool:%s, by board:' % (newline, pool))
743 message.append(
744 '%-20s %5s %5s %5s' % (
745 'Board', 'Bad', 'Good', 'Total'))
746 data_list = []
747 for board, counts in inventory.items():
748 logging.debug('Counting inventory for %s, %s',
749 board, pool)
750 broken = counts.get_broken(pool)
751 if broken == 0:
752 continue
753 working = counts.get_working(pool)
754 total = counts.get_total(pool)
755 data_list.append((board, broken, working, total))
756 if data_list:
757 data_list = sorted(data_list, key=lambda d: -d[1])
758 message.extend(
759 ['%-20s %5d %5d %5d' % t for t in data_list])
760 else:
761 message.append('(All boards at full strength)')
762 newline = '\n'
763 return '\n'.join(message)
764
765
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The full report (a `To:` line, a `Subject:` line, and the body)
    is written to a file named `tag` in the selected log directory,
    and the message is then sent to `recipients`.

    If the `--debug` option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    Both the file write and the send are best-effort: a failure in
    either is logged as an error and otherwise ignored.  A failure
    to write the file does not prevent the attempt to send.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes; used as the log file name.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
        'To: %s' % all_recipients,
        'Subject: %s' % subject,
        '', body, ''])
    if arguments.debug:
        # A single parenthesized argument prints identically under
        # both the Python 2 statement and the Python 3 function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # Use a context manager so the file is closed even when the
        # write itself fails.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Deliberately broad:  a mail failure must not abort the
        # remaining reports.
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
804
805
def _separate_email_addresses(address_list):
    """Flatten comma-separated address strings into one list.

    Each input string is split on commas, and surrounding whitespace
    is stripped from every piece.  Pieces are returned in the order
    they appear; empty pieces are kept as empty strings.

    @param address_list  A list of strings, each containing one or
                         more comma-separated e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    return [address.strip()
            for group in address_list
            for address in group.split(',')]
818
819
def _verify_arguments(arguments):
    """Validate and normalize the parsed command-line arguments.

    The `--board-notify` and `--pool-notify` values, which may each
    hold several comma-separated addresses, are replaced in place by
    flat lists of individual addresses.

    Outside of debug mode at least one of the two notification lists
    must be non-empty; otherwise an error message is written to
    stderr and validation fails.  In debug mode, asking for no
    notifications means "run all the reports".

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; modified in place.
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
        arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
        arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -0700855
856
def _get_logdir(script):
    """Compute the default value for the `--logdir` option.

    The default is the `_LOGDIR` subdirectory of the directory one
    level above the one containing `script`.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
870
871
def _parse_command(argv):
    """Parse and validate the command line.

    Builds the argument parser for this command, parses `argv`, and
    then passes the result through `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return The object returned by ArgumentParser.parse_args(), or
            None if the arguments failed validation.

    """
    parser = argparse.ArgumentParser(
        prog=argv[0],
        description='Gather and report lab inventory statistics')

    duration_help = ('number of hours back to search for status '
                     '(default: %d)' % _DEFAULT_DURATION)
    parser.add_argument(
        '-d', '--duration',
        type=int,
        default=_DEFAULT_DURATION,
        metavar='HOURS',
        help=duration_help)
    parser.add_argument(
        '--board-notify',
        action='append',
        default=[],
        metavar='ADDRESS',
        help=('Generate board inventory message, and send it '
              'to the given e-mail address(es)'))
    parser.add_argument(
        '--pool-notify',
        action='append',
        default=[],
        metavar='ADDRESS',
        help=('Generate pool inventory message, and send it '
              'to the given address(es)'))
    parser.add_argument(
        '-r', '--recommend',
        type=int,
        default=None,
        help=('Specify how many DUTs should be recommended for '
              'repair (default: no recommendation)'))
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Print e-mail messages on stdout without sending them.')
    parser.add_argument(
        '--logdir',
        default=_get_logdir(argv[0]),
        help='Directory where logs will be written.')
    parser.add_argument(
        'boardnames',
        nargs='*',
        metavar='BOARD',
        help='names of boards to report on (default: all boards)')

    parsed = parser.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None
916
917
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line.  Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option.  With the
    option, we write log messages to stdout; messages below INFO
    level are discarded.

    The log file is configured to rotate once a week (`when='W4'`,
    i.e. Friday in the `logging.handlers` weekday numbering),
    keeping 13 backups, or ~3 months worth of history.

    Any handlers already installed on the root logger are removed,
    so that on return our handler is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
            logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the very list
    # we'd otherwise be walking, which would skip every other
    # handler and leave some of them installed.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700956
957
def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, a human being may be watching the
    progress.  So, we force the first (expensive) queries to happen
    up front, and write a small ASCII progress bar on stdout:  one
    character per board, with a '+' at every count ending in 5, the
    tens digit at every multiple of 10, and '.' otherwise.  A final
    line reports the total number of broken DUTs found.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    broken_total = 0
    for position, counts in enumerate(inventory.values(), 1):
        tens, ones = divmod(position, 10)
        if ones == 5:
            marker = '+'
        elif ones == 0:
            marker = '%d' % (tens % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        broken_total += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % broken_total)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700992
993
def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, builds the lab
    inventory, and then generates and sends whichever of the board
    and pool reports were requested.  Exits with status 1 if the
    command line is rejected.

    A KeyboardInterrupt while gathering or reporting ends the run
    quietly; any other exception in that phase is logged with its
    traceback rather than propagated.

    @param argv Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        # The reporting window is the `--duration` hours ending now.
        end_time = int(time.time())
        start_time = end_time - arguments.duration * 60 * 60
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            # The optional repair recommendation goes ahead of the
            # board inventory, separated by blank lines.
            recommendation = (
                _generate_repair_recommendation(
                    inventory, arguments.recommend) + '\n\n\n'
                if arguments.recommend else '')
            board_report = (
                recommendation +
                _generate_board_inventory_message(inventory))
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        board_report)

        if arguments.pool_notify:
            pool_report = _generate_pool_inventory_message(inventory)
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        pool_report)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1050
1051
def get_managed_boards(afe):
    """Return the managed boards found in a fresh lab inventory.

    Builds an inventory covering the 24 hours ending now, with no
    restriction on board names, and returns the result of its
    `get_managed_boards()` method.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return Whatever `_LabInventory.get_managed_boards()` returns.

    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(
        afe, one_day_ago, now).get_managed_boards()
1058
1059
# Run the inventory only when executed as a script; importing this
# module (e.g. to call get_managed_boards()) has no side effects.
if __name__ == '__main__':
    main(sys.argv)