blob: 965817b807820563ce5e82e42ed9f93c5c2dff35 [file] [log] [blame]
J. Richard Barnette96db3492015-03-27 17:23:52 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken. Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage: lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17 How far back in time to search job history to determine DUT
18 status.
19
20--board-notify <address>[,<address>]
21 Send the "board status" e-mail to all the specified e-mail
22 addresses.
23
24--pool-notify <address>[,<address>]
25 Send the "pool status" e-mail to all the specified e-mail
26 addresses.
27
--recommend <number>
    When generating the "board status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.
31
J. Richard Barnette96db3492015-03-27 17:23:52 -070032--logdir <directory>
33 Log progress and actions in a file under this directory. Text
34 of any e-mail sent will also be logged in a timestamped file in
35 this directory.
36
J. Richard Barnette02e82432015-10-13 16:02:47 -070037--debug
J. Richard Barnette96db3492015-03-27 17:23:52 -070038 Suppress all logging and sending e-mail. Instead, write the
39 output that would be generated onto stdout.
40
41<board> arguments:
42 With no arguments, gathers the status for all boards in the lab.
43 With one or more named boards on the command line, restricts
44 reporting to just those boards.
45
46"""
47
48
49import argparse
50import logging
51import logging.handlers
52import os
J. Richard Barnettef6839282015-06-01 16:00:35 -070053import re
J. Richard Barnette96db3492015-03-27 17:23:52 -070054import sys
55import time
56
57import common
J. Richard Barnettef6839282015-06-01 16:00:35 -070058from autotest_lib.client.bin import utils
J. Richard Barnette96db3492015-03-27 17:23:52 -070059from autotest_lib.client.common_lib import time_utils
J. Richard Barnettea7c514e2015-09-15 11:13:23 -070060from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
J. Richard Barnettef6839282015-06-01 16:00:35 -070061from autotest_lib.server.hosts import servo_host
J. Richard Barnette96db3492015-03-27 17:23:52 -070062from autotest_lib.site_utils import gmail_lib
63from autotest_lib.site_utils import status_history
64from autotest_lib.site_utils.suite_scheduler import constants
65
66
# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing.  That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The list of all the general purpose pools
#     monitored by this script; the critical pools followed by
#     the spare pool.

_CRITICAL_POOLS = ['bvt', 'cq']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]
90
# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time to search in order to determine
#     DUT status.  The option's value is documented as a number of
#     hours.

_DEFAULT_DURATION = 24
97
# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
112
# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
#
# The four capture groups are, in order:  the lab name, the row
# number, the rack number, and the host number.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
123
J. Richard Barnette96db3492015-03-27 17:23:52 -0700124
125class _PoolCounts(object):
126 """Maintains a set of `HostJobHistory` objects for a pool.
127
128 The collected history objects are nominally all part of a single
J. Richard Barnettef6839282015-06-01 16:00:35 -0700129 scheduling pool of DUTs. The collection maintains a list of
130 working DUTs, a list of broken DUTs, and a list of all DUTs.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700131
J. Richard Barnettef6839282015-06-01 16:00:35 -0700132 Performance note: Certain methods in this class are potentially
133 expensive:
134 * `get_working()`
135 * `get_working_list()`
136 * `get_broken()`
137 * `get_broken_list()`
138 The first time any one of these methods is called, it causes
139 multiple RPC calls with a relatively expensive set of database
140 queries. However, the results of the queries are cached in the
141 individual `HostJobHistory` objects, so only the first call
142 actually pays the full cost.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700143
J. Richard Barnettef6839282015-06-01 16:00:35 -0700144 Additionally, `get_working_list()` and `get_broken_list()` both
145 cache their return values to avoid recalculating lists at every
146 call; this caching is separate from the caching of RPC results
147 described above.
148
149 This class is deliberately constructed to delay the RPC cost
150 until the accessor methods are called (rather than to query in
J. Richard Barnette96db3492015-03-27 17:23:52 -0700151 `record_host()`) so that it's possible to construct a complete
152 `_LabInventory` without making the expensive queries at creation
J. Richard Barnettef6839282015-06-01 16:00:35 -0700153 time. `_populate_board_counts()`, below, assumes this behavior.
J. Richard Barnette96db3492015-03-27 17:23:52 -0700154
155 """
156
157 def __init__(self):
158 self._histories = []
J. Richard Barnettef6839282015-06-01 16:00:35 -0700159 self._working_list = None
160 self._broken_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700161
162
163 def record_host(self, host_history):
164 """Add one `HostJobHistory` object to the collection.
165
166 @param host_history The `HostJobHistory` object to be
167 remembered.
168
169 """
J. Richard Barnettef6839282015-06-01 16:00:35 -0700170 self._working_list = None
171 self._broken_list = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700172 self._histories.append(host_history)
173
174
J. Richard Barnettef6839282015-06-01 16:00:35 -0700175 def get_working_list(self):
176 """Return a list of all working DUTs in the pool.
177
178 Filter `self._histories` for histories where the last
179 diagnosis is `WORKING`.
180
181 Cache the result so that we only cacluate it once.
182
183 @return A list of HostJobHistory objects.
184
185 """
186 if self._working_list is None:
187 self._working_list = [h for h in self._histories
188 if h.last_diagnosis()[0] == status_history.WORKING]
189 return self._working_list
190
191
J. Richard Barnette96db3492015-03-27 17:23:52 -0700192 def get_working(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700193 """Return the number of working DUTs in the pool."""
194 return len(self.get_working_list())
195
196
197 def get_broken_list(self):
198 """Return a list of all broken DUTs in the pool.
199
200 Filter `self._histories` for histories where the last
201 diagnosis is not `WORKING`.
202
203 Cache the result so that we only cacluate it once.
204
205 @return A list of HostJobHistory objects.
206
207 """
208 if self._broken_list is None:
209 self._broken_list = [h for h in self._histories
210 if h.last_diagnosis()[0] != status_history.WORKING]
211 return self._broken_list
J. Richard Barnette96db3492015-03-27 17:23:52 -0700212
213
214 def get_broken(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700215 """Return the number of broken DUTs in the pool."""
216 return len(self.get_broken_list())
J. Richard Barnette96db3492015-03-27 17:23:52 -0700217
218
219 def get_total(self):
J. Richard Barnettef6839282015-06-01 16:00:35 -0700220 """Return the total number of DUTs in the pool."""
J. Richard Barnette96db3492015-03-27 17:23:52 -0700221 return len(self._histories)
222
223
class _BoardCounts(object):
    """Collection of `HostJobHistory` objects belonging to one board.

    The recorded history objects are nominally all of the same
    board.  Internally they are kept in one `_PoolCounts` object per
    pool in `_MANAGED_POOLS`.  Counts of working DUTs, broken DUTs,
    and all DUTs can be obtained either for a single pool, or as a
    total across all pools.

    DUTs in the collection must be assigned to one of the pools
    in `_MANAGED_POOLS`.

    The `get_working()` and `get_broken()` methods (and the
    corresponding list methods) rely on the methods of the same
    name in `_PoolCounts`, so the performance note in that class
    applies here as well.

    """

    def __init__(self):
        self._pools = {}
        for pool in _MANAGED_POOLS:
            self._pools[pool] = _PoolCounts()


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The object is filed under the pool named by its `host_pool`
        attribute.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._pools[host_history.host_pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               _PoolCount object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        @return The count for the selected pool(s).

        """
        if pool is not None:
            return get_pool_count(self._pools[pool])
        return sum(get_pool_count(pool_counts)
                   for pool_counts in self._pools.values())


    def get_working_list(self):
        """Return a list of all working DUTs for the board.

        Gathers the working lists of every one of the board's
        pools into a single new list.

        @return A list of HostJobHistory objects.

        """
        return [history
                for pool_counts in self._pools.values()
                for history in pool_counts.get_working_list()]


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_PoolCounts.get_working, pool)


    def get_broken_list(self):
        """Return a list of all broken DUTs for the board.

        Gathers the broken lists of every one of the board's pools
        into a single new list.

        @return A list of HostJobHistory objects.

        """
        return [history
                for pool_counts in self._pools.values()
                for history in pool_counts.get_broken_list()]


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected
                pool(s).
        """
        return self._count_pool(_PoolCounts.get_broken, pool)


    def get_spares_buffer(self):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @return The total number DUTs in the spares pool, less the
                total number of broken DUTs in all pools.
        """
        total_spares = self.get_total(_SPARE_POOL)
        total_broken = self.get_broken()
        return total_spares - total_broken


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_total, pool)
354
355
356class _LabInventory(dict):
357 """Collection of `HostJobHistory` objects for the Lab's inventory.
358
359 The collection is indexed by board. Indexing returns the
360 _BoardCounts object associated with the board.
361
362 The collection is also iterable. The iterator returns all the
363 boards in the inventory, in unspecified order.
364
365 """
366
367 @classmethod
368 def create_inventory(cls, afe, start_time, end_time, boardlist=[]):
369 """Return a Lab inventory with specified parameters.
370
371 By default, gathers inventory from `HostJobHistory` objects
372 for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
373 is supplied, the inventory will be restricted to only the
374 given boards.
375
376 @param afe AFE object for constructing the
377 `HostJobHistory` objects.
378 @param start_time Start time for the `HostJobHistory`
379 objects.
380 @param end_time End time for the `HostJobHistory`
381 objects.
382 @param boardlist List of boards to include. If empty,
383 include all available boards.
384 @return A `_LabInventory` object for the specified boards.
385
386 """
387 label_list = [constants.Labels.POOL_PREFIX + l
388 for l in _MANAGED_POOLS]
389 afehosts = afe.get_hosts(labels__name__in=label_list)
390 if boardlist:
391 boardhosts = []
392 for board in boardlist:
393 board_label = constants.Labels.BOARD_PREFIX + board
394 host_list = [h for h in afehosts
395 if board_label in h.labels]
396 boardhosts.extend(host_list)
397 afehosts = boardhosts
398 create = lambda host: (
399 status_history.HostJobHistory(afe, host,
400 start_time, end_time))
401 return cls([create(host) for host in afehosts])
402
403
404 def __init__(self, histories):
J. Richard Barnette6948ed32015-05-06 08:57:10 -0700405 # N.B. The query that finds our hosts is restricted to those
406 # with a valid pool: label, but doesn't check for a valid
407 # board: label. In some (insufficiently) rare cases, the
408 # AFE hosts table has been known to (incorrectly) have DUTs
409 # with a pool: but no board: label. We explicitly exclude
410 # those here.
411 histories = [h for h in histories
412 if h.host_board is not None]
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700413 boards = set([h.host_board for h in histories])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700414 initval = { board: _BoardCounts() for board in boards }
415 super(_LabInventory, self).__init__(initval)
416 self._dut_count = len(histories)
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700417 self._managed_boards = None
J. Richard Barnette96db3492015-03-27 17:23:52 -0700418 for h in histories:
J. Richard Barnette3d0590a2015-04-29 12:56:12 -0700419 self[h.host_board].record_host(h)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700420
421
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700422 def get_managed_boards(self):
423 """Return the set of "managed" boards.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700424
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700425 Operationally, saying a board is "managed" means that the
426 board will be included in the "board" and "repair
427 recommendations" reports. That is, if there are failures in
428 the board's inventory then lab techs will be asked to fix
429 them without a separate ticket.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700430
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700431 For purposes of implementation, a board is "managed" if it
432 has DUTs in both the spare and a non-spare (i.e. critical)
433 pool.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700434
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700435 @return A set of all the boards that have both spare and
436 non-spare pools.
J. Richard Barnettef6839282015-06-01 16:00:35 -0700437 """
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700438 if self._managed_boards is None:
439 self._managed_boards = set()
J. Richard Barnettef6839282015-06-01 16:00:35 -0700440 for board, counts in self.items():
J. Richard Barnettef6839282015-06-01 16:00:35 -0700441 spares = counts.get_total(_SPARE_POOL)
442 total = counts.get_total()
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700443 if spares != 0 and spares != total:
444 self._managed_boards.add(board)
445 return self._managed_boards
J. Richard Barnettef6839282015-06-01 16:00:35 -0700446
447
J. Richard Barnette96db3492015-03-27 17:23:52 -0700448 def get_num_duts(self):
449 """Return the total number of DUTs in the inventory."""
450 return self._dut_count
451
452
453 def get_num_boards(self):
454 """Return the total number of boards in the inventory."""
455 return len(self)
456
457
J. Richard Barnettef6839282015-06-01 16:00:35 -0700458def _sort_by_location(inventory_list):
459 """Return a list of DUTs, organized by location.
460
461 Take the given list of `HostJobHistory` objects, separate it
462 into a list per lab, and sort each lab's list by location. The
463 order of sorting within a lab is
464 * By row number within the lab,
465 * then by rack number within the row,
466 * then by host shelf number within the rack.
467
468 Return a list of the sorted lists.
469
470 Implementation note: host locations are sorted by converting
471 each location into a base 100 number. If row, rack or
472 host numbers exceed the range [0..99], then sorting will
473 break down.
474
475 @return A list of sorted lists of DUTs.
476
477 """
478 BASE = 100
479 lab_lists = {}
480 for history in inventory_list:
481 location = _HOSTNAME_PATTERN.match(history.host.hostname)
482 if location:
483 lab = location.group(1)
484 key = 0
485 for idx in location.group(2, 3, 4):
486 key = BASE * key + int(idx)
487 lab_lists.setdefault(lab, []).append((key, history))
488 return_list = []
489 for dut_list in lab_lists.values():
490 dut_list.sort(key=lambda t: t[0])
491 return_list.append([t[1] for t in dut_list])
492 return return_list
493
494
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping boards to buffer
                          counts.
    @param repair_list    A list of DUTs to be repaired.
    @return A numeric score.

    """
    _NBOARDS = 1000
    # Grouping the repair set into an inventory tells us how many
    # DUTs would be repaired for each board.
    repair_inventory = _LabInventory(repair_list)
    # Buffer count for each board after repair.  Board names are
    # dropped, as they don't contribute to the final score.
    new_counts = []
    for board, count in buffer_counts.items():
        if board in repair_inventory:
            count = count + repair_inventory[board].get_total()
        new_counts.append(count)
    # The worst available spares count is the smallest one; also
    # find out how many boards share that worst case.
    ordered = sorted(new_counts)
    worst_count = ordered[0]
    num_worst = ordered.count(worst_count)
    return _NBOARDS * worst_count - num_worst
545
546
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If no managed board has a broken DUT with a lab location, the
    message contains only the header lines.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` DUTs along the lab's
        # location-sorted list, keeping the best scoring window.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    message = ['Repair recommendations:\n',
               '%-30s %-16s %s' % (
                       'Hostname', 'Board', 'Servo instructions')]
    # `recommendation` is still None when there was nothing to
    # choose from; in that case, list no DUTs rather than fail.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        if utils.host_is_in_lab_zone(servo_name):
            servo_message = 'Repair servo first'
        else:
            servo_message = 'No servo present'
        line = '%-30s %-16s %s' % (
                h.host.hostname, h.host_board, servo_message)
        message.append(line)
    return '\n'.join(message)
620
621
J. Richard Barnette96db3492015-03-27 17:23:52 -0700622def _generate_board_inventory_message(inventory):
623 """Generate the "board inventory" e-mail message.
624
625 The board inventory is a list by board summarizing the number
626 of working and broken DUTs, and the total shortfall or surplus
627 of working devices relative to the minimum critical pool
628 requirement.
629
630 The report omits boards with no DUTs in the spare pool or with
631 no DUTs in a critical pool.
632
633 N.B. For sample output text formattted as users can expect to
634 see it in e-mail and log files, refer to the unit tests.
635
636 @param inventory _LabInventory object with the inventory to
637 be reported on.
638 @return String with the inventory message to be sent.
639
640 """
641 logging.debug('Creating board inventory')
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700642 nworking = 0
643 nbroken = 0
644 nbroken_boards = 0
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700645 summaries = []
646 for board in inventory.get_managed_boards():
647 logging.debug('Counting board inventory for %s', board)
648 counts = inventory[board]
649 # Summary elements laid out in the same order as the text
650 # headers:
651 # Board Avail Bad Good Spare Total
652 # e[0] e[1] e[2] e[3] e[4] e[5]
653 element = (board,
654 counts.get_spares_buffer(),
655 counts.get_broken(),
656 counts.get_working(),
657 counts.get_total(_SPARE_POOL),
658 counts.get_total())
659 summaries.append(element)
660 nbroken += element[2]
661 nworking += element[3]
662 if element[2]:
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700663 nbroken_boards += 1
664 ntotal = nworking + nbroken
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700665 summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700666 broken_percent = int(round(100.0 * nbroken / ntotal))
667 working_percent = 100 - broken_percent
668 message = ['Summary of DUTs in inventory:',
669 '%10s %10s %6s' % ('Bad', 'Good', 'Total'),
670 '%5d %3d%% %5d %3d%% %6d' % (
671 nbroken, broken_percent,
672 nworking, working_percent,
673 ntotal),
674 '',
675 'Boards with failures: %d' % nbroken_boards,
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700676 'Boards in inventory: %d' % len(summaries),
J. Richard Barnetted3ba33a2015-10-14 11:20:49 -0700677 '', '',
678 'Full board inventory:\n',
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700679 '%-22s %5s %5s %5s %5s %5s' % (
680 'Board', 'Avail', 'Bad', 'Good',
681 'Spare', 'Total')]
J. Richard Barnette96db3492015-03-27 17:23:52 -0700682 message.extend(
J. Richard Barnettee39c8272015-10-20 17:58:30 -0700683 ['%-22s %5d %5d %5d %5d %5d' % e for e in summaries])
J. Richard Barnette96db3492015-03-27 17:23:52 -0700684 return '\n'.join(message)
685
686
# _POOL_INVENTORY_HEADER:
#     Introductory text placed at the start of the "pool inventory"
#     e-mail message.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''
695
696
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory is a list by pool and board summarizing the
    number of working and broken DUTs in the pool.  Only the
    critical pools are reported, and within each pool, only boards
    with at least one broken DUT are included in the list.  Boards
    are listed starting with the largest number of broken DUTs.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(_CRITICAL_POOLS):
        # Every pool section after the first is set off from the
        # previous one by an empty line.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        message.append(
            '%-20s %5s %5s %5s' % (
                'Board', 'Bad', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken != 0:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((board, broken, working, total))
        if not rows:
            message.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        for row in rows:
            message.append('%-20s %5d %5d %5d' % row)
    return '\n'.join(message)
739
740
741def _send_email(arguments, tag, subject, recipients, body):
742 """Send an inventory e-mail message.
743
744 The message is logged in the selected log directory using `tag`
745 for the file name.
746
747 If the --print option was requested, the message is neither
748 logged nor sent, but merely printed on stdout.
749
750 @param arguments Parsed command-line options.
751 @param tag Tag identifying the inventory for logging
752 purposes.
753 @param subject E-mail Subject: header line.
754 @param recipients E-mail addresses for the To: header line.
755 @param body E-mail message body.
756
757 """
758 logging.debug('Generating email: "%s"', subject)
759 all_recipients = ', '.join(recipients)
760 report_body = '\n'.join([
761 'To: %s' % all_recipients,
762 'Subject: %s' % subject,
763 '', body, ''])
J. Richard Barnette02e82432015-10-13 16:02:47 -0700764 if arguments.debug:
J. Richard Barnette96db3492015-03-27 17:23:52 -0700765 print report_body
766 else:
767 filename = os.path.join(arguments.logdir, tag)
768 try:
769 report_file = open(filename, 'w')
770 report_file.write(report_body)
771 report_file.close()
772 except EnvironmentError as e:
773 logging.error('Failed to write %s: %s', filename, e)
774 try:
775 gmail_lib.send_email(all_recipients, subject, body)
776 except Exception as e:
777 logging.error('Failed to send e-mail to %s: %s',
778 all_recipients, e)
779
780
781def _separate_email_addresses(address_list):
782 """Parse a list of comma-separated lists of e-mail addresses.
783
784 @param address_list A list of strings containing comma
785 separate e-mail addresses.
786 @return A list of the individual e-mail addresses.
787
788 """
789 newlist = []
790 for arg in address_list:
791 newlist.extend([email.strip() for email in arg.split(',')])
792 return newlist
793
794
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Replace the `--board-notify` and `--pool-notify` settings, each
    a list of comma separated e-mail addresses, with a single flat
    list of addresses per option.

    For non-debug uses, require that notification be requested for
    at least one report.  For debug, if notification isn't specified,
    treat it as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    board_addresses = _separate_email_addresses(arguments.board_notify)
    arguments.board_notify = board_addresses
    pool_addresses = _separate_email_addresses(arguments.pool_notify)
    arguments.pool_notify = pool_addresses
    if board_addresses or pool_addresses:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False
J. Richard Barnette96db3492015-03-27 17:23:52 -0700830
831
def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The result is the `_LOGDIR` entry under the parent of the
    directory that holds this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
845
846
def _parse_command(argv):
    """Parse the command line arguments.

    Builds the argument parser for this command, parses `argv`, and
    then validates the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if the arguments fail validation.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    # Each entry is (positional names, keyword settings) for one
    # add_argument() call; entries are registered in order.
    option_specs = [
        (('-d', '--duration'),
         dict(type=int, default=_DEFAULT_DURATION, metavar='HOURS',
              help='number of hours back to search for status'
                   ' (default: %d)' % _DEFAULT_DURATION)),
        (('--board-notify',),
         dict(action='append', default=[], metavar='ADDRESS',
              help='Generate board inventory message, '
                   'and send it to the given e-mail address(es)')),
        (('--pool-notify',),
         dict(action='append', default=[], metavar='ADDRESS',
              help='Generate pool inventory message, '
                   'and send it to the given address(es)')),
        (('-r', '--recommend'),
         dict(type=int, default=None,
              help=('Specify how many DUTs should be '
                    'recommended for repair (default: no '
                    'recommendation)'))),
        (('--debug',),
         dict(action='store_true',
              help='Print e-mail messages on stdout '
                   'without sending them.')),
        (('--logdir',),
         dict(default=_get_logdir(argv[0]),
              help='Directory where logs will be written.')),
        (('boardnames',),
         dict(nargs='*', metavar='BOARD',
              help='names of boards to report on '
                   '(default: all boards)')),
    ]
    for names, settings in option_specs:
        parser.add_argument(*names, **settings)

    arguments = parser.parse_args(argv[1:])
    if _verify_arguments(arguments):
        return arguments
    return None
891
892
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line.  Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option.  With the
    option, we write log messages to stdout; messages below INFO
    level are discarded.

    The log file handler is configured for weekly rotation
    (`when='W4'`, i.e. keyed to Friday), keeping 13 backups
    (~3 months worth of history).

    Any handlers already installed on the root logger are removed,
    so that our handler is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the underlying
    # list, and removing while iterating it directly would skip
    # every other handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700931
932
def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, a human being may be watching the
    progress.  So, we force the first (expensive) queries to happen
    up front, and provide a small ASCII progress bar to give an
    indicator of how many boards have been processed.

    The progress bar prints one character per board:  '+' for every
    board whose count ends in 5, the tens digit for every tenth
    board, and '.' otherwise.  A total of broken DUTs is printed at
    the end.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    total_broken = 0
    for n, counts in enumerate(inventory.values(), 1):
        if n % 10 == 5:
            c = '+'
        elif n % 10 == 0:
            # Explicit floor division, so the digit doesn't depend
            # on the interpreter's `/` semantics.
            c = '%d' % ((n // 10) % 10)
        else:
            c = '.'
        sys.stdout.write(c)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)
J. Richard Barnette96db3492015-03-27 17:23:52 -0700967
968
def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, builds the lab
    inventory for the requested time window, and then generates
    whichever of the board and pool reports were requested.

    Exits with status 1 if `_parse_command()` returns no arguments.
    A `KeyboardInterrupt` during the run is silently ignored; any
    other exception is logged with its traceback instead of being
    propagated.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        now = int(time.time())
        window_start = now - arguments.duration * 60 * 60
        stamp = time.strftime('%Y-%m-%d.%H', time.localtime(now))
        logging.debug('Starting lab inventory for %s', stamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, window_start, now, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            # The recommendation (when requested) goes ahead of the
            # board inventory in the same message.
            preamble = (
                    _generate_repair_recommendation(
                            inventory, arguments.recommend) + '\n\n\n'
                    if arguments.recommend else '')
            board_report = _generate_board_inventory_message(inventory)
            _send_email(arguments,
                        'boards-%s.txt' % stamp,
                        'DUT board inventory %s' % stamp,
                        arguments.board_notify,
                        preamble + board_report)

        if arguments.pool_notify:
            pool_report = _generate_pool_inventory_message(inventory)
            _send_email(arguments,
                        'pools-%s.txt' % stamp,
                        'DUT pool inventory %s' % stamp,
                        arguments.pool_notify,
                        pool_report)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1025
1026
def get_managed_boards(afe):
    """Get the managed boards from an inventory of the last 24 hours.

    Creates a `_LabInventory` covering the 24 hours ending now, and
    returns the result of its `get_managed_boards()` method.

    @param afe  AFE object handed to
                `_LabInventory.create_inventory()`.
    @return Whatever the inventory's `get_managed_boards()` returns.

    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    recent_inventory = _LabInventory.create_inventory(
            afe, one_day_ago, now)
    return recent_inventory.get_managed_boards()
1033
1034
if __name__ == '__main__':
    # Run the inventory only when executed as a script, passing the
    # full argument vector (including the program name) to main().
    main(sys.argv)