blob: 656c5795f9e5dd014cc400d27cfadf7a98e32c6f [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
boards and swaps them with working DUTs taken from a selected pool
of spares. The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage: balance_pool.py [ options ] POOL BOARD [ BOARD ... ]

positional arguments:
  POOL                  Name of the pool to balance
  BOARD                 Names of boards to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every BOARD
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        BOARD
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every BOARD
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -n, --dry-run         Report actions to take in the form of shell commands


The command attempts to remove all broken DUTs from the target POOL
for every BOARD, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every BOARD in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

If the selected target POOL is for a Freon board, *and* the selected
spare pool has no DUTs (in any state), *and* the corresponding
non-Freon spare pool is populated, then the non-Freon pool will
be used for the Freon board. A similar rule applies to balancing
non-Freon boards when there is an available Freon spare pool.

"""
59
60
61import argparse
62import sys
63import time
64
65import common
66from autotest_lib.server import frontend
67from autotest_lib.site_utils import status_history
68from autotest_lib.site_utils.suite_scheduler import constants
69
David James2a3cb542015-05-05 17:13:43 -070070from chromite.lib import parallel
71
J. Richard Barnette91d56812015-04-21 10:22:31 -070072
# Label prefixes used to build the AFE labels that identify a DUT's
# pool and board.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
_BOARD_PREFIX = constants.Labels.BOARD_PREFIX

# Suffix (after a '-' or '_' joiner) that marks a board name as a
# freon build.
_FREON_BOARD_TAG = 'freon'
J. Richard Barnette91d56812015-04-21 10:22:31 -070077
78
def _log_message(message, *args):
    """Log a message with optional format arguments to stdout.

    This function logs a single line to stdout, with formatting
    if necessary, and without adornments.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    # Only apply %-formatting when arguments were given, so that a
    # plain message containing a literal '%' is written unchanged.
    if args:
        message = message % args
    sys.stdout.write('%s\n' % message)
96
97
def _log_info(dry_run, message, *args):
    """Log information in a dry-run dependent fashion.

    This function logs a single line to stdout, with formatting
    if necessary.  When logging for a dry run, the message is
    printed as a shell comment, rather than as unadorned text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the logging is for a dry run or for
                    actual execution.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if dry_run:
        # Dry-run output is a list of shell commands; prefix the
        # informational text so that it reads as a shell comment.
        message = '# ' + message
    _log_message(message, *args)
116
117
def _log_error(message, *args):
    """Log an error to stderr, with optional format arguments.

    This function logs a single line to stderr, prefixed to indicate
    that it is an error message.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    # Only apply %-formatting when arguments were given, so that a
    # plain message containing a literal '%' is written unchanged.
    if args:
        message = message % args
    sys.stderr.write('ERROR: %s\n' % message)
135
136
class _DUTPool(object):
    """Information about a pool of DUTs for a given board.

    This class collects information about all DUTs for a given
    board and pool pair, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from
          this pool.  The DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property board               Name of the board associated with
                                  this pool of DUTs.
    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property _working_hosts      The list of this pool's working
                                  DUTs.
    @property _broken_hosts       The list of this pool's broken
                                  DUTs.
    @property _ineligible_hosts   The list of this pool's ineligible
                                  DUTs.
    @property _labels             A set of labels that identify a DUT
                                  as part of this pool.
    @property _total_hosts        The total number of hosts in pool.

    """

    # Non-freon boards whose platform label differs from the board
    # name.  Boards not listed here (and not matched by the 'x86-'
    # rule) use the board name as the platform label.
    _PLATFORM_MAP = {
        'daisy': 'snow',
        'daisy_spring': 'spring',
        'daisy_skate': 'skate',
        'parrot_ivb': 'parrot_2',
        'falco_li': 'falco',
    }


    @staticmethod
    def _get_platform_label(board):
        """Return the platform label associated with `board`.

        When swapping between freon and non-freon boards, the
        platform label must also change (because wmatrix reports
        build results against platform labels, not boards).  So, we
        must be able to get the platform label from the board name.

        For non-freon boards, the platform label is based on a name
        assigned by the firmware, which in some cases is different
        from the board name.  For freon boards, the platform label
        is always the board name.

        @param board The board name to convert to a platform label.
        @return The platform label for the given board name.

        """
        if board.endswith(_FREON_BOARD_TAG):
            return board
        if board.startswith('x86-'):
            return board[len('x86-'):]
        return _DUTPool._PLATFORM_MAP.get(board, board)


    @staticmethod
    def _freon_board_toggle(board):
        """Toggle a board name between freon and non-freon.

        For boards naming a freon build, return the name of the
        associated non-freon board.  For boards naming non-freon
        builds, return the name of the associated freon board.

        @param board The board name to be toggled.
        @return A new board name, toggled for freon.

        """
        if board.endswith(_FREON_BOARD_TAG):
            # The actual board name ends with either "-freon" or
            # "_freon", so we have to strip off one extra character.
            return board[:-len(_FREON_BOARD_TAG) - 1]
        # The actual board name will end with either "-freon" or
        # "_freon"; we have to figure out which one to use.  Names
        # that already contain '_' are joined with '-'; all other
        # names are joined with '_'.
        joiner = '-' if '_' in board else '_'
        return joiner.join([board, _FREON_BOARD_TAG])


    def __init__(self, afe, board, pool, start_time, end_time,
                 use_freon=False):
        """Gather and classify the DUTs for one board and pool.

        @param afe         AFE object used to look up host histories.
        @param board       Name of the board for this pool.
        @param pool        Name of the pool.
        @param start_time  Start time for the HostJobHistory objects
                           queried for this pool.
        @param end_time    End time for the HostJobHistory objects
                           queried for this pool.
        @param use_freon   If true, and no DUTs are found for `board`,
                           fall back to the freon-toggled board name
                           when that board has DUTs in `pool`.

        """
        self.board = board
        self.pool = pool
        self._working_hosts = []
        self._broken_hosts = []
        self._ineligible_hosts = []
        self._total_hosts = self._get_hosts(
                afe, start_time, end_time, use_freon)
        # The labels must be computed after _get_hosts(), because
        # that call may replace self.board with the alternate
        # (freon-toggled) board name.
        self._labels = {_BOARD_PREFIX + self.board,
                        self._get_platform_label(self.board),
                        _POOL_PREFIX + self.pool}


    def _get_hosts(self, afe, start_time, end_time, use_freon):
        """Query host histories and sort the hosts into categories.

        Fills in `_working_hosts`, `_broken_hosts` and
        `_ineligible_hosts`.  When `use_freon` is true and the query
        for `self.board` finds nothing, the freon-toggled board is
        queried instead; if that query finds hosts, `self.board` is
        changed to the alternate board name.

        @param afe         AFE object used to look up host histories.
        @param start_time  Start time for the history queries.
        @param end_time    End time for the history queries.
        @param use_freon   Whether to fall back to the freon-toggled
                           board when no hosts are found.

        @return The total number of hosts found.

        """
        get_histories = (
                status_history.HostJobHistory.get_multiple_histories)
        all_histories = get_histories(
                afe, start_time, end_time,
                board=self.board, pool=self.pool)
        if not all_histories and use_freon:
            alternate_board = self._freon_board_toggle(self.board)
            alternate_histories = get_histories(
                    afe, start_time, end_time,
                    board=alternate_board, pool=self.pool)
            if alternate_histories:
                self.board = alternate_board
                all_histories = alternate_histories
        for history in all_histories:
            host = history.host
            host_pools = [label for label in host.labels
                          if label.startswith(_POOL_PREFIX)]
            if len(host_pools) != 1:
                # Hosts without exactly one pool label are never
                # exchanged.
                self._ineligible_hosts.append(host)
                continue
            diag = history.last_diagnosis()[0]
            if diag == status_history.WORKING and not host.locked:
                self._working_hosts.append(host)
            else:
                self._broken_hosts.append(host)
        return len(all_histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        A fresh copy is returned on every access, so callers may
        modify the result without affecting this pool.

        @return A set of AFE labels; convert with `list()` for use
                with `add_labels()` or `remove_labels()`.

        """
        return set(self._labels)


    def calculate_inventory(self, dry_run):
        """Calculate and log how many DUTs are in this pool.

        Return the total number of DUTs in the pool across all three
        categories (working, broken, and ineligible).  As a side
        effect, log the totals.

        @param dry_run  Whether the logging is for a dry run or for
                        actual execution.

        @return The total number of DUTs in this pool.

        """
        _log_info(dry_run, 'Balancing %s %s pool:',
                  self.board, self.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  self._total_hosts, len(self._working_hosts),
                  len(self._broken_hosts), len(self._ineligible_hosts))
        return self._total_hosts


    def calculate_spares_needed(self, dry_run, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.  Log the
        adjustments entailed.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param dry_run       Whether the logging is for a dry run or
                             for actual execution.
        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self._ineligible_hosts)
        if target_total < num_ineligible:
            _log_error('%s %s pool: Target of %d is below '
                       'minimum of %d DUTs.',
                       self.board, self.pool,
                       target_total, num_ineligible)
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        adjustment = target_total - self._total_hosts
        if adjustment > 0:
            add_msg = 'grow pool by %d DUTs' % adjustment
        elif adjustment < 0:
            add_msg = 'shrink pool by %d DUTs' % -adjustment
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)
        # Every broken DUT needs a replacement, plus or minus the
        # requested change in pool size.
        return len(self._broken_hosts) + adjustment


    def allocate_working_spares(self, dry_run, num_requested):
        """Allocate and log a list of DUTs that can be used as spares.

        Return a list of up to `num_requested` hosts from this
        pool's list of working hosts.  Log details about this pool's
        working spares.

        If the requested number of DUTs exceeds the supply, log an
        error, and return as many working devices as possible.

        @param dry_run        Whether the logging is for a dry run or
                              for actual execution.
        @param num_requested  Total number of DUTs to allocate from
                              this pool's working DUTs.

        @return A list of spare DUTs.

        """
        num_available = len(self._working_hosts)
        _log_info(dry_run,
                  '%s %s pool has %d spares available.',
                  self.board, self.pool, num_available)
        if num_requested > num_available:
            _log_error('Not enough spares: need %d, only have %d.',
                       num_requested, num_available)
        # Clamp at zero: a negative slice bound would count from the
        # end of the list instead of returning nothing.
        return self._working_hosts[:max(num_requested, 0)]


    def allocate_surplus(self, dry_run, num_broken):
        """Allocate and log a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.  Log information about the DUTs to be
        returned.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param dry_run     Whether the logging is for a dry run or
                           for actual execution.
        @param num_broken  Total number of broken DUTs to be left in
                           this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            surplus = self._broken_hosts[num_broken:]
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      self.board, self.pool,
                      len(surplus),
                      len(self._broken_hosts) - len(surplus))
            return surplus
        # Build the list first and log its actual sizes, so that the
        # message stays accurate even when fewer working DUTs are
        # available than were requested.
        working_surplus = self._working_hosts[:-num_broken]
        surplus = self._broken_hosts + working_surplus
        _log_info(dry_run,
                  '%s %s pool will return %d surplus DUTs, '
                  'including %d working DUTs.',
                  self.board, self.pool,
                  len(surplus), len(working_surplus))
        return surplus
425
426
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.  Labels
    common to both pools are left untouched.  Log the action.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object to which the hosts
                        will be added.
    @param spare_pool   The `_DUTPool` object from which the hosts
                        are drawn.

    """
    if not hosts:
        return
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    # Work on private copies and use non-mutating set difference, so
    # that the label sets owned by the pool objects are never
    # modified here.
    target_labels = set(target_pool.pool_labels)
    spare_labels = set(spare_pool.pool_labels)
    # Sorting makes the label order (and so the dry-run output)
    # deterministic.
    additions = sorted(target_labels - spare_labels)
    removals = sorted(spare_labels - target_labels)
    for host in hosts:
        if dry_run:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))
        else:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(list(removals))
            host.add_labels(list(additions))
465
466
def _balance_board(arguments, afe, board, start_time, end_time):
    """Balance one board as requested by command line arguments.

    Broken DUTs in the main pool are exchanged for working DUTs from
    the spare pool, adjusting the main pool's size if `--total`,
    `--grow` or `--shrink` was given.  Whether changes are made or
    merely reported is controlled by `arguments.dry_run`.

    @param arguments   Parsed command line arguments.
    @param afe         AFE object to be used for the changes.
    @param board       Board to be balanced.
    @param start_time  Start time for HostJobHistory objects in
                       the DUT pools.
    @param end_time    End time for HostJobHistory objects in the
                       DUT pools.

    """
    dry_run = arguments.dry_run
    # Only the spare pool may fall back to the freon-toggled board.
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time, use_freon=True)
    main_pool = _DUTPool(afe, board, arguments.pool,
                         start_time, end_time)

    # With no COUNT option, the target is the pool's current size.
    target_total = main_pool.calculate_inventory(dry_run)
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(
            dry_run, target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.allocate_working_spares(
                dry_run, spares_needed)
        # Broken DUTs that cannot be replaced stay in the main pool.
        shortfall = spares_needed - len(spare_duts)
    else:
        # Zero or negative: no spares are needed, and a negative
        # value asks for working DUTs to be returned as well.
        spare_duts = []
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(dry_run, shortfall)
    if not spare_duts and not surplus_duts:
        _log_info(dry_run, 'No exchange required.')
        return

    # Surplus DUTs move from the main pool to the spare pool; spare
    # DUTs move from the spare pool to the main pool.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
514
515
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    The `--total`, `--grow` and `--shrink` options are mutually
    exclusive.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every BOARD')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every BOARD')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every BOARD')

    parser.add_argument('-s', '--spare', default='suites',
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:suites)')
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance')
    parser.add_argument('boards', nargs='+',
                        metavar='BOARD',
                        help='Names of boards to balance')

    return parser.parse_args(argv[1:])
565
566
def main(argv):
    """Standard main routine.

    Parse the command line, then balance every requested board.
    Host histories are examined over the 24 hours ending when the
    command starts.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    def balancer(i, board):
        """Balance the specified board.

        @param i      The index of the board.
        @param board  The board name.
        """
        # Emit a blank line ahead of every board except the first.
        if i > 0:
            _log_message('')
        _balance_board(arguments, afe, board, start_time, end_time)

    arguments = _parse_command(argv)
    end_time = time.time()
    start_time = end_time - 24 * 60 * 60
    afe = frontend.AFE(server=None)
    # Each task receives an (index, board) pair as its arguments.
    board_args = list(enumerate(arguments.boards))
    try:
        parallel.RunTasksInProcessPool(balancer, board_args, processes=8)
    except KeyboardInterrupt:
        # NOTE(review): presumably deliberate, so that an interrupt
        # from the user ends the command without a traceback.
        pass
592
593
# Run the command only when executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv)