blob: 5bedde747125884825259a239edb6417392eb4fc [file] [log] [blame]
J. Richard Barnette91d56812015-04-21 10:22:31 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Adjust pool balances to cover DUT shortfalls.
7
8This command takes all broken DUTs in a specific pool for specific
9boards and swaps them with working DUTs taken from a selected pool
10of spares. The command is meant primarily for replacing broken DUTs
11in critical pools like BVT or CQ, but it can also be used to adjust
12pool sizes, or to create or remove pools.
13
14usage: balance_pool.py [ options ] POOL BOARD [ BOARD ... ]
15
16positional arguments:
17 POOL Name of the pool to balance
18 BOARD Names of boards to balance
19
20optional arguments:
21 -h, --help show this help message and exit
22 -t COUNT, --total COUNT
23 Set the number of DUTs in the pool to the specified
24 count for every BOARD
25 -a COUNT, --grow COUNT
26 Add the specified number of DUTs to the pool for every
27 BOARD
28 -d COUNT, --shrink COUNT
29 Remove the specified number of DUTs from the pool for
30 every BOARD
31 -s POOL, --spare POOL
32 Pool from which to draw replacement spares (default:
33 pool:suites)
34 -n, --dry-run Report actions to take in the form of shell commands
35
36
37The command attempts to remove all broken DUTs from the target POOL
38for every BOARD, and replace them with enough working DUTs taken
39from the spare pool to bring the strength of POOL to the requested
40total COUNT.
41
42If no COUNT options are supplied (i.e. there are no --total, --grow,
43or --shrink options), the command will maintain the current totals of
44DUTs for every BOARD in the target POOL.
45
46If not enough working spares are available, broken DUTs may be left
47in the pool to keep the pool at the target COUNT.
48
49When reducing pool size, working DUTs will be returned after broken
50DUTs, if it's necessary to achieve the target COUNT.
51
52"""
53
54
55import argparse
56import sys
57import time
58
59import common
60from autotest_lib.server import frontend
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -070061from autotest_lib.server import site_utils
Aviv Keshet7ee95862016-08-30 15:18:27 -070062from autotest_lib.server.lib import status_history
Kevin Chengcf0ad2b2016-04-19 14:51:39 -070063from autotest_lib.site_utils import lab_inventory
J. Richard Barnette91d56812015-04-21 10:22:31 -070064from autotest_lib.site_utils.suite_scheduler import constants
Prathmesh Prabhu68acc402017-11-09 15:24:15 -080065from autotest_lib.utils import labellib
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -070066from chromite.lib import metrics
David James2a3cb542015-05-05 17:13:43 -070067from chromite.lib import parallel
68
Jacob Kopczynskif8d90a82017-10-10 14:37:33 -070069#This must be imported after chromite.lib.metrics
70from infra_libs import ts_mon
J. Richard Barnette91d56812015-04-21 10:22:31 -070071
# Prefix shared by every pool label; used below both to recognize pool
# labels on a host and to build the label for a named pool.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all boards we should calculate the default max number of
# broken boards against. It seemed like the best choice that was neither too
# strict nor lax.
_MAX_BROKEN_BOARDS_DEFAULT_RATIO = 3.0 / 8.0

# Pseudo pool name accepted as the POOL argument; it selects every pool in
# lab_inventory.CRITICAL_POOLS instead of one named pool.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Default value of the --spare option.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
J. Richard Barnette91d56812015-04-21 10:22:31 -070080
81
def _log_message(message, *args):
    """Write one unadorned line to stdout.

    When format arguments are given, `message` is treated as a
    %-style format string and interpolated with them; otherwise it
    is written exactly as passed.

    @param message  Text to write, or a format string when `args`
                    is non-empty.
    @param args     Values to interpolate into `message`.  When
                    empty, no formatting is applied.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
99
100
def _log_info(dry_run, message, *args):
    """Write an informational line to stdout, dry-run aware.

    For a dry run the output is meant to be usable as a shell
    script, so the line is emitted as a shell comment (prefixed
    with '# ').  Otherwise the line is emitted unadorned.

    @param dry_run  Whether the logging is for a dry run.
    @param message  Text to write, or a format string when `args`
                    is non-empty.
    @param args     Values to interpolate into `message`.  When
                    empty, no formatting is applied.

    """
    _log_message('# ' + message if dry_run else message, *args)
119
120
def _log_error(message, *args):
    """Write one error line to stderr.

    The line is prefixed with 'ERROR: ' so that it stands out from
    ordinary output.  When format arguments are given, `message` is
    treated as a %-style format string and interpolated with them.

    @param message  Text to write, or a format string when `args`
                    is non-empty.
    @param args     Values to interpolate into `message`.  When
                    empty, no formatting is applied.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
138
139
class _DUTPool(object):
    """The DUTs of one board that belong to one pool.

    On construction, the DUTs carrying both the board and the pool
    label (plus any `extra_labels`) are looked up and sorted into
    three categories:
      + Working - the DUT is diagnosed as working, and not locked.
      + Broken - any other DUT that is eligible for exchange.
      + Ineligible - the DUT cannot be moved out of this pool; it
        may be either working or broken.

    A DUT is ineligible when it does not carry exactly one pool:
    label.  This is done for the sake of chameleon hosts, which must
    always be assigned to pool:suites.  These DUTs are always marked
    with pool:chameleon to prevent their reassignment.

    |extra_labels| may be used to restrict the pool to a subset of a
    given board+pool, by specifying additional labels that all DUTs
    are required to possess.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property board             Name of the board associated with
                                this pool of DUTs.
    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property working_hosts     The list of this pool's working
                                DUTs.
    @property broken_hosts      The list of this pool's broken
                                DUTs.
    @property ineligible_hosts  The list of this pool's ineligible
                                DUTs.
    @property pool_labels       A list of labels that identify a DUT
                                as part of this pool.
    @property total_hosts       The total number of hosts in pool.

    """

    def __init__(self, afe, board, pool, start_time, end_time,
                 extra_labels=None):
        self.board = board
        self.pool = pool
        self._extra_labels = extra_labels or []
        self._labels = [_POOL_PREFIX + self.pool]
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Look up this pool's DUTs and sort them into categories.

        Fills in `working_hosts`, `broken_hosts` and
        `ineligible_hosts` as a side effect.

        @param afe         AFE object used to query host histories.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The number of host histories found for this pool.

        """
        query = labellib.LabelsMapping(self._extra_labels)
        query['board'] = self.board
        query['pool'] = self.pool
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, query.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            # Only eligible DUTs are diagnosed; a locked DUT counts
            # as broken regardless of its diagnosis.
            diagnosis = history.last_diagnosis()[0]
            is_working = (diagnosis == status_history.WORKING
                          and not dut.locked)
            bucket = self.working_hosts if is_working else self.broken_hosts
            bucket.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove from a DUT to take it out of
        the pool, or to add to a DUT to put it in.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        A positive result is the number of spares needed to replace
        broken DUTs and reach the target.  A negative result means
        no spares are needed, and that many working DUTs can be
        returned.

        Ineligible DUTs can never be returned, so a target below the
        number of ineligible DUTs is logged as an error and raised
        to that number.  Whether that happened is also reported via
        the `exhausted_pools` metric.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        minimum_total = len(self.ineligible_hosts)
        target_is_reachable = target_total >= minimum_total
        if not target_is_reachable:
            _log_error('%s %s pool: Target of %d is below '
                       'minimum of %d DUTs.',
                       self.board, self.pool,
                       target_total, minimum_total)
            _log_error('Adjusting target to %d DUTs.', minimum_total)
            target_total = minimum_total
        exhausted_metric = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                "True for each pool/board which requests more DUTs than supplied",
                field_spec=[ts_mon.StringField('pool'),
                            ts_mon.StringField('board')])
        exhausted_metric.set(
                not target_is_reachable,
                fields={'pool': self.pool, 'board': self.board})
        return len(self.broken_hosts) + target_total - self.total_hosts

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs are preferred over
        working ones.

        `num_broken` is the number of broken DUTs to be left in the
        pool.  If it exceeds the number of broken DUTs actually in
        the pool, the result is empty.  If it is negative, its
        magnitude is the number of working DUTs to return in
        addition to all the broken ones.

        @param num_broken  Total number of broken DUTs to be left in
                           this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            return self.broken_hosts + self.working_hosts[:-num_broken]
        return self.broken_hosts[num_broken:]
J. Richard Barnette91d56812015-04-21 10:22:31 -0700295
296
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Move a list of DUTs out of one pool and into another.

    Every given host has the labels of `spare_pool` removed and the
    labels of `target_pool` added.  The number of hosts moved is
    also recorded in the `duts_moved` metric.

    For a dry run nothing is changed; instead the `atest` commands
    that would make the label changes are logged.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
                        If empty, nothing is done.
    @param target_pool  The `_DUTPool` object whose labels are added
                        to the hosts, i.e. the destination.
    @param spare_pool   The `_DUTPool` object whose labels are
                        removed from the hosts, i.e. the origin.

    """
    if not hosts:
        return
    num_moved = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              num_moved, spare_pool.pool, target_pool.pool)
    moved_metric = metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            "DUTs transferred between pools",
            field_spec=[ts_mon.StringField('board'),
                        ts_mon.StringField('source_pool'),
                        ts_mon.StringField('target_pool')])
    moved_metric.increment_by(
            num_moved,
            fields={'board': target_pool.board,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool})
    labels_to_add = target_pool.pool_labels
    labels_to_remove = spare_pool.pool_labels

    if dry_run:
        removed_text = ' '.join(labels_to_remove)
        added_text = ' '.join(labels_to_add)
        for dut in hosts:
            _log_message('atest label remove -m %s %s',
                         dut.hostname, removed_text)
            _log_message('atest label add -m %s %s',
                         dut.hostname, added_text)
        return

    for dut in hosts:
        _log_message('Updating host: %s.', dut.hostname)
        dut.remove_labels(labels_to_remove)
        dut.add_labels(labels_to_add)
341
342
def _balance_board(arguments, afe, board, pool, start_time, end_time,
                   extra_labels=None):
    """Balance one board as requested by command line arguments.

    Broken DUTs in the board's main pool are swapped for working
    DUTs from the spare pool, growing or shrinking the main pool if
    the arguments ask for it.  Nothing is exchanged when the main
    pool has more broken DUTs than `arguments.max_broken`, unless
    `arguments.force_rebalance` is set.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param board         Board to be balanced.
    @param pool          Pool of the board to be balanced.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.
    @param extra_labels  Optional extra labels that all DUTs must
                         possess.

    """
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time, extra_labels)
    main_pool = _DUTPool(afe, board, pool,
                         start_time, end_time, extra_labels)

    # With no COUNT option the pool keeps its current size.
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    spare_duts = []
    shortfall = spares_needed
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        shortfall -= len(spare_duts)

    surplus_duts = main_pool.allocate_surplus(shortfall)

    dry_run = arguments.dry_run
    pool_name = main_pool.pool
    num_broken = len(main_pool.broken_hosts)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', board, pool_name)
        if extra_labels:
            _log_info(dry_run, 'Restricting to extra labels: %s', extra_labels)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  num_broken, len(main_pool.ineligible_hosts))

        if spares_needed == 0:
            size_change = 'no change to pool size'
        elif spares_needed > 0:
            size_change = 'grow pool by %d DUTs' % spares_needed
        else:
            size_change = 'shrink pool by %d DUTs' % -spares_needed
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, size_change)

        _log_info(dry_run,
                  '%s %s pool has %d spares available.',
                  board, pool_name, len(spare_pool.working_hosts))

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      board, pool_name,
                      len(surplus_duts),
                      num_broken - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      board, pool_name,
                      num_broken - shortfall,
                      -shortfall)

    # Many broken DUTs on one board may mean a bug is bricking
    # devices; don't drain the spares until someone has looked.
    if num_broken > arguments.max_broken and not arguments.force_rebalance:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   board, pool_name, num_broken)
        for advice in ('Please investigate this board to see if there is a bug ',
                       'that is bricking devices. Once you have finished your ',
                       'investigation, you can force a rebalance with ',
                       '--force-rebalance'):
            _log_error(advice)
        return

    if spare_duts or surplus_duts:
        # Surplus goes back to the spare pool first, then the
        # replacements come in.
        _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
        _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
    elif arguments.verbose:
        _log_info(dry_run, 'No exchange required.')
445
446
def _too_many_broken_boards(inventory, pool, arguments):
    """Check whether too many boards in a pool have broken DUTs.

    The check is skipped (and False returned) when rebalancing is
    forced or when the `max_broken_boards` argument is 0.  When that
    argument is not given at all, the limit defaults to a fixed
    fraction of the number of managed boards in the pool.

    The names of the boards with broken DUTs are logged.

    @param inventory  Inventory object used to determine the status
                      of the boards.
    @param pool       The pool to check on for the boards.
    @param arguments  Parsed command line arguments.

    @return True if the number of boards with 1 or more broken DUTs
            exceeds the limit, False otherwise.
    """
    # Let's check if we even need to check for this max_broken_boards.
    if arguments.force_rebalance or arguments.max_broken_boards == 0:
        return False

    # TODO(kevcheng): Revisit to see if there's a better way to
    # calculate the default max_broken_boards.
    threshold = arguments.max_broken_boards
    if threshold is None:
        managed_boards = inventory.get_managed_boards(pool=pool)
        threshold = int(_MAX_BROKEN_BOARDS_DEFAULT_RATIO *
                        len(managed_boards))
        _log_info(arguments.dry_run,
                  'Default max broken boards calculated to be %d for '
                  '%s pool',
                  threshold, pool)

    broken_boards = sorted(
            name for name, counts in inventory.by_board.iteritems()
            if counts.get_broken(pool) != 0)
    broken_count = len(broken_boards)
    # TODO(kevcheng): Track which boards have broken duts, we can limit the
    # number of boards we go through in the main loop with this knowledge.
    _log_message('There are %d boards in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)', broken_count,
                 pool, threshold)
    for name in broken_boards:
        _log_message(name)
    return broken_count > threshold
490
491
def _parse_command(argv):
    """Parse and validate the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Beyond what the parser itself enforces, these combinations are
    rejected with a usage error:
      + Neither board names nor --all-boards given.
      + Both board names and --all-boards given.
      + A non-default --spare pool given while balancing all
        critical pools.

    @param argv  Standard command line argument vector; `argv[0]` is
                 assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
            '-w', '--web', type=str, default=None,
            help='AFE host to use. Default comes from shadow_config.',
    )
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every BOARD')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every BOARD')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every BOARD')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument('--all-boards', action='store_true',
                        help='Rebalance all managed boards. This will do a '
                             'very expensive check to see how many boards have '
                             'at least one broken DUT. To bypass that check, '
                             'set --max-broken-boards to 0.')
    # The metavar makes "COUNT" in the help text refer to something
    # shown in the usage; "at most" matches the actual comparison,
    # which only refuses when the number of boards exceeds COUNT.
    parser.add_argument('--max-broken-boards',
                        default=None, type=int,
                        metavar='COUNT',
                        help='Only rebalance all boards if number of boards '
                             'with broken DUTs in the specified pool '
                             'is at most COUNT.')

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance. Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('boards', nargs='*',
                        metavar='BOARD',
                        help='Names of boards to balance.')

    parser.add_argument('--model', type=str, action='store', metavar='MODEL',
                        help='Optional name of model to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if not arguments.boards and not arguments.all_boards:
        parser.error('No boards specified. To balance all boards, use '
                     '--all-boards')
    if arguments.boards and arguments.all_boards:
        parser.error('Cannot specify boards with --all-boards.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # This fires when the spare pool is NOT the default, so the
        # message has to say so.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    return arguments
588
589
def specify_balance_args(afe, arguments, pools):
    """Translate the parsed arguments into the list of boards to balance.

    With --all-boards, every managed board of each pool is selected,
    except that a pool with too many boards having broken DUTs is
    skipped entirely, with an error logged.  Whether each pool was
    skipped is reported in the `unchanged_pools` metric.

    Otherwise, the boards named on the command line are selected for
    every pool.

    @param afe        AFE object to be used for taking inventory.
                      Only used with --all-boards.
    @param arguments  Parsed command line arguments.
    @param pools      The list of pools to balance.

    @returns a list of (board, pool, extra_labels) tuples to be balanced

    """
    extra_labels = []
    if arguments.model:
        extra_labels = ['model:' + arguments.model]

    if not arguments.all_boards:
        # We have specified boards with a specified pool, setup the args to
        # the balancer properly.
        return [(board, pool, extra_labels)
                for pool in pools
                for board in arguments.boards]

    board_info = []
    inventory = lab_inventory.get_inventory(afe)
    for pool in pools:
        quarantine = _too_many_broken_boards(inventory, pool, arguments)
        if quarantine:
            _log_error('Refusing to balance all boards for %s pool, '
                       'too many boards with at least 1 broken DUT '
                       'detected.', pool)
        else:
            boards_in_pool = inventory.get_managed_boards(pool=pool)
            board_info.extend([(board, pool, extra_labels)
                               for board in boards_in_pool])
        metrics.Boolean(
                'chromeos/autotest/balance_pools/unchanged_pools').set(
                        quarantine, fields={'pool': pool})
    return board_info
630
631
def main(argv):
    """Standard main routine.

    Parses the command line, works out which (board, pool) pairs to
    balance, and balances them in a pool of worker processes.  DUT
    status is judged from the job history of the last 24 hours.
    Metrics are only set up for a production run.

    @param argv  Command line arguments including `sys.argv[0]`.

    """

    def balancer(board, pool, extra_labels):
        """Balance the specified board.

        @param board         The board name.
        @param pool          The pool to rebalance for the board.
        @param extra_labels  extra labels to restrict to
        """
        _balance_board(arguments, afe, board, pool, start_time, end_time,
                       extra_labels)
        _log_message('')

    seconds_per_day = 24 * 60 * 60
    end_time = time.time()
    start_time = end_time - seconds_per_day

    arguments = _parse_command(argv)
    if not arguments.production:
        metrics_manager = site_utils.TrivialContextManager()
    else:
        metrics_manager = site_utils.SetupTsMonGlobalState(
                'balance_pools',
                short_lived=True,
                auto_flush=False,
        )

    with metrics_manager:
        try:
            afe = frontend.AFE(server=arguments.web)
            if arguments.pool == _ALL_CRITICAL_POOLS:
                pools = lab_inventory.CRITICAL_POOLS
            else:
                pools = [arguments.pool]
            board_info = specify_balance_args(afe, arguments, pools)
            try:
                parallel.RunTasksInProcessPool(balancer, board_info,
                                               processes=8)
            except KeyboardInterrupt:
                # An interrupt just ends the run early.
                pass
        finally:
            metrics.Flush()
J. Richard Barnette91d56812015-04-21 10:22:31 -0700677
678
# Script entry point; main() expects the full argument vector,
# including the command name in argv[0].
if __name__ == '__main__':
    main(sys.argv)