blob: 6f6a34b3e0253c1f30e983257f8e4c9d52299389 [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
5
"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
models and swaps them with working DUTs taken from a selected pool
of spares.  The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage:  balance_pool.py [ options ] POOL MODEL [ MODEL ... ]

positional arguments:
  POOL                  Name of the pool to balance
  MODEL                 Names of models to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every MODEL
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        MODEL
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every MODEL
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  --sku SKU             The specific SKU we intend to swap with
  -n, --dry-run         Report actions to take in the form of shell commands


The command attempts to remove all broken DUTs from the target POOL
for every MODEL, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every MODEL in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

"""
54
55
56import argparse
Richard Barnette81ef2242018-06-14 14:34:34 -070057import os
58import re
J. Richard Barnette91d56812015-04-21 10:22:31 -070059import sys
60import time
61
62import common
Xixuan Wu93e646c2017-12-07 18:36:10 -080063from autotest_lib.server import constants
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -070064from autotest_lib.server import site_utils
Aviv Kesheta8834322018-05-07 13:28:32 -070065from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
Aviv Keshet7ee95862016-08-30 15:18:27 -070066from autotest_lib.server.lib import status_history
Kevin Chengcf0ad2b2016-04-19 14:51:39 -070067from autotest_lib.site_utils import lab_inventory
Prathmesh Prabhu68acc402017-11-09 15:24:15 -080068from autotest_lib.utils import labellib
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -070069from chromite.lib import metrics
David James2a3cb542015-05-05 17:13:43 -070070from chromite.lib import parallel
71
Jacob Kopczynskif8d90a82017-10-10 14:37:33 -070072#This must be imported after chromite.lib.metrics
73from infra_libs import ts_mon
J. Richard Barnette91d56812015-04-21 10:22:31 -070074
# Prefix shared by every AFE label that assigns a DUT to a pool.
_POOL_PREFIX = constants.Labels.POOL_PREFIX

# Fraction of the models in a pool used to derive the default limit on
# how many models may have broken DUTs.  It seemed like the best choice
# that was neither too strict nor too lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# Pseudo pool name accepted on the command line to request balancing of
# all critical pools.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Pool that supplies spares when --spare is not given.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
J. Richard Barnette91d56812015-04-21 10:22:31 -070083
84
# _VALID_POOL_PATTERN - Regular expression matching pool names that will
#     be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# The upper-case range must be spelled `A-Z`:  `A-z` would also admit
# the ASCII punctuation between 'Z' and 'a' ('[', '\', ']', '^', '`').

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')
94
95
def _log_message(message, *args):
    """Write one unadorned line to stdout.

    The message is %-formatted with `args` only when at least one
    format argument is supplied; otherwise it is written verbatim, so
    a literal '%' in an argument-less message is safe.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
113
114
def _log_info(dry_run, message, *args):
    """Log an informational line, adapted to dry-run output.

    The line goes to stdout via `_log_message()`.  During a dry run
    the output is meant to be usable as shell commands, so the
    message is emitted as a shell comment (prefixed with '# ');
    otherwise it is emitted as unadorned text.

    @param dry_run  Whether the message is logged for a dry run.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    # The prefix is added before formatting, exactly as the arguments
    # are applied by `_log_message()`.
    _log_message(('# ' + message) if dry_run else message, *args)
133
134
def _log_error(message, *args):
    """Write one line to stderr, marked as an error.

    The message is %-formatted with `args` only when at least one
    format argument is supplied, and is then prefixed with 'ERROR: '.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
152
153
class _DUTPool(object):
    """Information about a pool of DUTs matching given labels.

    This class collects information about all DUTs for a given pool and
    matching the given labels, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from this
          pool.  The DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to
                                  consider.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible
                                  DUTs.
    @property pool_labels         A list of labels that identify a DUT
                                  as part of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        # The pool itself becomes part of the label query used to
        # select the DUTs.
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Fetch the pool's DUTs and sort them into the three lists.

        @param afe          AFE object used to query host histories.
        @param start_time   Start time for the HostJobHistory objects.
        @param end_time     End time for the HostJobHistory objects.

        @return The total number of DUTs found, of all categories.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_count = sum(1 for label in dut.labels
                             if label.startswith(_POOL_PREFIX))
            if pool_count != 1:
                # Hosts that aren't in exactly one pool are never
                # exchanged.
                self.ineligible_hosts.append(dut)
                continue
            status = history.last_diagnosis()[0]
            if status == status_history.WORKING and not dut.locked:
                self.working_hosts.append(dut)
            else:
                self.broken_hosts.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        minimum_total = len(self.ineligible_hosts)
        target_reachable = target_total >= minimum_total
        model = self.labels.get('model', '')
        exhausted = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                'True for each pool/model which requests more DUTs than '
                'supplied',
                # TODO(jrbarnette) The 'board' field is a legacy.  We
                # need to leave it here until we do the extra work
                # Monarch requires to delete a field.
                field_spec=[
                        ts_mon.StringField('pool'),
                        ts_mon.StringField('board'),
                        ts_mon.StringField('model'),
                ])
        exhausted.set(
                not target_reachable,
                fields={
                        'pool': self.pool,
                        'board': model,
                        'model': model,
                },
        )
        if target_reachable:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, minimum_total,
            )
            _log_error('Adjusting target to %d DUTs.', minimum_total)
            target_total = minimum_total
        # Every broken DUT must be replaced, plus or minus the change
        # in the size of the pool.
        return len(self.broken_hosts) + (target_total - self.total_hosts)

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
J. Richard Barnette91d56812015-04-21 10:22:31 -0700312
313
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The transfer is logged and counted in the `duts_moved` metric
    even when `hosts` is empty (the count is then zero).

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            'DUTs transferred between pools',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need to
            # leave it here until we do the extra work Monarch requires to
            # delete a field.
            field_spec=[
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
                    ts_mon.StringField('source_pool'),
                    ts_mon.StringField('target_pool'),
            ]
    ).increment_by(
            len(hosts),
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    # The metric above is reported before this check, so an empty
    # transfer still shows up with a count of zero.
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(removals)
            host.add_labels(additions)
        else:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))
370
371
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Working spares are moved into the pool to replace broken DUTs
    and to reach the requested size; surplus DUTs (broken ones first)
    are moved to the spare pool.  No DUTs are exchanged if the pool
    has more than `arguments.max_broken` broken DUTs, unless
    `arguments.force_rebalance` is set.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    # With no COUNT option, the pool keeps its current size.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Number of broken DUTs that can't be replaced, and so must
        # stay in the pool.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative:  the number of working DUTs to return in
        # addition to all the broken ones.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts),
                  len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        # Cancel the exchange, but still fall through so that the
        # (empty) transfers below are logged and reported.
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # Surplus DUTs go from the main pool to the spare pool; spares
    # go from the spare pool to the main pool.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)
468
469
def _too_many_broken(inventory, pool, args):
    """
    Get the inventory of models and check if too many are broken.

    The check is skipped (reported as passing) when `--force-rebalance`
    is given, or when `--all-models` is combined with a
    `--max-broken-models` of 0.  When no `--max-broken-models` is
    given, the limit defaults to `_MAX_BROKEN_DEFAULT_RATIO` of the
    number of models in the pool, rounded down.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken duts
            exceed max_broken_models, False otherwise.
    """
    # Were we asked to skip this check?
    if args.force_rebalance:
        return False
    if args.all_models and args.max_broken_models == 0:
        return False

    limit = args.max_broken_models
    if limit is None:
        model_count = len(inventory.get_pool_models(pool))
        limit = int(_MAX_BROKEN_DEFAULT_RATIO * model_count)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, limit)

    broken_models = []
    for model, counts in inventory.iteritems():
        if counts.get_broken(pool) != 0:
            broken_models.append(model)
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 len(broken_models), pool, limit)
    for name in sorted(broken_models):
        _log_message(name)
    return len(broken_models) > limit
Kevin Chengcf0ad2b2016-04-19 14:51:39 -0700502
503
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    In addition to the checks made by `argparse`, the command line
    is rejected (via `parser.error()`, which exits the program) when
      + individual models are named together with `--all-models`,
      + a non-default `--spare` pool is combined with balancing all
        critical pools, or
      + a pool name doesn't match `_VALID_POOL_PATTERN`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    # At most one of the options that change the pool size may be used.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    # The balancing check refuses to act only when the number of
    # models with broken DUTs is strictly greater than COUNT.
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if the number of models with '
                 'broken DUTs in the specified pool is at most COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for p in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(p):
            parser.error('Invalid pool name: %s' % p)
    return arguments
602
603
def infer_balancer_targets(afe, arguments, pools):
    """Translate command line arguments into a list of balancing targets.

    With `--all-models`, every model found in the lab inventory for a
    pool becomes a target, unless too many models in that pool have
    broken DUTs; in that case the pool is skipped and an error is
    logged.  Otherwise, the targets are the models named on the
    command line, optionally restricted to the `--sku` given.

    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    # Taking inventory is expensive and doesn't depend on the pool, so
    # it's fetched at most once, and only if it's actually needed.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            quarantine = _too_many_broken(inventory, pool, arguments)
            if quarantine:
                _log_error('Refusing to balance all models for %s pool, '
                           'too many models with at least 1 broken DUT '
                           'detected.', pool)
            else:
                for model in inventory.get_pool_models(pool):
                    labels = labellib.LabelsMapping()
                    labels['model'] = model
                    balancer_targets.append((pool, labels.getlabels()))
            metrics.Boolean(
                'chromeos/autotest/balance_pools/unchanged_pools').set(
                    quarantine, fields={'pool': pool})
            _log_message('Pool %s quarantine status: %s', pool, quarantine)
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -0700642
643
def main(argv):
    """Standard main routine.

    Parses the command line, determines the (pool, labels) targets to
    balance, and balances them in a pool of 8 worker processes.
    Metrics are set up for reporting only for `--production` runs.
    Host histories are examined over the 24 hours ending now.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    metrics_manager = (
            site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
            if arguments.production
            else site_utils.TrivialContextManager())

    with metrics_manager:
        with metrics.SuccessCounter('chromeos/autotest/balance_pools/runs'):
            seconds_per_day = 24 * 60 * 60
            end_time = time.time()
            start_time = end_time - seconds_per_day
            afe = frontend_wrappers.RetryingAFE(server=arguments.web)

            def balancer(pool, labels):
                """Balance the specified model.

                @param pool: The pool to rebalance for the model.
                @param labels: labels to restrict to balancing operations
                        within.
                """
                _balance_model(arguments, afe, pool, labels,
                               start_time, end_time)
                _log_message('')

            if arguments.pool == _ALL_CRITICAL_POOLS:
                pools = lab_inventory.CRITICAL_POOLS
            else:
                pools = [arguments.pool]
            balancer_targets = infer_balancer_targets(afe, arguments, pools)
            try:
                parallel.RunTasksInProcessPool(
                        balancer,
                        balancer_targets,
                        processes=8,
                )
            except KeyboardInterrupt:
                # An interrupt from the user just ends the run quietly.
                pass


if __name__ == '__main__':
    main(sys.argv)