blob: fb0298f54e3b3f5a702a960989d769405d02b3be [file] [log] [blame]
J. Richard Barnette91d56812015-04-21 10:22:31 -07001#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Adjust pool balances to cover DUT shortfalls.
7
8This command takes all broken DUTs in a specific pool for specific
Richard Barnette5de01eb2017-12-15 09:53:42 -08009models and swaps them with working DUTs taken from a selected pool
J. Richard Barnette91d56812015-04-21 10:22:31 -070010of spares. The command is meant primarily for replacing broken DUTs
11in critical pools like BVT or CQ, but it can also be used to adjust
12pool sizes, or to create or remove pools.
13
Richard Barnette5de01eb2017-12-15 09:53:42 -080014usage: balance_pool.py [ options ] POOL MODEL [ MODEL ... ]
J. Richard Barnette91d56812015-04-21 10:22:31 -070015
16positional arguments:
17 POOL Name of the pool to balance
Richard Barnette5de01eb2017-12-15 09:53:42 -080018 MODEL Names of models to balance
J. Richard Barnette91d56812015-04-21 10:22:31 -070019
20optional arguments:
21 -h, --help show this help message and exit
22 -t COUNT, --total COUNT
23 Set the number of DUTs in the pool to the specified
Richard Barnette5de01eb2017-12-15 09:53:42 -080024 count for every MODEL
J. Richard Barnette91d56812015-04-21 10:22:31 -070025 -a COUNT, --grow COUNT
26 Add the specified number of DUTs to the pool for every
Richard Barnette5de01eb2017-12-15 09:53:42 -080027 MODEL
J. Richard Barnette91d56812015-04-21 10:22:31 -070028 -d COUNT, --shrink COUNT
29 Remove the specified number of DUTs from the pool for
Richard Barnette5de01eb2017-12-15 09:53:42 -080030 every MODEL
J. Richard Barnette91d56812015-04-21 10:22:31 -070031 -s POOL, --spare POOL
32 Pool from which to draw replacement spares (default:
33 pool:suites)
Chung-yih Wangcc1d9cb2017-11-30 11:20:45 +080034 --sku SKU The specific SKU we intend to swap with
J. Richard Barnette91d56812015-04-21 10:22:31 -070035 -n, --dry-run Report actions to take in the form of shell commands
36
37
38The command attempts to remove all broken DUTs from the target POOL
Richard Barnette5de01eb2017-12-15 09:53:42 -080039for every MODEL, and replace them with enough working DUTs taken
J. Richard Barnette91d56812015-04-21 10:22:31 -070040from the spare pool to bring the strength of POOL to the requested
41total COUNT.
42
43If no COUNT options are supplied (i.e. there are no --total, --grow,
44or --shrink options), the command will maintain the current totals of
Richard Barnette5de01eb2017-12-15 09:53:42 -080045DUTs for every MODEL in the target POOL.
J. Richard Barnette91d56812015-04-21 10:22:31 -070046
47If not enough working spares are available, broken DUTs may be left
48in the pool to keep the pool at the target COUNT.
49
50When reducing pool size, working DUTs will be returned after broken
51DUTs, if it's necessary to achieve the target COUNT.
52
53"""
54
55
56import argparse
57import sys
58import time
59
60import common
Xixuan Wu93e646c2017-12-07 18:36:10 -080061from autotest_lib.server import constants
J. Richard Barnette91d56812015-04-21 10:22:31 -070062from autotest_lib.server import frontend
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -070063from autotest_lib.server import site_utils
Aviv Keshet7ee95862016-08-30 15:18:27 -070064from autotest_lib.server.lib import status_history
Kevin Chengcf0ad2b2016-04-19 14:51:39 -070065from autotest_lib.site_utils import lab_inventory
Prathmesh Prabhu68acc402017-11-09 15:24:15 -080066from autotest_lib.utils import labellib
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -070067from chromite.lib import metrics
David James2a3cb542015-05-05 17:13:43 -070068from chromite.lib import parallel
69
Jacob Kopczynskif8d90a82017-10-10 14:37:33 -070070#This must be imported after chromite.lib.metrics
71from infra_libs import ts_mon
J. Richard Barnette91d56812015-04-21 10:22:31 -070072
# Prefix that marks an AFE label as a pool label; used both to recognize
# pool labels on a host and to build the label for a named pool.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against. It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# Special value for the POOL argument that selects every critical pool.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Default value of the --spare option.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
J. Richard Barnette91d56812015-04-21 10:22:31 -070081
82
def _log_message(message, *args):
    """Write one unadorned line to stdout.

    When format arguments are given, `message` is used as a
    %-style format string and expanded with them.  With no
    arguments the text is written as is, so a literal `%` in
    `message` is harmless.

    @param message  Text (or format string) to be written.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
100
101
def _log_info(dry_run, message, *args):
    """Write an informational line to stdout, dry-run aware.

    During a dry run the output is meant to be usable as a shell
    script, so informational text is emitted as a shell comment
    (prefixed with `# `).  Otherwise the line is written without
    adornment.

    If `*args` are supplied, the message is formatted with them.

    @param dry_run  Whether the logging is for a dry run.
    @param message  Text (or format string) to be written.
    @param args     Optional format arguments for `message`.

    """
    line = '# ' + message if dry_run else message
    _log_message(line, *args)
120
121
def _log_error(message, *args):
    """Write one line to stderr, marked as an error.

    The line is prefixed with `ERROR: `.  When format arguments
    are given, `message` is used as a %-style format string and
    expanded with them; otherwise it is written as is.

    @param message  Text (or format string) to be written.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
139
140
class _DUTPool(object):
    """Snapshot of the DUTs in one pool that match a set of labels.

    On construction the pool is queried once, and every DUT found
    is sorted into exactly one of three lists:
      + Working - the DUT is diagnosed as working, and not locked.
      + Broken - the DUT is not diagnosed as working, or it is
        locked.
      + Ineligible - the DUT may not be moved out of this pool.
        Such a DUT may be either working or broken.

    A DUT is ineligible when it does not carry exactly one pool
    label.  This is done for the sake of chameleon hosts, which
    must always be assigned to pool:suites.  These DUTs are always
    marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette): The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    The same class describes both a main pool that needs to be
    resupplied with working DUTs and a spare pool that supplies
    them.

    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property labels            Labels that constrain the DUTs to
                                consider, including the pool.
    @property working_hosts     The list of this pool's working DUTs.
    @property broken_hosts      The list of this pool's broken DUTs.
    @property ineligible_hosts  The list of this pool's ineligible
                                DUTs.
    @property pool_labels       A list of labels that identify a DUT
                                as part of this pool.
    @property total_hosts       The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        # The three lists are filled in by _get_hosts().
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)

    def _get_hosts(self, afe, start_time, end_time):
        """Query the pool's DUTs and sort them into the three lists.

        @param afe         AFE object used for the query.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The number of DUT histories found for this pool.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_labels = [label for label in dut.labels
                           if label.startswith(_POOL_PREFIX)]
            if len(pool_labels) != 1:
                self.ineligible_hosts.append(dut)
                continue
            status = history.last_diagnosis()[0]
            if status == status_history.WORKING and not dut.locked:
                bucket = self.working_hosts
            else:
                bucket = self.broken_hosts
            bucket.append(dut)
        return len(histories)

    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove in order to take a DUT out
        of the pool, or to add in order to put a DUT into it.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to reach the
        given `target_total` with all DUTs working.

        The result may be positive or negative.  A positive value
        is the number of spares needed to replace broken DUTs and
        reach the target; a negative value means no spares are
        needed, and that many working DUTs can be returned.

        Ineligible DUTs cannot leave the pool, so a target below
        their number cannot be met.  In that case an error is
        logged and the target is raised to the number of
        ineligible DUTs.  The `exhausted_pools` metric records
        whether that adjustment was necessary.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        minimum = len(self.ineligible_hosts)
        target_is_feasible = target_total >= minimum
        model = self.labels.get('model', '')
        exhausted = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                'True for each pool/model which requests more DUTs than supplied',
                # TODO(jrbarnette) The 'board' field is a legacy.  We need
                # to leave it here until we do the extra work Monarch
                # requires to delete a field.
                field_spec=[
                        ts_mon.StringField('pool'),
                        ts_mon.StringField('board'),
                        ts_mon.StringField('model'),
                ])
        exhausted.set(
                not target_is_feasible,
                fields={
                        'pool': self.pool,
                        'board': model,
                        'model': model,
                },
        )
        if target_is_feasible:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, minimum,
            )
            _log_error('Adjusting target to %d DUTs.', minimum)
            target_total = minimum
        size_change = target_total - self.total_hosts
        return len(self.broken_hosts) + size_change

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return the DUTs that may be given up in order to reduce
        this pool's supply.  Broken DUTs are given up before
        working ones.

        `num_broken` is the number of broken DUTs to leave in the
        pool.  If it is at least the number of broken DUTs in the
        pool, the result is empty.  If it is negative, the result
        is every broken DUT plus that many working DUTs.

        @param num_broken  Total number of broken DUTs to be left
                           in this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
J. Richard Barnette91d56812015-04-21 10:22:31 -0700299
300
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Move a list of DUTs out of one pool and into another.

    Every host in `hosts` has the labels of `spare_pool` removed
    and the labels of `target_pool` added, i.e. the hosts move
    from `spare_pool` to `target_pool`.

    If `dry_run` is true, nothing is changed; the `atest` commands
    that would make the label changes are logged instead.

    The `duts_moved` counter is incremented by the number of hosts
    before anything else is done, so it is updated for empty lists
    and for dry runs as well.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object to which the hosts
                        will be added.
    @param spare_pool   The `_DUTPool` object from which the hosts
                        are drawn.

    """
    moved = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              moved, spare_pool.pool, target_pool.pool)
    model = target_pool.labels.get('model', '')
    duts_moved = metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            'DUTs transferred between pools',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need to
            # leave it here until we do the extra work Monarch requires to
            # delete a field.
            field_spec=[
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
                    ts_mon.StringField('source_pool'),
                    ts_mon.StringField('target_pool'),
            ]
    )
    duts_moved.increment_by(
            moved,
            fields={
                    'board': model,
                    'model': model,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    to_add = target_pool.pool_labels
    to_remove = spare_pool.pool_labels
    if dry_run:
        for host in hosts:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(to_remove))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(to_add))
    else:
        for host in hosts:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(to_remove)
            host.add_labels(to_add)
357
358
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    The target pool size is the pool's current size, unless one of
    `arguments.total`, `arguments.grow` or `arguments.shrink`
    changes it.  Working DUTs are then taken from the spare pool
    to cover the need, and surplus DUTs (broken ones first) are
    handed back to the spare pool.

    No DUTs are exchanged if the main pool has more than
    `arguments.max_broken` broken DUTs, unless
    `arguments.force_rebalance` is set.

    @param arguments   Parsed command line arguments.
    @param afe         AFE object to be used for the changes.
    @param pool        Pool of the model to be balanced.
    @param labels      Restrict the balancing operation within DUTs
                       that have these labels.
    @param start_time  Start time for HostJobHistory objects in
                       the DUT pools.
    @param end_time    End time for HostJobHistory objects in the
                       DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Positive shortfall: broken DUTs that must stay because the
        # spare pool could not cover them.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: the pool has DUTs to give back.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts),
                  len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve: many broken DUTs in one pool suggests a bug that is
    # bricking devices, so don't feed it more DUTs unless forced to.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # Both exchanges are always invoked, even with empty lists.
    # Surplus DUTs move from the main pool to the spare pool ...
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    # ... and replacement DUTs move from the spare pool to the main pool.
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)
455
456
def _too_many_broken(inventory, pool, args):
    """
    Check whether too many models in a pool have broken DUTs.

    The check is skipped (the result is False) when
    `args.force_rebalance` is set, or when `args.all_models` is set
    and `args.max_broken_models` is 0.

    When `args.max_broken_models` is None, the threshold defaults
    to a fixed ratio of the number of models in the pool.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken duts
            exceed max_broken_models, False otherwise.
    """
    # Were we asked to skip this check?
    if args.force_rebalance:
        return False
    if args.all_models and args.max_broken_models == 0:
        return False

    threshold = args.max_broken_models
    if threshold is None:
        model_count = len(inventory.get_pool_models(pool))
        threshold = int(_MAX_BROKEN_DEFAULT_RATIO * model_count)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, threshold)

    broken_models = []
    for model, counts in inventory.iteritems():
        if counts.get_broken(pool) != 0:
            broken_models.append(model)
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 len(broken_models), pool, threshold)
    for name in sorted(broken_models):
        _log_message(name)
    return len(broken_models) > threshold
Kevin Chengcf0ad2b2016-04-19 14:51:39 -0700489
490
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Two combinations are rejected with a usage error after parsing:
    naming individual models together with --all-models, and
    overriding --spare while balancing all critical pools.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
            '-w', '--web', type=str, default=None,
            help='AFE host to use. Default comes from shadow_config.',
    )
    # At most one of the pool-size adjustments may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models. This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    # The check refuses only when the count of models with broken DUTs
    # exceeds COUNT, so the limit is inclusive.
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is at most COUNT. With '
                 '--all-models, a COUNT of 0 bypasses the check.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance. Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    # This fires when --spare names a pool other than the default, so
    # the message has to say so.
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    return arguments
586
587
def infer_balancer_targets(afe, arguments, pools):
    """Translate the command line arguments to a list of balancing targets.

    With `arguments.all_models`, every model the lab inventory
    lists for a pool becomes a target, unless too many models in
    that pool have broken DUTs; in that case the pool is skipped
    with an error.  The outcome of that check is recorded in the
    `unchanged_pools` metric.

    Otherwise the targets are the models named in
    `arguments.models`, restricted to `arguments.sku` when one is
    given.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    # Taking inventory is expensive and does not depend on the pool, so
    # fetch it at most once, and only if --all-models needs it.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            quarantine = _too_many_broken(inventory, pool, arguments)
            if quarantine:
                _log_error('Refusing to balance all models for %s pool, '
                           'too many models with at least 1 broken DUT '
                           'detected.', pool)
            else:
                for model in inventory.get_pool_models(pool):
                    labels = labellib.LabelsMapping()
                    labels['model'] = model
                    balancer_targets.append((pool, labels.getlabels()))
            metrics.Boolean(
                    'chromeos/autotest/balance_pools/unchanged_pools').set(
                            quarantine, fields={'pool': pool})
            _log_message('Pool %s quarantine status: %s', pool, quarantine)
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -0700626
627
628def main(argv):
629 """Standard main routine.
630
631 @param argv Command line arguments including `sys.argv[0]`.
632
633 """
Jacob Kopczynskif8d90a82017-10-10 14:37:33 -0700634 arguments = _parse_command(argv)
635 if arguments.production:
Aviv Keshet2cc427d2018-04-18 13:39:24 -0700636 metrics_manager = site_utils.SetupTsMonGlobalState('balance_pools',
637 indirect=True)
Jacob Kopczynskif8d90a82017-10-10 14:37:33 -0700638 else:
Jacob Kopczynskif8d90a82017-10-10 14:37:33 -0700639 metrics_manager = site_utils.TrivialContextManager()
640
641 with metrics_manager:
Aviv Keshet259a6502018-04-18 13:48:02 -0700642 with metrics.SuccessCounter('chromeos/autotest/balance_pools/runs'):
643 end_time = time.time()
644 start_time = end_time - 24 * 60 * 60
645 afe = frontend.AFE(server=arguments.web)
Prathmesh Prabhubb5cb292017-11-09 16:42:48 -0800646
Aviv Keshet259a6502018-04-18 13:48:02 -0700647 def balancer(pool, labels):
648 """Balance the specified model.
Prathmesh Prabhubb5cb292017-11-09 16:42:48 -0800649
Aviv Keshet259a6502018-04-18 13:48:02 -0700650 @param pool: The pool to rebalance for the model.
651 @param labels: labels to restrict to balancing operations
652 within.
653 """
654 _balance_model(arguments, afe, pool, labels,
655 start_time, end_time)
656 _log_message('')
Prathmesh Prabhubb5cb292017-11-09 16:42:48 -0800657
Aviv Keshet259a6502018-04-18 13:48:02 -0700658 pools = (lab_inventory.CRITICAL_POOLS
659 if arguments.pool == _ALL_CRITICAL_POOLS
660 else [arguments.pool])
661 balancer_targets = infer_balancer_targets(afe, arguments, pools)
662 try:
663 parallel.RunTasksInProcessPool(
664 balancer,
665 balancer_targets,
666 processes=8,
667 )
668 except KeyboardInterrupt:
669 pass
J. Richard Barnette91d56812015-04-21 10:22:31 -0700670
671
# Script entry point: main() expects the full argument vector,
# including the command name in argv[0].
if __name__ == '__main__':
    main(sys.argv)