#!/usr/bin/env python2
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
models and swaps them with working DUTs taken from a selected pool
of spares.  The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage:  balance_pool.py [ options ] POOL MODEL [ MODEL ... ]

positional arguments:
  POOL                  Name of the pool to balance
  MODEL                 Names of models to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every MODEL
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        MODEL
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every MODEL
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -p PHASE, --phase PHASE
                        Phase to restrict the balance pool operation to
  --sku SKU             The specific SKU we intend to swap with
  -n, --dry-run         Report actions to take in the form of shell commands


The command attempts to remove all broken DUTs from the target POOL
for every MODEL, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every MODEL in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

"""
56
57
58import argparse
Richard Barnette81ef2242018-06-14 14:34:34 -070059import os
60import re
J. Richard Barnette91d56812015-04-21 10:22:31 -070061import sys
62import time
63
64import common
Xixuan Wu93e646c2017-12-07 18:36:10 -080065from autotest_lib.server import constants
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -070066from autotest_lib.server import site_utils
Aviv Kesheta8834322018-05-07 13:28:32 -070067from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
Aviv Keshet7ee95862016-08-30 15:18:27 -070068from autotest_lib.server.lib import status_history
Kevin Chengcf0ad2b2016-04-19 14:51:39 -070069from autotest_lib.site_utils import lab_inventory
Prathmesh Prabhu68acc402017-11-09 15:24:15 -080070from autotest_lib.utils import labellib
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -070071from chromite.lib import metrics
David James2a3cb542015-05-05 17:13:43 -070072from chromite.lib import parallel
73
Jacob Kopczynskif8d90a82017-10-10 14:37:33 -070074#This must be imported after chromite.lib.metrics
75from infra_libs import ts_mon
J. Richard Barnette91d56812015-04-21 10:22:31 -070076
# Prefix shared by every pool label.  It is used in this file both to
# spot the pool labels on a host and to build the label for a pool name.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# Value of the POOL command line argument that selects every pool in
# lab_inventory.CRITICAL_POOLS instead of a single named pool.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Default for the --spare option.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
J. Richard Barnette91d56812015-04-21 10:22:31 -070085
86
# _VALID_POOL_PATTERN - Regular expression matching pool names that will
# be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# The accepted characters are ASCII letters, digits, '_' and '-'.  The
# upper-case range has to be spelled `A-Z`:  a range written `A-z` also
# covers the ASCII punctuation that sits between 'Z' and 'a', which
# would let characters such as '[', ']', '^' and '`' through.

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_-]+$')
96
97
def _log_message(message, *args):
    """Write one line of plain text to stdout.

    The line is `message`, %-formatted with `args` when any are
    given, followed by a newline.  No prefix or other decoration
    is added.

    @param message  Text to emit, or a format string when `args`
                    is not empty.
    @param args     Values to substitute into `message`.  When
                    empty, `message` is emitted without formatting.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
115
116
def _log_info(dry_run, message, *args):
    """Write an informational line to stdout, dry-run aware.

    During a dry run the line is emitted as a shell comment, that
    is, with '# ' placed in front of it; otherwise the line is
    emitted as is.  In both cases `message` is %-formatted with
    `args` when any are given.

    @param dry_run  Whether the line belongs to a dry run.
    @param message  Text to emit, or a format string when `args`
                    is not empty.
    @param args     Values to substitute into `message`.  When
                    empty, `message` is emitted without formatting.

    """
    line = ('# ' + message) if dry_run else message
    _log_message(line, *args)
135
136
def _log_error(message, *args):
    """Write one error line to stderr.

    The line is 'ERROR: ' followed by `message`, which is
    %-formatted with `args` when any are given, and a newline.

    @param message  Text to emit, or a format string when `args`
                    is not empty.
    @param args     Values to substitute into `message`.  When
                    empty, `message` is emitted without formatting.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
154
155
class _DUTPool(object):
    """Census of the DUTs in one pool that also match a set of labels.

    On construction, the job histories of all DUTs carrying the pool
    label and the given labels are fetched, and every DUT is filed
    under exactly one of three categories:
      + Working - diagnosed as working, and not locked.
      + Broken - anything else, i.e. not diagnosed as working, or
        locked.
      + Ineligible - may not be moved out of this pool, regardless
        of whether it is working or broken.

    A DUT is ineligible when the number of `pool:` labels it carries
    is not exactly one.  This is done for the sake of chameleon
    hosts, which must always be assigned to pool:suites.  These DUTs
    are always marked with pool:chameleon to prevent their
    reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    The same class describes both a main pool that needs to be
    resupplied with working DUTs and a spare pool that supplies them.

    @property pool              Name of the pool.
    @property labels            Label mapping that selects the DUTs;
                                it includes the pool itself.
    @property working_hosts     List of this pool's working DUTs.
    @property broken_hosts      List of this pool's broken DUTs.
    @property ineligible_hosts  List of this pool's ineligible DUTs.
    @property pool_labels       List of the labels that mark a DUT
                                as a member of this pool.
    @property total_hosts       Number of DUTs found, all three
                                categories included.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        self.working_hosts, self.broken_hosts = [], []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Fetch the matching DUTs and file each one under its category.

        @param afe         AFE object used to query the histories.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The number of histories found, which counts working,
                broken and ineligible DUTs alike.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            # The diagnosis is only looked up for DUTs that can
            # actually be exchanged.
            diagnosis = history.last_diagnosis()[0]
            if diagnosis == status_history.WORKING and not dut.locked:
                category = self.working_hosts
            else:
                category = self.broken_hosts
            category.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove in order to take a DUT out
        of the pool, or to add in order to put a DUT into it.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares it takes to bring this pool
        to `target_total` DUTs with all of them working.

        A positive result is the number of spares needed to replace
        broken DUTs and reach the target.  A negative result means
        that no spares are needed, and that this many working DUTs
        can be returned instead.

        A target below the number of ineligible DUTs would require
        returning some of them.  In that case an error is logged,
        and the number of ineligible DUTs is used as the target.

        The outcome of that check is also reported through the
        `exhausted_pools` metric.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        reserved = len(self.ineligible_hosts)
        target_is_reachable = target_total >= reserved
        exhausted_metric = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                'True for each pool/model which requests more DUTs '
                'than supplied',
                # TODO(jrbarnette) The 'board' field is a legacy.  We
                # need to leave it here until we do the extra work
                # Monarch requires to delete a field.
                field_spec=[
                        ts_mon.StringField('pool'),
                        ts_mon.StringField('board'),
                        ts_mon.StringField('model'),
                ])
        model = self.labels.get('model', '')
        exhausted_metric.set(
                not target_is_reachable,
                fields={
                        'pool': self.pool,
                        'board': model,
                        'model': model,
                },
        )
        if target_is_reachable:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, reserved,
            )
            _log_error('Adjusting target to %d DUTs.', reserved)
            target_total = reserved
        return len(self.broken_hosts) + (target_total - self.total_hosts)

    def allocate_surplus(self, num_broken):
        """Pick the DUTs that can be returned as surplus.

        Return the list of DUTs to give up in order to reduce this
        pool's supply.  Broken DUTs come first in the list, before
        any working ones.

        `num_broken` is the number of broken DUTs to leave in the
        pool.  When it is at least the number of broken DUTs in the
        pool, the returned list is empty.  When it is negative, all
        broken DUTs are returned, plus that many working DUTs.

        @param num_broken  Total number of broken DUTs to be left in
                           this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
J. Richard Barnette91d56812015-04-21 10:22:31 -0700314
315
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Move a list of DUTs out of `spare_pool` and into `target_pool`.

    For every given host, the pool labels of `spare_pool` are
    removed, and the pool labels of `target_pool` are added.  The
    parameter names only describe the direction of the move:
    `spare_pool` is the source and `target_pool` the destination,
    whichever role the pools otherwise play.

    If `dry_run` is true, no change is made; the `atest` commands
    that would perform the label changes are logged instead.

    The number of hosts is added to the `duts_moved` counter before
    anything else happens, even when the list is empty.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object to which the hosts
                        will be added.
    @param spare_pool   The `_DUTPool` object from which the hosts
                        are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    moved_counter = metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            'DUTs transferred between pools',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
                    ts_mon.StringField('source_pool'),
                    ts_mon.StringField('target_pool'),
            ]
    )
    model = target_pool.labels.get('model', '')
    moved_counter.increment_by(
            len(hosts),
            fields={
                    'board': model,
                    'model': model,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    if dry_run:
        removal_args = ' '.join(removals)
        addition_args = ' '.join(additions)
        for host in hosts:
            _log_message('atest label remove -m %s %s',
                         host.hostname, removal_args)
            _log_message('atest label add -m %s %s',
                         host.hostname, addition_args)
    else:
        for host in hosts:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(removals)
            host.add_labels(additions)
372
373
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Takes a census of both the pool to balance and the spare pool,
    works out the target size from --total, --grow or --shrink (the
    current size if none was given), then decides which working
    spares to bring in and which DUTs to send back as surplus.  The
    plan is logged and finally carried out with `_exchange_labels()`.

    A pool with more than --max-broken broken DUTs is left alone
    unless --force-rebalance was given; an error is logged instead.

    @param arguments  Parsed command line arguments.
    @param afe        AFE object to be used for the changes.
    @param pool       Pool of the model to be balanced.
    @param labels     Restrict the balancing operation within DUTs
                      that have these labels.
    @param start_time Start time for HostJobHistory objects in
                      the DUT pools.
    @param end_time   End time for HostJobHistory objects in the
                      DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        # Take as many working spares as there are, up to the need;
        # whatever is missing is the number of broken DUTs that have
        # to stay in the pool.
        spare_duts = spare_pool.working_hosts[:spares_needed]
        shortfall = spares_needed - len(spare_duts)
    else:
        # Nothing to bring in.  A negative value is the number of
        # working DUTs to send back on top of the broken ones.
        spare_duts = []
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    if (len(main_pool.broken_hosts) > arguments.max_broken and
        not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        # Cancel the plan; the calls below then move nothing.
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # Both calls are made even with empty lists, so that the transfer
    # is always logged and counted.  Surplus goes from the main pool
    # to the spare pool, spares go the other way.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)
470
471
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, the arguments are checked for consistency, and
    any violation is reported through `ArgumentParser.error()`:
      + Individual models cannot be combined with --all-models.
      + When balancing all critical pools, --spare must be left at
        its default.
      + Both the spare pool and the pool to balance must have names
        matching `_VALID_POOL_PATTERN`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    # At most one way of changing the pool size may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is less than COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                             'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    # The check rejects any spare pool that is NOT the default, so the
    # message has to say so.
    if (arguments.pool == _ALL_CRITICAL_POOLS and
        arguments.spare != _SPARE_DEFAULT):
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for p in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(p):
            parser.error('Invalid pool name: %s' % p)
    return arguments
574
575
def _build_target_labels(model, sku=None, phase=None):
    """Return the label list selecting `model`, narrowed by sku and phase."""
    labels = labellib.LabelsMapping()
    labels['model'] = model
    if sku:
        labels['sku'] = sku
    if phase:
        labels['phase'] = phase
    return labels.getlabels()


def infer_balancer_targets(afe, arguments, pools):
    """Take some arguments and translate them to a list of targets to balance.

    With --all-models, the models of each pool are taken from the lab
    inventory, and only the --phase restriction is applied.  Otherwise
    the models named on the command line are used for every pool, with
    both the --sku and --phase restrictions applied.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced, where
                `labels` is the list of labels that selects the DUTs.

    """
    balancer_targets = []
    # Taking inventory does not depend on the pool, so it is done at
    # most once, and only if --all-models actually requires it.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            models = inventory.get_pool_models(pool)
            sku = None
        else:
            models = arguments.models
            sku = arguments.sku
        for model in models:
            balancer_targets.append(
                    (pool,
                     _build_target_labels(model, sku, arguments.phase)))
    return balancer_targets
Jacob Kopczynskic6e483e2017-08-25 17:28:35 -0700608
609
def main(argv):
    """Standard main routine.

    Parses the command line, works out the list of (pool, labels)
    targets, and balances them in a pool of worker processes.  DUT
    status is judged on the job history of the last 24 hours.

    Metrics are only set up for runs made with --production.  A
    keyboard interrupt during balancing ends the run quietly.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    metrics_manager = (
            site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
            if arguments.production
            else site_utils.TrivialContextManager())

    with metrics_manager, metrics.SuccessCounter(
            'chromeos/autotest/balance_pools/runs'):
        history_window = 24 * 60 * 60
        end_time = time.time()
        start_time = end_time - history_window
        afe = frontend_wrappers.RetryingAFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance the specified model.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]
        balancer_targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(
                    balancer,
                    balancer_targets,
                    processes=8,
            )
        except KeyboardInterrupt:
            # Deliberate: an interrupted run just stops.
            pass
J. Richard Barnette91d56812015-04-21 10:22:31 -0700652
653
# Script entry point.  main() is handed the complete argument vector,
# program name included.
if __name__ == '__main__':
    main(sys.argv)