J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright 2015 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """Adjust pool balances to cover DUT shortfalls. |
| 7 | |
| 8 | This command takes all broken DUTs in a specific pool for specific |
| 9 | boards and swaps them with working DUTs taken from a selected pool |
| 10 | of spares. The command is meant primarily for replacing broken DUTs |
| 11 | in critical pools like BVT or CQ, but it can also be used to adjust |
| 12 | pool sizes, or to create or remove pools. |
| 13 | |
| 14 | usage: balance_pool.py [ options ] POOL BOARD [ BOARD ... ] |
| 15 | |
| 16 | positional arguments: |
| 17 | POOL Name of the pool to balance |
| 18 | BOARD Names of boards to balance |
| 19 | |
| 20 | optional arguments: |
| 21 | -h, --help show this help message and exit |
| 22 | -t COUNT, --total COUNT |
| 23 | Set the number of DUTs in the pool to the specified |
| 24 | count for every BOARD |
| 25 | -a COUNT, --grow COUNT |
| 26 | Add the specified number of DUTs to the pool for every |
| 27 | BOARD |
| 28 | -d COUNT, --shrink COUNT |
| 29 | Remove the specified number of DUTs from the pool for |
| 30 | every BOARD |
| 31 | -s POOL, --spare POOL |
| 32 | Pool from which to draw replacement spares (default: |
| 33 | pool:suites) |
| 34 | -n, --dry-run Report actions to take in the form of shell commands |
| 35 | |
| 36 | |
| 37 | The command attempts to remove all broken DUTs from the target POOL |
| 38 | for every BOARD, and replace them with enough working DUTs taken |
| 39 | from the spare pool to bring the strength of POOL to the requested |
| 40 | total COUNT. |
| 41 | |
| 42 | If no COUNT options are supplied (i.e. there are no --total, --grow, |
| 43 | or --shrink options), the command will maintain the current totals of |
| 44 | DUTs for every BOARD in the target POOL. |
| 45 | |
| 46 | If not enough working spares are available, broken DUTs may be left |
| 47 | in the pool to keep the pool at the target COUNT. |
| 48 | |
| 49 | When reducing pool size, working DUTs will be returned after broken |
| 50 | DUTs, if it's necessary to achieve the target COUNT. |
| 51 | |
| 52 | """ |
| 53 | |
| 54 | |
| 55 | import argparse |
| 56 | import sys |
| 57 | import time |
| 58 | |
| 59 | import common |
| 60 | from autotest_lib.server import frontend |
Jacob Kopczynski | c6e483e | 2017-08-25 17:28:35 -0700 | [diff] [blame] | 61 | from autotest_lib.server import site_utils |
Aviv Keshet | 7ee9586 | 2016-08-30 15:18:27 -0700 | [diff] [blame] | 62 | from autotest_lib.server.lib import status_history |
Kevin Cheng | cf0ad2b | 2016-04-19 14:51:39 -0700 | [diff] [blame] | 63 | from autotest_lib.site_utils import lab_inventory |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 64 | from autotest_lib.site_utils.suite_scheduler import constants |
Prathmesh Prabhu | 68acc40 | 2017-11-09 15:24:15 -0800 | [diff] [blame^] | 65 | from autotest_lib.utils import labellib |
Jacob Kopczynski | c6e483e | 2017-08-25 17:28:35 -0700 | [diff] [blame] | 66 | from chromite.lib import metrics |
David James | 2a3cb54 | 2015-05-05 17:13:43 -0700 | [diff] [blame] | 67 | from chromite.lib import parallel |
| 68 | |
Jacob Kopczynski | f8d90a8 | 2017-10-10 14:37:33 -0700 | [diff] [blame] | 69 | #This must be imported after chromite.lib.metrics |
| 70 | from infra_libs import ts_mon |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 71 | |
# Prefix that marks an AFE label as a pool label.  It is prepended to a
# pool name to build the label, and used to pick pool labels out of a
# host's label list.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all boards we should calculate the default max number of
# broken boards against.  It seemed like the best choice that was neither too
# strict nor lax.  It is only applied when --max-broken-boards is not given.
_MAX_BROKEN_BOARDS_DEFAULT_RATIO = 3.0 / 8.0

# Special value for the POOL argument that requests balancing of all
# critical pools rather than a single named pool.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Default for the --spare option: the pool that supplies replacement DUTs.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 80 | |
| 81 | |
def _log_message(message, *args):
    """Write one unadorned line of text to stdout.

    When format arguments are given, `message` is treated as a
    %-style format string and expanded with them first.  Without
    arguments no expansion takes place, so a literal `%` in the
    message is safe.

    @param message  Text to log, or a format string when `args` is
                    not empty.
    @param args     Format arguments.  If empty, the message is
                    logged without formatting.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
| 99 | |
| 100 | |
def _log_info(dry_run, message, *args):
    """Log an informational line, adapted to dry-run output.

    The line goes to stdout through `_log_message()`.  During a dry
    run the output is meant to be usable as a shell script, so the
    message is emitted as a shell comment (prefixed with `# `);
    otherwise it is printed as plain text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the logging is for a dry run or for
                    actual execution.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is
                    logged without formatting.

    """
    line = '# ' + message if dry_run else message
    _log_message(line, *args)
| 119 | |
| 120 | |
def _log_error(message, *args):
    """Write one line to stderr, tagged as an error.

    The line is prefixed with `ERROR: ` so that it stands out from
    ordinary output.

    If `*args` are supplied, the message will be formatted using
    the arguments; otherwise it is written without formatting.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is
                    logged without formatting.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
| 138 | |
| 139 | |
class _DUTPool(object):
    """Information about a pool of DUTs for a given board.

    This class collects information about all DUTs for a given
    board and pool pair, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from
          this pool.  The DUT may be either working or broken.

    A DUT whose number of `pool:` labels is not exactly one is
    ineligible for exchange during balancing.  This is done for the
    sake of chameleon hosts, which must always be assigned to
    pool:suites.  These DUTs are always marked with pool:chameleon
    to prevent their reassignment.

    |extra_labels| may be used to restrict the pool down to a subset
    of a given board+pool, by specifying additional labels that all
    DUTs are required to possess.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property board             Name of the board associated with
                                this pool of DUTs.
    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property working_hosts     The list of this pool's working
                                DUTs.
    @property broken_hosts      The list of this pool's broken
                                DUTs.
    @property ineligible_hosts  The list of this pool's ineligible
                                DUTs.
    @property pool_labels       A list of labels that identify a DUT
                                as part of this pool.
    @property total_hosts       The total number of hosts in the
                                pool, counting all three categories.

    """

    def __init__(self, afe, board, pool, start_time, end_time,
                 extra_labels=None):
        """Query the AFE and categorize the DUTs of `board` in `pool`.

        @param afe           AFE object used to look up host histories.
        @param board         Name of the board.
        @param pool          Name of the pool, without the pool prefix.
        @param start_time    Start time for the HostJobHistory query.
        @param end_time      End time for the HostJobHistory query.
        @param extra_labels  Optional extra labels that all DUTs must
                             possess.

        """
        self.board = board
        self.pool = pool
        self._extra_labels = extra_labels or []
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        # _get_hosts() fills in the three host lists as a side effect
        # and returns the overall count.
        self.total_hosts = self._get_hosts(afe, start_time, end_time)
        self._labels = [_POOL_PREFIX + self.pool]


    def _get_hosts(self, afe, start_time, end_time):
        """Fetch this pool's hosts and sort them into categories.

        Every host found is appended to exactly one of
        `working_hosts`, `broken_hosts` or `ineligible_hosts`.

        @param afe         AFE object used to look up host histories.
        @param start_time  Start time for the HostJobHistory query.
        @param end_time    End time for the HostJobHistory query.

        @return The number of hosts found, across all categories.

        """
        wanted = labellib.LabelsMapping(self._extra_labels)
        wanted['board'] = self.board
        wanted['pool'] = self.pool
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, wanted.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(1 for label in dut.labels
                                   if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            # The diagnosis is only requested for eligible hosts; its
            # first element is the status compared against WORKING.
            status = history.last_diagnosis()[0]
            if status == status_history.WORKING and not dut.locked:
                self.working_hosts.append(dut)
            else:
                self.broken_hosts.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.  The `exhausted_pools`
        metric is set to true exactly when that adjustment was
        necessary, and to false otherwise.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        # Ineligible DUTs can never leave the pool, so they form a
        # floor for the pool size.
        target_is_reachable = target_total >= num_ineligible
        if not target_is_reachable:
            _log_error('%s %s pool: Target of %d is below '
                       'minimum of %d DUTs.',
                       self.board, self.pool,
                       target_total, num_ineligible)
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        size_change = target_total - self.total_hosts
        exhausted_metric = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                "True for each pool/board which requests more DUTs than supplied",
                field_spec=[
                        ts_mon.StringField('pool'),
                        ts_mon.StringField('board')])
        exhausted_metric.set(
                not target_is_reachable,
                fields={'pool': self.pool, 'board': self.board})
        return len(self.broken_hosts) + size_change

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken  Total number of broken DUTs to be left in
                           this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            # Give back every broken DUT plus `-num_broken` working ones.
            num_working = -num_broken
            return self.broken_hosts + self.working_hosts[:num_working]
        # Keep the first `num_broken` broken DUTs; the rest are surplus.
        return self.broken_hosts[num_broken:]
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 295 | |
| 296 | |
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.  Nothing is
    logged or changed when `hosts` is empty.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    NOTE: the `duts_moved` counter is incremented whenever `hosts` is
    not empty, for dry runs as well as real ones.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object to which the hosts
                        will be added.
    @param spare_pool   The `_DUTPool` object from which the hosts
                        are drawn.

    """
    if not hosts:
        return
    num_hosts = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              num_hosts, spare_pool.pool, target_pool.pool)
    moved_counter = metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            "DUTs transferred between pools",
            field_spec=[ts_mon.StringField('board'),
                        ts_mon.StringField('source_pool'),
                        ts_mon.StringField('target_pool')])
    moved_counter.increment_by(
            num_hosts,
            fields={'board': target_pool.board,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool})
    labels_to_add = target_pool.pool_labels
    labels_to_remove = spare_pool.pool_labels
    for dut in hosts:
        if dry_run:
            # Only describe the change, as equivalent shell commands.
            _log_message('atest label remove -m %s %s',
                         dut.hostname, ' '.join(labels_to_remove))
            _log_message('atest label add -m %s %s',
                         dut.hostname, ' '.join(labels_to_add))
            continue
        _log_message('Updating host: %s.', dut.hostname)
        dut.remove_labels(labels_to_remove)
        dut.add_labels(labels_to_add)
| 341 | |
| 342 | |
def _balance_board(arguments, afe, board, pool, start_time, end_time,
                   extra_labels=None):
    """Balance one board as requested by command line arguments.

    The function takes stock of the main pool and of the spare pool
    (`arguments.spare`) for `board`, works out the target size of the
    main pool, and then exchanges DUTs between the two pools:  surplus
    (preferably broken) DUTs go from the main pool to the spare pool,
    and working spares go the other way.

    The target size is the current size of the main pool, unless
    overridden by `arguments.total`, or adjusted by `arguments.grow`
    or `arguments.shrink`.

    No exchange is made if the main pool has more than
    `arguments.max_broken` broken DUTs, unless
    `arguments.force_rebalance` is set.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param board         Board to be balanced.
    @param pool          Pool of the board to be balanced.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.
    @param extra_labels  Optional extra labels that all DUTs must possess.

    """
    dry_run = arguments.dry_run
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time, extra_labels)
    main_pool = _DUTPool(afe, board, pool,
                         start_time, end_time, extra_labels)

    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # A positive shortfall is the number of broken DUTs that must
        # stay in the main pool for lack of working spares.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: no spares are drawn, and `-shortfall`
        # working DUTs are to be given back.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', board, main_pool.pool)
        if extra_labels:
            _log_info(dry_run, 'Restricting to extra labels: %s', extra_labels)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        # The count reported here belongs to the spare pool, so the
        # message names the spare pool and says which pool it serves.
        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing '
                  'pool %s.',
                  board, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      board, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve: many broken DUTs on one board may point to a bug
    # that is bricking devices, so don't feed it more DUTs unasked.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   board, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this board to see if there is a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        return

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(dry_run, 'No exchange required.')
        return

    # Return the surplus first (main pool -> spare pool), then bring
    # in the replacements (spare pool -> main pool).
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
| 445 | |
| 446 | |
def _too_many_broken_boards(inventory, pool, arguments):
    """
    Get the inventory of boards and check if too many boards are broken.

    The check is skipped, and False returned, when
    `arguments.force_rebalance` is set or when
    `arguments.max_broken_boards` is 0.  When
    `arguments.max_broken_boards` is None, the limit defaults to a
    fixed ratio of the number of managed boards in the pool.

    The boards found to have broken DUTs are logged to stdout.

    @param inventory: inventory object to determine board status inventory.
    @param pool: The pool to check on for the board.
    @param arguments Parsed command line arguments.

    @return True if the number of boards with 1 or more broken duts exceed
            max_broken_boards, False otherwise.
    """
    # Nothing to check if the user asked to bypass the limit.
    if arguments.force_rebalance or arguments.max_broken_boards == 0:
        return False

    # Use the limit given on the command line, or else derive a
    # default from the total number of boards in the pool.
    # TODO(kevcheng): Revisit to see if there's a better way to
    # calculate the default max_broken_boards.
    threshold = arguments.max_broken_boards
    if threshold is None:
        num_managed = len(inventory.get_managed_boards(pool=pool))
        threshold = int(_MAX_BROKEN_BOARDS_DEFAULT_RATIO * num_managed)
        _log_info(arguments.dry_run,
                  'Default max broken boards calculated to be %d for '
                  '%s pool',
                  threshold, pool)

    boards_with_broken = sorted(
            name for name, counts in inventory.by_board.iteritems()
            if counts.get_broken(pool) != 0)
    num_broken = len(boards_with_broken)
    # TODO(kevcheng): Track which boards have broken duts, we can limit the
    # number of boards we go through in the main loop with this knowledge.
    _log_message('There are %d boards in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)', num_broken,
                 pool, threshold)
    for name in boards_with_broken:
        _log_message(name)
    return num_broken > threshold
| 490 | |
| 491 | |
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Beyond what `argparse` checks by itself, the function reports a
    usage error (via `ArgumentParser.error()`, which exits) when:
      + neither BOARD arguments nor --all-boards are given;
      + both BOARD arguments and --all-boards are given;
      + all critical pools are to be balanced, and --spare names a
        pool other than the default spare pool.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    # At most one of the options that change the pool size may be used.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every BOARD')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every BOARD')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every BOARD')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument('--all-boards', action='store_true',
                        help='Rebalance all managed boards.  This will do a '
                             'very expensive check to see how many boards have '
                             'at least one broken DUT. To bypass that check, '
                             'set --max-broken-boards to 0.')
    # The metavar makes the usage text agree with the COUNT named in
    # the help; the attribute is still `max_broken_boards`.
    parser.add_argument('--max-broken-boards',
                        default=None, type=int, metavar='COUNT',
                        help='Only rebalance all boards if number of boards '
                             'with broken DUTs in the specified pool '
                             'is at most COUNT.')

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('boards', nargs='*',
                        metavar='BOARD',
                        help='Names of boards to balance.')

    parser.add_argument('--model', type=str, action='store', metavar='MODEL',
                        help='Optional name of model to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if not arguments.boards and not arguments.all_boards:
        parser.error('No boards specified. To balance all boards, use '
                     '--all-boards')
    if arguments.boards and arguments.all_boards:
        parser.error('Cannot specify boards with --all-boards.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # The error is for choosing a spare pool that is NOT the
        # default one, and the message has to say so.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    return arguments
| 588 | |
| 589 | |
def specify_balance_args(afe, arguments, pools):
    """Translate parsed arguments into a list of boards to balance.

    If `arguments.all_boards` is set, the lab inventory is fetched once
    and, for each pool, `_too_many_broken_boards()` is consulted.  A
    pool that fails that check is skipped with an error message; for
    any other pool, every managed board in the pool is selected.  The
    result of the check is recorded per pool in the
    `chromeos/autotest/balance_pools/unchanged_pools` metric.

    Otherwise, every board in `arguments.boards` is paired with every
    pool in `pools`.

    If `arguments.model` is set, the `extra_labels` element of each
    tuple is `['model:<MODEL>']`; otherwise it is an empty list.

    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (board, pool, extra_labels) tuples to be balanced

    """
    # N.B. The same `extra_labels` list object is shared by every tuple
    # returned below.
    extra_labels = []
    if arguments.model:
        extra_labels = ['model:' + arguments.model]

    board_info = []
    if arguments.all_boards:
        inventory = lab_inventory.get_inventory(afe)
        for pool in pools:
            quarantine = _too_many_broken_boards(inventory, pool, arguments)
            if quarantine:
                _log_error('Refusing to balance all boards for %s pool, '
                           'too many boards with at least 1 broken DUT '
                           'detected.', pool)
            else:
                boards_in_pool = inventory.get_managed_boards(pool=pool)
                board_info.extend([(board, pool, extra_labels)
                                   for board in boards_in_pool])
            # Report for every pool whether it was left unchanged.
            metrics.Boolean(
                'chromeos/autotest/balance_pools/unchanged_pools').set(
                    quarantine, fields={'pool': pool})
    else:
        # We have specified boards with a specified pool, setup the args to
        # the balancer properly.
        boards = arguments.boards
        for pool in pools:
            board_info.extend([(board, pool, extra_labels)
                               for board in boards])
    return board_info
| 630 | |
| 631 | |
def main(argv):
    """Standard main routine.

    Parses the command line, works out which (board, pool, extra_labels)
    combinations to balance, and runs the balancing tasks in a process
    pool.  Metrics are flushed on the way out.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    seconds_per_day = 24 * 60 * 60
    end_time = time.time()
    # `start_time` is exactly 24 hours before `end_time`.
    start_time = end_time - seconds_per_day

    arguments = _parse_command(argv)

    def balancer(board, pool, extra_labels):
        """Balance the specified board.

        @param board         The board name.
        @param pool          The pool to rebalance for the board.
        @param extra_labels  extra labels to restrict to
        """
        # `afe` is bound further down, before any task is run.
        _balance_board(arguments, afe, board, pool, start_time, end_time,
                       extra_labels)
        _log_message('')

    # The ts_mon context manager is only set up for --production runs.
    if not arguments.production:
        metrics_manager = site_utils.TrivialContextManager()
    else:
        metrics_manager = site_utils.SetupTsMonGlobalState(
                'balance_pools',
                short_lived=True,
                auto_flush=False,
        )

    with metrics_manager:
        try:
            afe = frontend.AFE(server=arguments.web)
            if arguments.pool == _ALL_CRITICAL_POOLS:
                pools = lab_inventory.CRITICAL_POOLS
            else:
                pools = [arguments.pool]
            tasks = specify_balance_args(afe, arguments, pools)
            try:
                parallel.RunTasksInProcessPool(balancer, tasks, processes=8)
            except KeyboardInterrupt:
                # An interrupt is swallowed here; control falls through
                # to the metrics flush below.
                pass
        finally:
            metrics.Flush()
| 678 | |
if __name__ == '__main__':
    # Executed as a script: pass the full argument vector, including
    # the program name in argv[0], to main().
    main(argv=sys.argv)