J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright 2015 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """Adjust pool balances to cover DUT shortfalls. |
| 7 | |
| 8 | This command takes all broken DUTs in a specific pool for specific |
| 9 | boards and swaps them with working DUTs taken from a selected pool |
| 10 | of spares. The command is meant primarily for replacing broken DUTs |
| 11 | in critical pools like BVT or CQ, but it can also be used to adjust |
| 12 | pool sizes, or to create or remove pools. |
| 13 | |
| 14 | usage: balance_pool.py [ options ] POOL BOARD [ BOARD ... ] |
| 15 | |
| 16 | positional arguments: |
| 17 | POOL Name of the pool to balance |
| 18 | BOARD Names of boards to balance |
| 19 | |
| 20 | optional arguments: |
| 21 | -h, --help show this help message and exit |
| 22 | -t COUNT, --total COUNT |
| 23 | Set the number of DUTs in the pool to the specified |
| 24 | count for every BOARD |
| 25 | -a COUNT, --grow COUNT |
| 26 | Add the specified number of DUTs to the pool for every |
| 27 | BOARD |
| 28 | -d COUNT, --shrink COUNT |
| 29 | Remove the specified number of DUTs from the pool for |
| 30 | every BOARD |
| 31 | -s POOL, --spare POOL |
| 32 | Pool from which to draw replacement spares (default: |
| 33 | pool:suites) |
| 34 | -n, --dry-run Report actions to take in the form of shell commands |
| 35 | |
| 36 | |
| 37 | The command attempts to remove all broken DUTs from the target POOL |
| 38 | for every BOARD, and replace them with enough working DUTs taken |
| 39 | from the spare pool to bring the strength of POOL to the requested |
| 40 | total COUNT. |
| 41 | |
| 42 | If no COUNT options are supplied (i.e. there are no --total, --grow, |
| 43 | or --shrink options), the command will maintain the current totals of |
| 44 | DUTs for every BOARD in the target POOL. |
| 45 | |
| 46 | If not enough working spares are available, broken DUTs may be left |
| 47 | in the pool to keep the pool at the target COUNT. |
| 48 | |
| 49 | When reducing pool size, working DUTs will be returned after broken |
| 50 | DUTs, if it's necessary to achieve the target COUNT. |
| 51 | |
J. Richard Barnette | 70c03b0 | 2015-05-26 14:33:17 -0700 | [diff] [blame] | 52 | If the selected target POOL is for a Freon board, *and* the selected |
| 53 | spare pool has no DUTs (in any state), *and* the corresponding |
| 54 | non-Freon spare pool is populated, then the non-Freon pool will |
| 55 | be used for the Freon board. A similar rule applies to balancing |
| 56 | non-Freon boards when there is an available Freon spare pool. |
| 57 | |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 58 | """ |
| 59 | |
| 60 | |
| 61 | import argparse |
| 62 | import sys |
| 63 | import time |
| 64 | |
| 65 | import common |
| 66 | from autotest_lib.server import frontend |
David James | 8352bc2 | 2015-05-05 16:37:05 -0700 | [diff] [blame^] | 67 | from autotest_lib.site_utils import host_label_utils |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 68 | from autotest_lib.site_utils import status_history |
| 69 | from autotest_lib.site_utils.suite_scheduler import constants |
| 70 | |
David James | 2a3cb54 | 2015-05-05 17:13:43 -0700 | [diff] [blame] | 71 | from chromite.lib import parallel |
| 72 | |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 73 | |
# Label prefixes; prepending one to a pool or board name yields the
# corresponding AFE label for that pool or board.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
_BOARD_PREFIX = constants.Labels.BOARD_PREFIX

# Trailing tag that identifies a board name as a freon board.
_FREON_BOARD_TAG = 'freon'
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 78 | |
| 79 | |
| 80 | def _log_message(message, *args): |
| 81 | """Log a message with optional format arguments to stdout. |
| 82 | |
| 83 | This function logs a single line to stdout, with formatting |
| 84 | if necessary, and without adornments. |
| 85 | |
| 86 | If `*args` are supplied, the message will be formatted using |
| 87 | the arguments. |
| 88 | |
| 89 | @param message Message to be logged, possibly after formatting. |
| 90 | @param args Format arguments. If empty, the message is logged |
| 91 | without formatting. |
| 92 | |
| 93 | """ |
| 94 | if args: |
| 95 | message = message % args |
| 96 | sys.stdout.write('%s\n' % message) |
| 97 | |
| 98 | |
def _log_info(dry_run, message, *args):
    """Write an informational line to stdout, dry-run aware.

    For a dry run the line is emitted as a shell comment (prefixed
    with `# `) so that it can sit alongside the shell commands the
    dry run prints.  Otherwise the line is emitted unadorned.

    When `args` are supplied, `message` is formatted with them.

    @param dry_run  Whether the logging is for a dry run or for
                    actual execution.
    @param message  Text to be logged, or a format string when
                    `args` are supplied.
    @param args     Format arguments.  If empty, no formatting is
                    applied to `message`.

    """
    prefix = '# ' if dry_run else ''
    _log_message(prefix + message, *args)
| 117 | |
| 118 | |
| 119 | def _log_error(message, *args): |
| 120 | """Log an error to stderr, with optional format arguments. |
| 121 | |
| 122 | This function logs a single line to stderr, prefixed to indicate |
| 123 | that it is an error message. |
| 124 | |
| 125 | If `*args` are supplied, the message will be formatted using |
| 126 | the arguments. |
| 127 | |
| 128 | @param message Message to be logged, possibly after formatting. |
| 129 | @param args Format arguments. If empty, the message is logged |
| 130 | without formatting. |
| 131 | |
| 132 | """ |
| 133 | if args: |
| 134 | message = message % args |
| 135 | sys.stderr.write('ERROR: %s\n' % message) |
| 136 | |
| 137 | |
| 138 | class _DUTPool(object): |
| 139 | """Information about a pool of DUTs for a given board. |
| 140 | |
| 141 | This class collects information about all DUTs for a given |
| 142 | board and pool pair, and divides them into three categories: |
| 143 | + Working - the DUT is working for testing, and not locked. |
| 144 | + Broken - the DUT is unable to run tests, or it is locked. |
| 145 | + Ineligible - the DUT is not available to be removed from |
| 146 | this pool. The DUT may be either working or broken. |
| 147 | |
| 148 | DUTs with more than one pool: label are ineligible for exchange |
| 149 | during balancing. This is done for the sake of chameleon hosts, |
| 150 | which must always be assigned to pool:suites. These DUTs are |
| 151 | always marked with pool:chameleon to prevent their reassignment. |
| 152 | |
| 153 | TODO(jrbarnette): The use of `pool:chamelon` (instead of just |
| 154 | the `chameleon` label is a hack that should be eliminated. |
| 155 | |
| 156 | _DUTPool instances are used to track both main pools that need |
| 157 | to be resupplied with working DUTs and spare pools that supply |
| 158 | those DUTs. |
| 159 | |
| 160 | @property board Name of the board associated with |
| 161 | this pool of DUTs. |
| 162 | @property pool Name of the pool associated with |
| 163 | this pool of DUTs. |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 164 | @property working_hosts The list of this pool's working |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 165 | DUTs. |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 166 | @property broken_hosts The list of this pool's broken |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 167 | DUTs. |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 168 | @property ineligible_hosts The list of this pool's ineligible DUTs. |
| 169 | @property labels A list of labels that identify a DUT |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 170 | as part of this pool. |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 171 | @property total_hosts The total number of hosts in pool. |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 172 | |
| 173 | """ |
| 174 | |
| 175 | |
J. Richard Barnette | 70c03b0 | 2015-05-26 14:33:17 -0700 | [diff] [blame] | 176 | @staticmethod |
| 177 | def _get_platform_label(board): |
| 178 | """Return the platform label associated with `board`. |
| 179 | |
| 180 | When swapping between freon and non-freon boards, the |
| 181 | platform label must also change (because wmatrix reports |
| 182 | build results against platform labels, not boards). So, we |
| 183 | must be able to get the platform label from the board name. |
| 184 | |
| 185 | For non-freon boards, the platform label is based on a name |
| 186 | assigned by the firmware, which in some cases is different |
| 187 | from the board name. For freon boards, the platform label |
| 188 | is always the board name. |
| 189 | |
| 190 | @param board The board name to convert to a platform label. |
| 191 | @return The platform label for the given board name. |
| 192 | |
| 193 | """ |
| 194 | if board.endswith(_FREON_BOARD_TAG): |
| 195 | return board |
| 196 | if board.startswith('x86-'): |
| 197 | return board[len('x86-') :] |
| 198 | platform_map = { |
| 199 | 'daisy': 'snow', |
| 200 | 'daisy_spring': 'spring', |
| 201 | 'daisy_skate': 'skate', |
| 202 | 'parrot_ivb': 'parrot_2', |
| 203 | 'falco_li': 'falco' |
| 204 | } |
| 205 | return platform_map.get(board, board) |
| 206 | |
| 207 | |
| 208 | @staticmethod |
| 209 | def _freon_board_toggle(board): |
| 210 | """Toggle a board name between freon and non-freon. |
| 211 | |
| 212 | For boards naming a freon build, return the name of the |
| 213 | associated non-freon board. For boards naming non-freon |
| 214 | builds, return the name of the associated freon board. |
| 215 | |
| 216 | @param board The board name to be toggled. |
| 217 | @return A new board name, toggled for freon. |
| 218 | |
| 219 | """ |
| 220 | if board.endswith(_FREON_BOARD_TAG): |
| 221 | # The actual board name ends with either "-freon" or |
| 222 | # "_freon", so we have to strip off one extra character. |
| 223 | return board[: -len(_FREON_BOARD_TAG) - 1] |
| 224 | else: |
| 225 | # The actual board name will end with either "-freon" or |
| 226 | # "_freon"; we have to figure out which one to use. |
| 227 | joiner = '_' |
| 228 | if joiner in board: |
| 229 | joiner = '-' |
| 230 | return joiner.join([board, _FREON_BOARD_TAG]) |
| 231 | |
| 232 | |
| 233 | def __init__(self, afe, board, pool, start_time, end_time, |
| 234 | use_freon=False): |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 235 | self.board = board |
| 236 | self.pool = pool |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 237 | self.working_hosts = [] |
| 238 | self.broken_hosts = [] |
| 239 | self.ineligible_hosts = [] |
| 240 | self.total_hosts = self._get_hosts( |
J. Richard Barnette | 70c03b0 | 2015-05-26 14:33:17 -0700 | [diff] [blame] | 241 | afe, start_time, end_time, use_freon) |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 242 | self.labels = set([_BOARD_PREFIX + self.board, |
| 243 | self._get_platform_label(self.board), |
| 244 | _POOL_PREFIX + self.pool]) |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 245 | |
| 246 | |
J. Richard Barnette | 70c03b0 | 2015-05-26 14:33:17 -0700 | [diff] [blame] | 247 | def _get_hosts(self, afe, start_time, end_time, use_freon): |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 248 | all_histories = ( |
| 249 | status_history.HostJobHistory.get_multiple_histories( |
| 250 | afe, start_time, end_time, |
| 251 | board=self.board, pool=self.pool)) |
J. Richard Barnette | 70c03b0 | 2015-05-26 14:33:17 -0700 | [diff] [blame] | 252 | if not all_histories and use_freon: |
| 253 | alternate_board = self._freon_board_toggle(self.board) |
| 254 | alternate_histories = ( |
| 255 | status_history.HostJobHistory.get_multiple_histories( |
| 256 | afe, start_time, end_time, |
| 257 | board=alternate_board, pool=self.pool)) |
| 258 | if alternate_histories: |
| 259 | self.board = alternate_board |
| 260 | all_histories = alternate_histories |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 261 | for h in all_histories: |
| 262 | host = h.host |
| 263 | host_pools = [l for l in host.labels |
| 264 | if l.startswith(_POOL_PREFIX)] |
| 265 | if len(host_pools) != 1: |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 266 | self.ineligible_hosts.append(host) |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 267 | else: |
| 268 | diag = h.last_diagnosis()[0] |
| 269 | if (diag == status_history.WORKING and |
| 270 | not host.locked): |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 271 | self.working_hosts.append(host) |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 272 | else: |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 273 | self.broken_hosts.append(host) |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 274 | return len(all_histories) |
| 275 | |
| 276 | |
| 277 | @property |
| 278 | def pool_labels(self): |
| 279 | """Return the AFE labels that identify this pool. |
| 280 | |
| 281 | The returned labels are the labels that must be removed |
| 282 | to remove a DUT from the pool, or added to add a DUT. |
| 283 | |
| 284 | @return A list of AFE labels suitable for AFE.add_labels() |
| 285 | or AFE.remove_labels(). |
| 286 | |
| 287 | """ |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 288 | return self.labels |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 289 | |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 290 | def calculate_spares_needed(self, target_total): |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 291 | """Calculate and log the spares needed to achieve a target. |
| 292 | |
| 293 | Return how many working spares are needed to achieve the |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 294 | given `target_total` with all DUTs working. |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 295 | |
| 296 | The spares count may be positive or negative. Positive |
| 297 | values indicate spares are needed to replace broken DUTs in |
| 298 | order to reach the target; negative numbers indicate that |
| 299 | no spares are needed, and that a corresponding number of |
| 300 | working devices can be returned. |
| 301 | |
| 302 | If the new target total would require returning ineligible |
| 303 | DUTs, an error is logged, and the target total is adjusted |
| 304 | so that those DUTs are not exchanged. |
| 305 | |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 306 | @param target_total The new target pool size. |
| 307 | |
| 308 | @return The number of spares needed. |
| 309 | |
| 310 | """ |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 311 | num_ineligible = len(self.ineligible_hosts) |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 312 | if target_total < num_ineligible: |
| 313 | _log_error('%s %s pool: Target of %d is below ' |
| 314 | 'minimum of %d DUTs.', |
| 315 | self.board, self.pool, |
| 316 | target_total, num_ineligible) |
| 317 | _log_error('Adjusting target to %d DUTs.', num_ineligible) |
| 318 | target_total = num_ineligible |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 319 | adjustment = target_total - self.total_hosts |
| 320 | return len(self.broken_hosts) + adjustment |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 321 | |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 322 | def allocate_surplus(self, num_broken): |
| 323 | """Allocate a list DUTs that can returned as surplus. |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 324 | |
| 325 | Return a list of devices that can be returned in order to |
| 326 | reduce this pool's supply. Broken DUTs will be preferred |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 327 | over working ones. |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 328 | |
| 329 | The `num_broken` parameter indicates the number of broken |
| 330 | DUTs to be left in the pool. If this number exceeds the |
| 331 | number of broken DUTs actually in the pool, the returned |
| 332 | list will be empty. If this number is negative, it |
| 333 | indicates a number of working DUTs to be returned in |
| 334 | addition to all broken ones. |
| 335 | |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 336 | @param num_broken Total number of broken DUTs to be left in |
| 337 | this pool. |
| 338 | |
| 339 | @return A list of DUTs to be returned as surplus. |
| 340 | |
| 341 | """ |
| 342 | if num_broken >= 0: |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 343 | surplus = self.broken_hosts[num_broken:] |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 344 | return surplus |
| 345 | else: |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 346 | return (self.broken_hosts + |
| 347 | self.working_hosts[:-num_broken]) |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 348 | |
| 349 | |
| 350 | def _exchange_labels(dry_run, hosts, target_pool, spare_pool): |
| 351 | """Reassign a list of DUTs from one pool to another. |
| 352 | |
| 353 | For all the given hosts, remove all labels associated with |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 354 | `spare_pool`, and add the labels for `target_pool`. |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 355 | |
| 356 | If `dry_run` is true, perform no changes, but log the `atest` |
| 357 | commands needed to accomplish the necessary label changes. |
| 358 | |
| 359 | @param dry_run Whether the logging is for a dry run or |
| 360 | for actual execution. |
| 361 | @param hosts List of DUTs (AFE hosts) to be reassigned. |
| 362 | @param target_pool The `_DUTPool` object from which the hosts |
| 363 | are drawn. |
| 364 | @param spare_pool The `_DUTPool` object to which the hosts |
| 365 | will be added. |
| 366 | |
| 367 | """ |
| 368 | if not hosts: |
| 369 | return |
| 370 | _log_info(dry_run, 'Transferring %d DUTs from %s to %s.', |
| 371 | len(hosts), spare_pool.pool, target_pool.pool) |
| 372 | additions = target_pool.pool_labels |
| 373 | removals = spare_pool.pool_labels |
J. Richard Barnette | 70c03b0 | 2015-05-26 14:33:17 -0700 | [diff] [blame] | 374 | intersection = additions & removals |
| 375 | additions -= intersection |
| 376 | removals -= intersection |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 377 | for host in hosts: |
| 378 | if not dry_run: |
| 379 | _log_message('Updating host: %s.', host.hostname) |
J. Richard Barnette | 70c03b0 | 2015-05-26 14:33:17 -0700 | [diff] [blame] | 380 | host.remove_labels(list(removals)) |
| 381 | host.add_labels(list(additions)) |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 382 | else: |
| 383 | _log_message('atest label remove -m %s %s', |
| 384 | host.hostname, ' '.join(removals)) |
| 385 | _log_message('atest label add -m %s %s', |
| 386 | host.hostname, ' '.join(additions)) |
| 387 | |
| 388 | |
def _balance_board(arguments, afe, board, start_time, end_time):
    """Balance one board as requested by command line arguments.

    Looks up the main and spare pools for `board`, works out how
    many spares are needed to reach the target pool size, reports
    the plan, and then swaps broken/surplus DUTs out of the main
    pool and working spares into it.  Whether changes are actually
    made or only printed as shell commands is controlled by
    `arguments.dry_run`.

    No exchange is performed when the main pool has more than
    `arguments.max_broken` broken DUTs, unless
    `arguments.force_rebalance` is set.

    @param arguments    Parsed command line arguments.
    @param afe          AFE object to be used for the changes.
    @param board        Board to be balanced.
    @param start_time   Start time for HostJobHistory objects in
                        the DUT pools.
    @param end_time     End time for HostJobHistory objects in the
                        DUT pools.

    """
    dry_run = arguments.dry_run
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time, use_freon=True)
    main_pool = _DUTPool(afe, board, arguments.pool,
                         start_time, end_time)

    # With no count option the pool keeps its current size.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Positive shortfall: that many broken DUTs must stay put.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: no spares wanted; a negative value is the
        # number of working DUTs to give back.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)
    num_broken = len(main_pool.broken_hosts)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', board, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  num_broken, len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        # Report against the spare pool, since that is the pool whose
        # working hosts are being counted (its board may be the
        # freon-toggled name).
        _log_info(dry_run,
                  '%s %s pool has %d spares available.',
                  spare_pool.board, spare_pool.pool,
                  len(spare_pool.working_hosts))

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      num_broken - len(surplus_duts))
        else:
            # Count what was actually allocated: there may be fewer
            # working DUTs available than the number requested.
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      len(surplus_duts) - num_broken)

    if num_broken > arguments.max_broken and not arguments.force_rebalance:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   board, main_pool.pool, num_broken)
        _log_error('Please investigate this board to see if there is a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        return

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(dry_run, 'No exchange required.')
        return

    # Surplus DUTs move from the main pool to the spare pool; spares
    # move the other way.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
| 486 | |
| 487 | |
| 488 | def _parse_command(argv): |
| 489 | """Parse the command line arguments. |
| 490 | |
| 491 | Create an argument parser for this command's syntax, parse the |
| 492 | command line, and return the result of the `ArgumentParser` |
| 493 | `parse_args()` method. |
| 494 | |
| 495 | @param argv Standard command line argument vector; `argv[0]` is |
| 496 | assumed to be the command name. |
| 497 | |
| 498 | @return Result returned by `ArgumentParser.parse_args()`. |
| 499 | |
| 500 | """ |
| 501 | parser = argparse.ArgumentParser( |
| 502 | prog=argv[0], |
| 503 | description='Balance pool shortages from spares on reserve') |
| 504 | |
| 505 | count_group = parser.add_mutually_exclusive_group() |
| 506 | count_group.add_argument('-t', '--total', type=int, |
| 507 | metavar='COUNT', default=None, |
| 508 | help='Set the number of DUTs in the ' |
| 509 | 'pool to the specified count for ' |
| 510 | 'every BOARD') |
| 511 | count_group.add_argument('-a', '--grow', type=int, |
| 512 | metavar='COUNT', default=None, |
| 513 | help='Add the specified number of DUTs ' |
| 514 | 'to the pool for every BOARD') |
| 515 | count_group.add_argument('-d', '--shrink', type=int, |
| 516 | metavar='COUNT', default=None, |
| 517 | help='Remove the specified number of DUTs ' |
| 518 | 'from the pool for every BOARD') |
| 519 | |
| 520 | parser.add_argument('-s', '--spare', default='suites', |
| 521 | metavar='POOL', |
| 522 | help='Pool from which to draw replacement ' |
| 523 | 'spares (default: pool:suites)') |
| 524 | parser.add_argument('-n', '--dry-run', action='store_true', |
| 525 | help='Report actions to take in the form of ' |
| 526 | 'shell commands') |
David James | 750c038 | 2015-05-06 19:30:46 -0700 | [diff] [blame] | 527 | parser.add_argument('-v', '--verbose', action='store_true', |
| 528 | help='Print more detail about calculations for debug ' |
| 529 | 'purposes.') |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 530 | |
David James | f1d6e45 | 2015-07-17 15:23:04 -0700 | [diff] [blame] | 531 | parser.add_argument('-m', '--max-broken', default=2, type=int, |
| 532 | metavar='COUNT', |
| 533 | help='Only rebalance a pool if it has at most ' |
| 534 | 'COUNT broken DUTs.') |
| 535 | parser.add_argument('-f', '--force-rebalance', action='store_true', |
| 536 | help='Forcefully rebalance all DUTs in a pool, even ' |
| 537 | 'if it has a large number of broken DUTs. ' |
| 538 | 'Before doing this, please investigate whether ' |
| 539 | 'there is a bug that is bricking devices in the ' |
| 540 | 'lab.') |
| 541 | |
David James | 8352bc2 | 2015-05-05 16:37:05 -0700 | [diff] [blame^] | 542 | parser.add_argument('--all-boards', action='store_true', |
| 543 | help='Rebalance all boards.') |
| 544 | |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 545 | parser.add_argument('pool', |
| 546 | metavar='POOL', |
David James | 8352bc2 | 2015-05-05 16:37:05 -0700 | [diff] [blame^] | 547 | help='Name of the pool to balance.') |
| 548 | parser.add_argument('boards', nargs='*', |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 549 | metavar='BOARD', |
David James | 8352bc2 | 2015-05-05 16:37:05 -0700 | [diff] [blame^] | 550 | help='Names of boards to balance.') |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 551 | |
| 552 | arguments = parser.parse_args(argv[1:]) |
David James | 8352bc2 | 2015-05-05 16:37:05 -0700 | [diff] [blame^] | 553 | |
| 554 | # Error-check arguments. |
| 555 | if not arguments.boards and not arguments.all_boards: |
| 556 | parser.error('No boards specified. To balance all boards, use ' |
| 557 | '--all-boards') |
| 558 | if arguments.boards and arguments.all_boards: |
| 559 | parser.error('Cannot specify boards with --all-boards.') |
| 560 | |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 561 | return arguments |
| 562 | |
| 563 | |
def main(argv):
    """Standard main routine.

    Parses the command line, selects the boards to process (either
    the ones named on the command line, or every board reported for
    the target pool when `--all-boards` is given), and balances each
    of them through `parallel.RunTasksInProcessPool`.  Host history
    is examined over the 24 hours ending now.

    @param argv Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    seconds_per_day = 24 * 60 * 60
    end_time = time.time()
    start_time = end_time - seconds_per_day
    afe = frontend.AFE(server=None)

    if arguments.all_boards:
        boards = host_label_utils.get_all_boards(
                labels=[_POOL_PREFIX + arguments.pool])
    else:
        boards = arguments.boards

    def balancer(i, board):
        """Balance the specified board.

        @param i The index of the board.
        @param board The board name.
        """
        # Separate the output for each board after the first with a
        # blank line.
        if i > 0:
            _log_message('')
        _balance_board(arguments, afe, board, start_time, end_time)

    indexed_boards = list(enumerate(boards))
    try:
        parallel.RunTasksInProcessPool(balancer, indexed_boards,
                                       processes=8)
    except KeyboardInterrupt:
        # An interrupt from the user just ends the run quietly.
        pass
| 593 | |
| 594 | |
# Run the command only when executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv)