J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright 2015 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """Adjust pool balances to cover DUT shortfalls. |
| 7 | |
| 8 | This command takes all broken DUTs in a specific pool for specific |
| 9 | boards and swaps them with working DUTs taken from a selected pool |
| 10 | of spares. The command is meant primarily for replacing broken DUTs |
| 11 | in critical pools like BVT or CQ, but it can also be used to adjust |
| 12 | pool sizes, or to create or remove pools. |
| 13 | |
usage:  balance_pool.py [ options ] POOL [ BOARD ... ]
| 15 | |
| 16 | positional arguments: |
| 17 | POOL Name of the pool to balance |
| 18 | BOARD Names of boards to balance |
| 19 | |
| 20 | optional arguments: |
| 21 | -h, --help show this help message and exit |
| 22 | -t COUNT, --total COUNT |
| 23 | Set the number of DUTs in the pool to the specified |
| 24 | count for every BOARD |
| 25 | -a COUNT, --grow COUNT |
| 26 | Add the specified number of DUTs to the pool for every |
| 27 | BOARD |
| 28 | -d COUNT, --shrink COUNT |
| 29 | Remove the specified number of DUTs from the pool for |
| 30 | every BOARD |
| 31 | -s POOL, --spare POOL |
| 32 | Pool from which to draw replacement spares (default: |
| 33 | pool:suites) |
| 34 | -n, --dry-run Report actions to take in the form of shell commands |
| 35 | |
| 36 | |
| 37 | The command attempts to remove all broken DUTs from the target POOL |
| 38 | for every BOARD, and replace them with enough working DUTs taken |
| 39 | from the spare pool to bring the strength of POOL to the requested |
| 40 | total COUNT. |
| 41 | |
| 42 | If no COUNT options are supplied (i.e. there are no --total, --grow, |
| 43 | or --shrink options), the command will maintain the current totals of |
| 44 | DUTs for every BOARD in the target POOL. |
| 45 | |
| 46 | If not enough working spares are available, broken DUTs may be left |
| 47 | in the pool to keep the pool at the target COUNT. |
| 48 | |
| 49 | When reducing pool size, working DUTs will be returned after broken |
| 50 | DUTs, if it's necessary to achieve the target COUNT. |
| 51 | |
| 52 | """ |
| 53 | |
| 54 | |
| 55 | import argparse |
| 56 | import sys |
| 57 | import time |
| 58 | |
| 59 | import common |
| 60 | from autotest_lib.server import frontend |
Aviv Keshet | 7ee9586 | 2016-08-30 15:18:27 -0700 | [diff] [blame] | 61 | from autotest_lib.server.lib import status_history |
Kevin Cheng | cf0ad2b | 2016-04-19 14:51:39 -0700 | [diff] [blame] | 62 | from autotest_lib.site_utils import lab_inventory |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 63 | from autotest_lib.site_utils.suite_scheduler import constants |
| 64 | |
David James | 2a3cb54 | 2015-05-05 17:13:43 -0700 | [diff] [blame] | 65 | from chromite.lib import parallel |
| 66 | |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 67 | |
# Prefix that marks an AFE label as a pool assignment; a pool's label
# is this prefix followed by the pool name.
_POOL_PREFIX = constants.Labels.POOL_PREFIX

# Pool that supplies replacement DUTs when `--spare` is not given.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL

# Pseudo pool name accepted on the command line to request balancing
# of every pool in `lab_inventory.CRITICAL_POOLS`.
_ALL_CRITICAL_POOLS = 'all_critical_pools'

# When `--max-broken-boards` is not given, the limit on the number of
# boards with broken DUTs defaults to this fraction of the pool's
# managed boards.  The value was picked as a compromise that is
# neither too strict nor too lax.
_MAX_BROKEN_BOARDS_DEFAULT_RATIO = 3.0 / 8.0
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 76 | |
| 77 | |
def _log_message(message, *args):
    """Write one unadorned line of text to stdout.

    When `*args` are supplied, `message` is treated as a `%`-style
    format string and the arguments are substituted into it.
    Otherwise `message` is written verbatim, so literal `%`
    characters are safe.

    @param message  Text to be logged, or a format string when
                    `args` is not empty.
    @param args     Optional format arguments.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
| 95 | |
| 96 | |
def _log_info(dry_run, message, *args):
    """Log an informational line, adjusted for dry-run output.

    The line goes to stdout via `_log_message()`.  For a dry run
    the line is prefixed with `# ` so that it reads as a shell
    comment next to the commands a dry run prints; otherwise the
    text is logged unchanged.

    @param dry_run  Whether the logging is for a dry run or for
                    actual execution.
    @param message  Text to be logged, or a format string when
                    `args` is not empty.
    @param args     Optional format arguments.

    """
    prefix = '# ' if dry_run else ''
    _log_message(prefix + message, *args)
| 115 | |
| 116 | |
def _log_error(message, *args):
    """Write one line to stderr, flagged as an error.

    The line is prefixed with `ERROR: `.  When `*args` are
    supplied, `message` is treated as a `%`-style format string;
    otherwise it is written verbatim.

    @param message  Text to be logged, or a format string when
                    `args` is not empty.
    @param args     Optional format arguments.

    """
    formatted = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % formatted)
| 134 | |
| 135 | |
class _DUTPool(object):
    """The DUTs of one board that are assigned to one pool.

    On construction, the DUTs for the board and pool pair are
    looked up and sorted into three categories:
      + Working - the DUT is diagnosed as working, and is not
        locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT may not be moved out of this pool.
        Such a DUT may be either working or broken.

    A DUT is ineligible when it does not carry exactly one `pool:`
    label.  This is done for the sake of chameleon hosts, which
    must always be assigned to pool:suites.  Those DUTs are also
    marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    The same class describes both the main pools that must be
    resupplied with working DUTs and the spare pools that supply
    them.

    @property board             Name of the board associated with
                                this pool of DUTs.
    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property working_hosts     The list of this pool's working
                                DUTs.
    @property broken_hosts      The list of this pool's broken
                                DUTs.
    @property ineligible_hosts  The list of this pool's ineligible
                                DUTs.
    @property pool_labels       A list of labels that identify a
                                DUT as part of this pool.
    @property total_hosts       The total number of hosts in the
                                pool, in all three categories.

    """

    def __init__(self, afe, board, pool, start_time, end_time):
        self.board = board
        self.pool = pool
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        # Fills in the three host lists as a side effect.
        self.total_hosts = self._get_hosts(afe, start_time, end_time)
        self._labels = [_POOL_PREFIX + self.pool]


    def _get_hosts(self, afe, start_time, end_time):
        """Query the pool's DUTs and sort them into categories.

        @param afe         AFE object used for the query.
        @param start_time  Start time for the HostJobHistory
                           objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The total number of DUTs found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time,
                board=self.board, pool=self.pool)
        for history in histories:
            dut = history.host
            pool_label_count = sum(1 for label in dut.labels
                                   if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            status = history.last_diagnosis()[0]
            # A locked DUT counts as broken even if it is working.
            if status != status_history.WORKING or dut.locked:
                self.broken_hosts.append(dut)
            else:
                self.working_hosts.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove from a DUT to take it out
        of the pool, or to add to a DUT to put it in.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._labels

    def calculate_spares_needed(self, target_total):
        """Calculate the spares needed to achieve a target.

        Return how many working spares are needed to reach the
        given `target_total` with all DUTs working.

        A positive result is the number of spares needed to replace
        broken DUTs in order to reach the target.  A negative result
        means that no spares are needed, and that a corresponding
        number of working DUTs can be returned.

        If the target would require returning ineligible DUTs, an
        error is logged, and the target is raised so that those
        DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        floor = len(self.ineligible_hosts)
        if target_total < floor:
            _log_error('%s %s pool: Target of %d is below '
                       'minimum of %d DUTs.',
                       self.board, self.pool,
                       target_total, floor)
            _log_error('Adjusting target to %d DUTs.', floor)
            target_total = floor
        return len(self.broken_hosts) + (target_total - self.total_hosts)

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return the DUTs to give up in order to reduce this pool's
        supply.  Broken DUTs are preferred over working ones.

        `num_broken` is the number of broken DUTs to be left in the
        pool.  If it is at least the number of broken DUTs actually
        in the pool, the result is empty.  If it is negative, its
        magnitude is the number of working DUTs to return in
        addition to all the broken ones.

        @param num_broken  Total number of broken DUTs to be left
                           in this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
| 276 | |
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For every given host, remove the labels of `spare_pool` and add
    the labels of `target_pool`.  Despite its name, `spare_pool` is
    simply the pool the hosts leave; callers pass the main pool
    there when returning surplus DUTs.

    If `dry_run` is true, make no changes, but log the `atest`
    commands that would accomplish the label changes.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object to which the hosts
                        will be added.
    @param spare_pool   The `_DUTPool` object from which the hosts
                        are drawn.

    """
    if not hosts:
        return
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    labels_to_add = target_pool.pool_labels
    labels_to_drop = spare_pool.pool_labels

    if dry_run:
        # Emit the equivalent shell commands instead of acting.
        drop_text = ' '.join(labels_to_drop)
        add_text = ' '.join(labels_to_add)
        for dut in hosts:
            _log_message('atest label remove -m %s %s',
                         dut.hostname, drop_text)
            _log_message('atest label add -m %s %s',
                         dut.hostname, add_text)
        return

    for dut in hosts:
        _log_message('Updating host: %s.', dut.hostname)
        dut.remove_labels(labels_to_drop)
        dut.add_labels(labels_to_add)
| 311 | |
| 312 | |
def _balance_board(arguments, afe, board, pool, start_time, end_time):
    """Balance one board as requested by command line arguments.

    Work out the target size of the board's pool, choose the
    working spares to bring in and the surplus DUTs to send back,
    report the plan, and then exchange the pool labels.  Nothing is
    exchanged if the pool has more than `arguments.max_broken`
    broken DUTs, unless `arguments.force_rebalance` is set.

    @param arguments   Parsed command line arguments.
    @param afe         AFE object to be used for the changes.
    @param board       Board to be balanced.
    @param pool        Pool of the board to be balanced.
    @param start_time  Start time for HostJobHistory objects in
                       the DUT pools.
    @param end_time    End time for HostJobHistory objects in the
                       DUT pools.

    """
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time)
    main_pool = _DUTPool(afe, board, pool,
                         start_time, end_time)

    # The count options are mutually exclusive; with none of them
    # the pool keeps its current size.
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    spare_duts = []
    shortfall = spares_needed
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        shortfall -= len(spare_duts)

    # `shortfall` is the number of broken DUTs that must stay
    # behind; when negative, working DUTs are given up as well.
    surplus_duts = main_pool.allocate_surplus(shortfall)

    dry_run = arguments.dry_run
    num_broken = len(main_pool.broken_hosts)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', board, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  num_broken, len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available.',
                  board, main_pool.pool, len(spare_pool.working_hosts))

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      num_broken - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      board, main_pool.pool,
                      num_broken - shortfall,
                      -shortfall)

    # A large number of broken DUTs may mean something is bricking
    # devices; don't feed it more spares without an explicit override.
    if num_broken > arguments.max_broken and not arguments.force_rebalance:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   board, main_pool.pool, num_broken)
        for advice in ('Please investigate this board to see if there is a bug ',
                       'that is bricking devices. Once you have finished your ',
                       'investigation, you can force a rebalance with ',
                       '--force-rebalance'):
            _log_error(advice)
        return

    if not (spare_duts or surplus_duts):
        if arguments.verbose:
            _log_info(dry_run, 'No exchange required.')
        return

    # Send surplus DUTs to the spare pool, then bring spares in.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
| 411 | |
| 412 | |
def _too_many_broken_boards(inventory, pool, arguments):
    """
    Check whether too many boards in a pool have broken DUTs.

    The check is skipped (and reports False) when
    `arguments.force_rebalance` is set or when
    `arguments.max_broken_boards` is 0.  When
    `arguments.max_broken_boards` is None, the limit defaults to a
    fixed fraction of the number of managed boards in the pool.

    @param inventory: inventory object to determine board status inventory.
    @param pool: The pool to check on for the board.
    @param arguments Parsed command line arguments.

    @return True if the number of boards with 1 or more broken duts exceed
            max_broken_boards, False otherwise.
    """
    if arguments.force_rebalance or arguments.max_broken_boards == 0:
        return False

    # TODO(kevcheng): Revisit to see if there's a better way to
    # calculate the default max_broken_boards.
    limit = arguments.max_broken_boards
    if limit is None:
        managed_boards = inventory.get_managed_boards(pool=pool)
        limit = int(_MAX_BROKEN_BOARDS_DEFAULT_RATIO * len(managed_boards))
        _log_info(arguments.dry_run,
                  'Default max broken boards calculated to be %d for '
                  '%s pool',
                  limit, pool)

    # TODO(kevcheng): Track which boards have broken duts, we can limit the
    # number of boards we go through in the main loop with this knowledge.
    broken_boards = sorted(board for board, counts in inventory.items()
                           if counts.get_broken(pool) != 0)
    broken_count = len(broken_boards)
    _log_message('There are %d boards in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)', broken_count,
                 pool, limit)
    for name in broken_boards:
        _log_message(name)
    return broken_count > limit
| 457 | |
| 458 | |
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Beyond what the parser itself enforces, the command line is
    rejected (via `parser.error()`, which exits the program) when:
      + neither BOARD names nor `--all-boards` are given;
      + both BOARD names and `--all-boards` are given;
      + `--spare` names a non-default pool while all critical
        pools are being balanced.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    # At most one of the pool sizing options may be used.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every BOARD')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every BOARD')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every BOARD')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')

    parser.add_argument('--all-boards', action='store_true',
                        help='Rebalance all managed boards.  This will do a '
                             'very expensive check to see how many boards have '
                             'at least one broken DUT. To bypass that check, '
                             'set --max-broken-boards to 0.')
    # The help text refers to COUNT, so the metavar must say COUNT
    # too.  The limit is inclusive: balancing is refused only when
    # the number of boards with broken DUTs exceeds it.
    parser.add_argument('--max-broken-boards',
                        default=None, type=int,
                        metavar='COUNT',
                        help='Only rebalance all boards if number of boards '
                             'with broken DUTs in the specified pool '
                             'is at most COUNT.')

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance. Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('boards', nargs='*',
                        metavar='BOARD',
                        help='Names of boards to balance.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if not arguments.boards and not arguments.all_boards:
        parser.error('No boards specified. To balance all boards, use '
                     '--all-boards')
    if arguments.boards and arguments.all_boards:
        parser.error('Cannot specify boards with --all-boards.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # Report the pool the user actually asked for; the default
        # spare pool is the only one that is allowed here.
        parser.error('Cannot specify --spare pool to be %s when balancing all '
                     'critical pools.' % arguments.spare)
    return arguments
| 545 | |
| 546 | |
def main(argv):
    """Standard main routine.

    Parse the command line, build the list of (board, pool) pairs
    to balance, and balance them in a pool of worker processes.
    DUT status is judged from the 24 hours leading up to now.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    end_time = time.time()
    start_time = end_time - 24 * 60 * 60
    afe = frontend.AFE(server=None)

    def balancer(i, board, pool):
        """Balance the specified board.

        @param i       The index of the board.
        @param board   The board name.
        @param pool    The pool to rebalance for the board.
        """
        # Every task but the first starts with a blank separator line.
        if i > 0:
            _log_message('')
        _balance_board(arguments, afe, board, pool, start_time, end_time)

    if arguments.pool == _ALL_CRITICAL_POOLS:
        pools = lab_inventory.CRITICAL_POOLS
    else:
        pools = [arguments.pool]

    # Each entry is the argument tuple for one `balancer()` call; the
    # index is the entry's position in the list.
    board_info = []
    if arguments.all_boards:
        inventory = lab_inventory.get_inventory(afe)
        for pool in pools:
            if _too_many_broken_boards(inventory, pool, arguments):
                _log_error('Refusing to balance all boards for %s pool, '
                           'too many boards with at least 1 broken DUT '
                           'detected.', pool)
                continue
            for board in inventory.get_managed_boards(pool=pool):
                board_info.append((len(board_info), board, pool))
    else:
        # Boards were named explicitly; pair each of them with every
        # requested pool.
        for pool in pools:
            for board in arguments.boards:
                board_info.append((len(board_info), board, pool))

    try:
        parallel.RunTasksInProcessPool(balancer, board_info, processes=8)
    except KeyboardInterrupt:
        # An interrupt from the user just ends the run quietly.
        pass


if __name__ == '__main__':
    main(sys.argv)