Mike Frysinger | d03e6b5 | 2019-08-03 12:49:01 -0400 | [diff] [blame^] | 1 | #!/usr/bin/env python2 |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 2 | # Copyright 2015 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """Adjust pool balances to cover DUT shortfalls. |
| 7 | |
| 8 | This command takes all broken DUTs in a specific pool for specific |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 9 | models and swaps them with working DUTs taken from a selected pool |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 10 | of spares. The command is meant primarily for replacing broken DUTs |
| 11 | in critical pools like BVT or CQ, but it can also be used to adjust |
| 12 | pool sizes, or to create or remove pools. |
| 13 | |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 14 | usage: balance_pool.py [ options ] POOL MODEL [ MODEL ... ] |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 15 | |
| 16 | positional arguments: |
| 17 | POOL Name of the pool to balance |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 18 | MODEL Names of models to balance |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 19 | |
| 20 | optional arguments: |
| 21 | -h, --help show this help message and exit |
| 22 | -t COUNT, --total COUNT |
| 23 | Set the number of DUTs in the pool to the specified |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 24 | count for every MODEL |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 25 | -a COUNT, --grow COUNT |
| 26 | Add the specified number of DUTs to the pool for every |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 27 | MODEL |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 28 | -d COUNT, --shrink COUNT |
| 29 | Remove the specified number of DUTs from the pool for |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 30 | every MODEL |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 31 | -s POOL, --spare POOL |
| 32 | Pool from which to draw replacement spares (default: |
| 33 | pool:suites) |
Matthew Leszczenski | df2c3d7 | 2018-11-13 14:50:01 -0800 | [diff] [blame] | 34 | -p PHASE, --phase PHASE |
| 35 | Phase to restrict the balance pool operation to |
Chung-yih Wang | cc1d9cb | 2017-11-30 11:20:45 +0800 | [diff] [blame] | 36 | --sku SKU The specific SKU we intend to swap with |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 37 | -n, --dry-run Report actions to take in the form of shell commands |
| 38 | |
| 39 | |
| 40 | The command attempts to remove all broken DUTs from the target POOL |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 41 | for every MODEL, and replace them with enough working DUTs taken |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 42 | from the spare pool to bring the strength of POOL to the requested |
| 43 | total COUNT. |
| 44 | |
| 45 | If no COUNT options are supplied (i.e. there are no --total, --grow, |
| 46 | or --shrink options), the command will maintain the current totals of |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 47 | DUTs for every MODEL in the target POOL. |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 48 | |
| 49 | If not enough working spares are available, broken DUTs may be left |
| 50 | in the pool to keep the pool at the target COUNT. |
| 51 | |
| 52 | When reducing pool size, working DUTs will be returned after broken |
| 53 | DUTs, if it's necessary to achieve the target COUNT. |
| 54 | |
| 55 | """ |
| 56 | |
| 57 | |
| 58 | import argparse |
Richard Barnette | 81ef224 | 2018-06-14 14:34:34 -0700 | [diff] [blame] | 59 | import os |
| 60 | import re |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 61 | import sys |
| 62 | import time |
| 63 | |
| 64 | import common |
Xixuan Wu | 93e646c | 2017-12-07 18:36:10 -0800 | [diff] [blame] | 65 | from autotest_lib.server import constants |
Jacob Kopczynski | c6e483e | 2017-08-25 17:28:35 -0700 | [diff] [blame] | 66 | from autotest_lib.server import site_utils |
Aviv Keshet | a883432 | 2018-05-07 13:28:32 -0700 | [diff] [blame] | 67 | from autotest_lib.server.cros.dynamic_suite import frontend_wrappers |
Aviv Keshet | 7ee9586 | 2016-08-30 15:18:27 -0700 | [diff] [blame] | 68 | from autotest_lib.server.lib import status_history |
Kevin Cheng | cf0ad2b | 2016-04-19 14:51:39 -0700 | [diff] [blame] | 69 | from autotest_lib.site_utils import lab_inventory |
Prathmesh Prabhu | 68acc40 | 2017-11-09 15:24:15 -0800 | [diff] [blame] | 70 | from autotest_lib.utils import labellib |
Jacob Kopczynski | c6e483e | 2017-08-25 17:28:35 -0700 | [diff] [blame] | 71 | from chromite.lib import metrics |
David James | 2a3cb54 | 2015-05-05 17:13:43 -0700 | [diff] [blame] | 72 | from chromite.lib import parallel |
| 73 | |
Jacob Kopczynski | f8d90a8 | 2017-10-10 14:37:33 -0700 | [diff] [blame] | 74 | #This must be imported after chromite.lib.metrics |
| 75 | from infra_libs import ts_mon |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 76 | |
# Prefix that marks an AFE label as a pool label; taken from the shared
# server constants so this script agrees with the rest of autotest.
_POOL_PREFIX = constants.Labels.POOL_PREFIX

# When no explicit limit is supplied, the maximum number of models
# allowed to have broken DUTs is this fraction of all models in the
# pool.  The value was picked as a compromise: neither too strict nor
# too lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# NOTE(review): presumably a pseudo pool name standing for every
# critical pool; its use is outside this part of the file - confirm.
_ALL_CRITICAL_POOLS = 'all_critical_pools'

# Spare pool name defined by lab_inventory; judging by the name, the
# default source of replacement DUTs.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 85 | |
| 86 | |
# _VALID_POOL_PATTERN - Regular expression matching pool names that will
# be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# The upper-case range must be spelled `A-Z`.  The ASCII range `A-z`
# additionally covers `[`, `\`, `]`, `^`, `_` and the backtick, which
# would let shell metacharacters slip into pool names.  A raw string is
# used so that `\-` reaches the regex engine unchanged.

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')
| 96 | |
| 97 | |
def _log_message(message, *args):
    """Write one unadorned line of text to stdout.

    When format arguments are given, `message` is treated as a
    %-style format string and interpolated with them before being
    written.  With no arguments the text is written verbatim, so a
    literal `%` in `message` is safe in that case.

    @param message  Text, or format string, to be logged.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
| 115 | |
| 116 | |
def _log_info(dry_run, message, *args):
    """Log an informational line, adapted to dry-run output.

    The line goes to stdout via `_log_message()`.  During a dry run
    the output is meant to be usable as a shell script, so the
    message is emitted as a shell comment (prefixed with `# `);
    otherwise it is emitted as plain text.

    @param dry_run  Whether the logging is for a dry run.
    @param message  Text, or format string, to be logged.
    @param args     Optional format arguments for `message`.

    """
    text = '# ' + message if dry_run else message
    _log_message(text, *args)
| 135 | |
| 136 | |
def _log_error(message, *args):
    """Write one line to stderr, flagged as an error.

    The line is prefixed with `ERROR: `.  When format arguments are
    given, `message` is treated as a %-style format string and
    interpolated with them first; otherwise it is written verbatim.

    @param message  Text, or format string, to be logged.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
| 154 | |
| 155 | |
class _DUTPool(object):
    """Snapshot of the DUTs in one pool that match a set of labels.

    On construction, the DUTs carrying the requested labels plus the
    pool label are looked up and sorted into three categories:
      + Working - the DUT is diagnosed as working, and is not locked.
      + Broken - the DUT is not diagnosed as working, or it is locked.
      + Ineligible - the DUT may not be moved out of this pool.  Such
        a DUT may be either working or broken.

    A DUT is ineligible when it does not carry exactly one `pool:`
    label.  This is done for the sake of chameleon hosts, which must
    always be assigned to pool:suites.  These DUTs are always marked
    with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    The same class serves both for main pools that need to be
    resupplied with working DUTs and for the spare pools that supply
    those DUTs.

    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property labels            Labels that constrain the DUTs to
                                consider; includes the pool itself.
    @property working_hosts     The list of this pool's working DUTs.
    @property broken_hosts      The list of this pool's broken DUTs.
    @property ineligible_hosts  The list of this pool's ineligible DUTs.
    @property pool_labels       A list of labels that identify a DUT
                                as part of this pool.
    @property total_hosts       The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        # The pool is folded into the label mapping so that the host
        # query is restricted to this pool.
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + pool]

        # These lists must exist before _get_hosts() runs, because
        # that call is what fills them in.
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Look up this pool's DUTs and sort them into categories.

        Each host found is appended to exactly one of
        `ineligible_hosts`, `working_hosts` or `broken_hosts`.

        @param afe         AFE object used for the host query.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The number of host histories found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            # The diagnosis is only consulted for eligible DUTs, and
            # a locked DUT counts as broken even if it is healthy.
            is_working = (
                    history.last_diagnosis()[0] == status_history.WORKING
                    and not dut.locked)
            bucket = self.working_hosts if is_working else self.broken_hosts
            bucket.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove from a DUT to take it out of
        the pool, or to add to a DUT to put it into the pool.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels


    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The result may be positive or negative.  A positive value is
        the number of spares needed to replace broken DUTs in order
        to reach the target.  A negative value means no spares are
        needed, and that many working DUTs can be returned.

        A target smaller than the number of ineligible DUTs cannot be
        met without returning ineligible DUTs.  In that case an error
        is logged and the target is raised to the number of
        ineligible DUTs.  Either way, the outcome is reported through
        the `exhausted_pools` metric.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        reserved = len(self.ineligible_hosts)
        target_is_reachable = target_total >= reserved

        # TODO(jrbarnette) The 'board' field is a legacy.  We need
        # to leave it here until we do the extra work Monarch
        # requires to delete a field.
        exhausted_metric = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                'True for each pool/model which requests more DUTs than '
                'supplied',
                field_spec=[
                        ts_mon.StringField('pool'),
                        ts_mon.StringField('board'),
                        ts_mon.StringField('model'),
                ])
        model = self.labels.get('model', '')
        exhausted_metric.set(
                not target_is_reachable,
                fields={'pool': self.pool, 'board': model, 'model': model})

        if target_is_reachable:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, reserved)
            _log_error('Adjusting target to %d DUTs.', reserved)
            target_total = reserved

        # Every broken DUT needs a replacement, plus or minus the
        # change in pool size.
        return len(self.broken_hosts) + (target_total - self.total_hosts)


    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be given back in order to
        reduce this pool's supply.  Broken DUTs are preferred over
        working ones.

        `num_broken` is the number of broken DUTs to be left in the
        pool.  If it is at least the number of broken DUTs actually
        in the pool, the result is empty.  If it is negative, all
        broken DUTs are returned, plus that many working DUTs.

        @param num_broken  Total number of broken DUTs to be left in
                           this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 314 | |
| 315 | |
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    Every given host has the pool labels of `spare_pool` removed and
    the pool labels of `target_pool` added; that is, the hosts move
    out of `spare_pool` and into `target_pool`.

    The transfer is logged, and the `duts_moved` metric is incremented
    by the number of hosts, even when `hosts` is empty.

    If `dry_run` is true, no changes are made.  Instead, the `atest`
    commands needed to make the label changes are logged.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object whose pool labels are
                        added to the hosts.
    @param spare_pool   The `_DUTPool` object whose pool labels are
                        removed from the hosts.

    """
    host_count = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              host_count, spare_pool.pool, target_pool.pool)

    # TODO(jrbarnette) The 'board' field is a legacy.  We need to
    # leave it here until we do the extra work Monarch requires to
    # delete a field.
    moved_metric = metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            'DUTs transferred between pools',
            field_spec=[
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
                    ts_mon.StringField('source_pool'),
                    ts_mon.StringField('target_pool'),
            ])
    model = target_pool.labels.get('model', '')
    moved_metric.increment_by(
            host_count,
            fields={
                    'board': model,
                    'model': model,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            })
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    if dry_run:
        # Print the equivalent shell commands instead of acting.
        removal_text = ' '.join(removals)
        addition_text = ' '.join(additions)
        for host in hosts:
            _log_message('atest label remove -m %s %s',
                         host.hostname, removal_text)
            _log_message('atest label add -m %s %s',
                         host.hostname, addition_text)
        return

    for host in hosts:
        _log_message('Updating host: %s.', host.hostname)
        host.remove_labels(removals)
        host.add_labels(additions)
| 372 | |
| 373 | |
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    The work is done in these steps:
      1. Work out the target size of the main pool from the
         `total`, `grow` or `shrink` argument; with none of them,
         the current size is kept.
      2. Choose working DUTs from the spare pool to bring in, and
         surplus DUTs from the main pool to send back.
      3. Report the plan if there is anything to do, or if running
         verbosely.
      4. Cancel the exchange if the main pool has more broken DUTs
         than `max_broken` allows, unless `force_rebalance` is set.
      5. Exchange the pool labels on the chosen DUTs.

    @param arguments   Parsed command line arguments.
    @param afe         AFE object to be used for the changes.
    @param pool        Pool of the model to be balanced.
    @param labels      Restrict the balancing operation within DUTs
                       that have these labels.
    @param start_time  Start time for HostJobHistory objects in
                       the DUT pools.
    @param end_time    End time for HostJobHistory objects in the
                       DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    # `shortfall` is the number of broken DUTs that must stay in the
    # main pool because no spare is available to replace them.  When
    # negative, it is the number of working DUTs to give back.
    spare_duts = []
    shortfall = spares_needed
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        shortfall = spares_needed - len(spare_duts)

    surplus_duts = main_pool.allocate_surplus(shortfall)

    dry_run = arguments.dry_run
    broken_count = len(main_pool.broken_hosts)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  broken_count, len(main_pool.ineligible_hosts))

        if spares_needed == 0:
            add_msg = 'no change to pool size'
        elif spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        else:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall < 0:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      broken_count - shortfall,
                      -shortfall)
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      broken_count - len(surplus_duts))

    # Safety valve: many broken DUTs in one pool suggests a bug that
    # is bricking devices, so do not feed it more spares.
    if broken_count > arguments.max_broken and not arguments.force_rebalance:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, broken_count)
        _log_error('Please investigate this model to for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if not (spare_duts or surplus_duts) and arguments.verbose:
        _log_info(dry_run, 'No exchange required.')

    # Both calls are made even with empty lists, so the transfer is
    # always logged and counted.  Surplus moves from the main pool to
    # the spare pool; spares move the other way.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
| 470 | |
| 471 | |
def _too_many_broken(inventory, pool, args):
    """Check whether too many models in a pool have broken DUTs.

    The check is skipped, returning False, when `force_rebalance` is
    set, or when `all_models` is set with `max_broken_models` of 0.

    When `max_broken_models` is None, the limit defaults to
    `_MAX_BROKEN_DEFAULT_RATIO` times the number of models in the
    pool, rounded down.

    The limit in effect and the models found with broken DUTs are
    logged.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken DUTs
            exceeds the limit, False otherwise.
    """
    # Were we asked to skip this check?
    if args.force_rebalance:
        return False
    if args.all_models and args.max_broken_models == 0:
        return False

    limit = args.max_broken_models
    if limit is None:
        model_count = len(inventory.get_pool_models(pool))
        limit = int(_MAX_BROKEN_DEFAULT_RATIO * model_count)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, limit)

    broken_models = []
    for model, counts in inventory.iteritems():
        if counts.get_broken(pool) != 0:
            broken_models.append(model)
    broken_count = len(broken_models)
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 broken_count, pool, limit)
    for name in sorted(broken_models):
        _log_message(name)
    return broken_count > limit
Kevin Cheng | cf0ad2b | 2016-04-19 14:51:39 -0700 | [diff] [blame] | 504 | |
| 505 | |
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, combinations of arguments that cannot be honored
    are rejected with `parser.error()`, which prints a usage message
    and exits the program:
      * naming individual models together with `--all-models`;
      * choosing a non-default `--spare` pool while balancing all
        critical pools;
      * a spare or target pool name that does not match
        `_VALID_POOL_PATTERN`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )

    # At most one way of changing the pool size may be requested.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
        '--all-models',
        action='store_true',
        help='Rebalance all managed models. This will do a very expensive '
             'check to see how many models have at least one broken DUT. '
             'To bypass that check, set --max-broken-models to 0.',
    )
    # The threshold is inclusive: balancing is refused only when the
    # number of models with broken DUTs is strictly greater than COUNT.
    parser.add_argument(
        '--max-broken-models', default=None, type=int, metavar='COUNT',
        help='Only rebalance all models if number of models with broken '
             'DUTs in the specified pool is at most COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                             'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    # Balancing every critical pool is only allowed with the default
    # spare pool, so any other --spare value is an error.
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for p in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(p):
            parser.error('Invalid pool name: %s' % p)
    return arguments
| 608 | |
| 609 | |
def infer_balancer_targets(afe, arguments, pools):
    """Take some arguments and translate them to a list of models to balance

    With `--all-models`, every model the lab inventory reports for a
    pool becomes a target, unless `_too_many_broken()` says the pool
    has too many models with broken DUTs; in that case the pool is
    skipped and an error is logged.  Without `--all-models`, the models
    named on the command line are targeted in every pool.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns a list of (pool, labels) tuples to be balanced, where
             `labels` is the result of `LabelsMapping.getlabels()`.

    """
    balancer_targets = []
    # `get_inventory()` takes no pool argument, and taking inventory is
    # expensive, so fetch it lazily and at most once for all pools
    # instead of once per pool.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            quarantine = _too_many_broken(inventory, pool, arguments)
            if quarantine:
                _log_error('Refusing to balance all models for %s pool, '
                           'too many models with at least 1 broken DUT '
                           'detected.', pool)
            else:
                for model in inventory.get_pool_models(pool):
                    labels = labellib.LabelsMapping()
                    labels['model'] = model
                    # NOTE(review): unlike the explicit-models branch
                    # below, --sku is not applied here.
                    if arguments.phase:
                        labels['phase'] = arguments.phase
                    balancer_targets.append((pool, labels.getlabels()))
            # Record whether this pool was left untouched.
            metrics.Boolean(
                'chromeos/autotest/balance_pools/unchanged_pools').set(
                    quarantine, fields={'pool': pool})
            _log_message('Pool %s quarantine status: %s', pool, quarantine)
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                if arguments.phase:
                    labels['phase'] = arguments.phase
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets
Jacob Kopczynski | c6e483e | 2017-08-25 17:28:35 -0700 | [diff] [blame] | 652 | |
| 653 | |
def main(argv):
    """Standard main routine.

    Parses the command line, works out the (pool, labels) targets to
    balance, and runs the balancing of each target in a process pool.

    @param argv Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)

    # Only production runs set up ts_mon; any other run gets a
    # do-nothing context manager so the code below is the same.
    if not arguments.production:
        metrics_manager = site_utils.TrivialContextManager()
    else:
        metrics_manager = site_utils.SetupTsMonGlobalState('balance_pools',
                                                           indirect=True)

    with metrics_manager:
        with metrics.SuccessCounter('chromeos/autotest/balance_pools/runs'):
            # The time window handed to the balancer is the 24 hours
            # ending now.
            end_time = time.time()
            start_time = end_time - 24 * 60 * 60
            afe = frontend_wrappers.RetryingAFE(server=arguments.web)

            def balancer(pool, labels):
                """Balance one target.

                @param pool: The pool to rebalance for the model.
                @param labels: labels to restrict to balancing operations
                        within.
                """
                _balance_model(arguments, afe, pool, labels,
                               start_time, end_time)
                # Blank line to separate the output of each target.
                _log_message('')

            if arguments.pool == _ALL_CRITICAL_POOLS:
                pools = lab_inventory.CRITICAL_POOLS
            else:
                pools = [arguments.pool]

            balancer_targets = infer_balancer_targets(afe, arguments, pools)
            try:
                parallel.RunTasksInProcessPool(
                    balancer,
                    balancer_targets,
                    processes=8,
                )
            except KeyboardInterrupt:
                # An interrupt from the user just stops the run.
                pass
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 696 | |
| 697 | |
# Run the command only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)