J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright 2015 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """Adjust pool balances to cover DUT shortfalls. |
| 7 | |
| 8 | This command takes all broken DUTs in a specific pool for specific |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 9 | models and swaps them with working DUTs taken from a selected pool |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 10 | of spares. The command is meant primarily for replacing broken DUTs |
| 11 | in critical pools like BVT or CQ, but it can also be used to adjust |
| 12 | pool sizes, or to create or remove pools. |
| 13 | |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 14 | usage: balance_pool.py [ options ] POOL MODEL [ MODEL ... ] |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 15 | |
| 16 | positional arguments: |
| 17 | POOL Name of the pool to balance |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 18 | MODEL Names of models to balance |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 19 | |
| 20 | optional arguments: |
| 21 | -h, --help show this help message and exit |
| 22 | -t COUNT, --total COUNT |
| 23 | Set the number of DUTs in the pool to the specified |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 24 | count for every MODEL |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 25 | -a COUNT, --grow COUNT |
| 26 | Add the specified number of DUTs to the pool for every |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 27 | MODEL |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 28 | -d COUNT, --shrink COUNT |
| 29 | Remove the specified number of DUTs from the pool for |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 30 | every MODEL |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 31 | -s POOL, --spare POOL |
| 32 | Pool from which to draw replacement spares (default: |
| 33 | pool:suites) |
Chung-yih Wang | cc1d9cb | 2017-11-30 11:20:45 +0800 | [diff] [blame] | 34 | --sku SKU The specific SKU we intend to swap with |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 35 | -n, --dry-run Report actions to take in the form of shell commands |
| 36 | |
| 37 | |
| 38 | The command attempts to remove all broken DUTs from the target POOL |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 39 | for every MODEL, and replace them with enough working DUTs taken |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 40 | from the spare pool to bring the strength of POOL to the requested |
| 41 | total COUNT. |
| 42 | |
| 43 | If no COUNT options are supplied (i.e. there are no --total, --grow, |
| 44 | or --shrink options), the command will maintain the current totals of |
Richard Barnette | 5de01eb | 2017-12-15 09:53:42 -0800 | [diff] [blame] | 45 | DUTs for every MODEL in the target POOL. |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 46 | |
| 47 | If not enough working spares are available, broken DUTs may be left |
| 48 | in the pool to keep the pool at the target COUNT. |
| 49 | |
| 50 | When reducing pool size, working DUTs will be returned after broken |
| 51 | DUTs, if it's necessary to achieve the target COUNT. |
| 52 | |
| 53 | """ |
| 54 | |
| 55 | |
| 56 | import argparse |
| 57 | import sys |
| 58 | import time |
| 59 | |
| 60 | import common |
Xixuan Wu | 93e646c | 2017-12-07 18:36:10 -0800 | [diff] [blame] | 61 | from autotest_lib.server import constants |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 62 | from autotest_lib.server import frontend |
Jacob Kopczynski | c6e483e | 2017-08-25 17:28:35 -0700 | [diff] [blame] | 63 | from autotest_lib.server import site_utils |
Aviv Keshet | 7ee9586 | 2016-08-30 15:18:27 -0700 | [diff] [blame] | 64 | from autotest_lib.server.lib import status_history |
Kevin Cheng | cf0ad2b | 2016-04-19 14:51:39 -0700 | [diff] [blame] | 65 | from autotest_lib.site_utils import lab_inventory |
Prathmesh Prabhu | 68acc40 | 2017-11-09 15:24:15 -0800 | [diff] [blame] | 66 | from autotest_lib.utils import labellib |
Jacob Kopczynski | c6e483e | 2017-08-25 17:28:35 -0700 | [diff] [blame] | 67 | from chromite.lib import metrics |
David James | 2a3cb54 | 2015-05-05 17:13:43 -0700 | [diff] [blame] | 68 | from chromite.lib import parallel |
| 69 | |
Jacob Kopczynski | f8d90a8 | 2017-10-10 14:37:33 -0700 | [diff] [blame] | 70 | #This must be imported after chromite.lib.metrics |
| 71 | from infra_libs import ts_mon |
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 72 | |
# Prefix shared by every AFE label that assigns a DUT to a pool.
_POOL_PREFIX = constants.Labels.POOL_PREFIX

# Fraction of the models in a pool that is used to derive the default
# limit on how many models may have broken DUTs.  It seemed like the
# best choice that was neither too strict nor too lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# NOTE(review): not referenced in the visible part of this file;
# presumably a special value accepted for the POOL argument - confirm.
_ALL_CRITICAL_POOLS = 'all_critical_pools'

# Default for the `--spare` option: the pool that supplies replacements.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 81 | |
| 82 | |
def _log_message(message, *args):
    """Write one unadorned line to stdout.

    When format arguments are given, `message` is treated as a
    printf-style format string and expanded with them.  When no
    arguments are given, `message` is written verbatim, so a
    literal `%` in it is not interpreted.

    @param message  Text (or format string) to be written.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
| 100 | |
| 101 | |
def _log_info(dry_run, message, *args):
    """Write an informational line to stdout, dry-run aware.

    For a dry run the line is emitted as a shell comment (prefixed
    with `# `), so that it can sit alongside the shell commands that
    a dry run reports.  Otherwise the line is written unadorned.

    Formatting of `message` with `*args` is delegated to
    `_log_message()`.

    @param dry_run  Whether the message is being logged for a dry run.
    @param message  Text (or format string) to be logged.
    @param args     Optional format arguments for `message`.

    """
    _log_message('# ' + message if dry_run else message, *args)
| 120 | |
| 121 | |
def _log_error(message, *args):
    """Write one error line to stderr.

    The line is prefixed with `ERROR: `.  When format arguments are
    given, `message` is treated as a printf-style format string and
    expanded with them; otherwise it is written verbatim.

    @param message  Text (or format string) describing the error.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
| 139 | |
| 140 | |
class _DUTPool(object):
    """The DUTs of one pool that also match a given set of labels.

    On construction, the job history of every matching DUT is fetched
    and each DUT is sorted into exactly one of three lists:
      + Working - the DUT is diagnosed as working, and is not locked.
      + Broken - the DUT is not diagnosed as working, or it is locked.
      + Ineligible - the DUT may not be moved out of this pool,
        whether it is working or broken.

    A DUT is ineligible when it does not carry exactly one `pool:`
    label.  This is done for the sake of chameleon hosts, which must
    always be assigned to pool:suites.  These DUTs are always marked
    with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    The same class describes both the main pool that needs to be
    resupplied with working DUTs and the spare pool that supplies
    them.

    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property labels            Labels that constrain the DUTs to
                                consider (including the pool itself).
    @property working_hosts     The list of this pool's working DUTs.
    @property broken_hosts      The list of this pool's broken DUTs.
    @property ineligible_hosts  The list of this pool's ineligible DUTs.
    @property pool_labels       A list of labels that identify a DUT
                                as part of this pool.
    @property total_hosts       The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        """Look up and classify the DUTs of `pool` matching `labels`.

        @param afe         AFE object used to query the DUTs.
        @param pool        Name of the pool, without the label prefix.
        @param labels      Labels restricting which DUTs are considered.
        @param start_time  Start time for the HostJobHistory lookup.
        @param end_time    End time for the HostJobHistory lookup.

        """
        self.pool = pool
        self._pool_labels = [_POOL_PREFIX + pool]

        # The pool is folded into the label constraints so that the
        # history query only returns DUTs that belong to this pool.
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool

        # These lists must exist before _get_hosts() fills them in.
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Fetch the matching DUTs and sort them into the host lists.

        @param afe         AFE object used to query the DUTs.
        @param start_time  Start time for the HostJobHistory lookup.
        @param end_time    End time for the HostJobHistory lookup.

        @return The number of histories (i.e. DUTs) that were found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            num_pools = sum(1 for label in dut.labels
                            if label.startswith(_POOL_PREFIX))
            if num_pools != 1:
                self.ineligible_hosts.append(dut)
                continue
            status = history.last_diagnosis()[0]
            # A locked DUT counts as broken even when diagnosed working.
            if status != status_history.WORKING or dut.locked:
                self.broken_hosts.append(dut)
            else:
                self.working_hosts.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove in order to take a DUT out of
        the pool, or to add in order to put a DUT into it.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The result may be positive or negative.  A positive value is
        the number of spares needed to replace broken DUTs in order to
        reach the target.  A negative value means no spares are
        needed, and that the corresponding number of working DUTs can
        be returned.

        If the target is smaller than the number of ineligible DUTs,
        an error is logged and the target is raised to that number,
        so that ineligible DUTs are never exchanged.  A metric that
        records whether the target had to be raised is updated on
        every call.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        below_minimum = target_total < num_ineligible
        model = self.labels.get('model', '')
        exhausted_metric = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                'True for each pool/model which requests more DUTs than supplied',
                # TODO(jrbarnette) The 'board' field is a legacy.  We need
                # to leave it here until we do the extra work Monarch
                # requires to delete a field.
                field_spec=[
                        ts_mon.StringField('pool'),
                        ts_mon.StringField('board'),
                        ts_mon.StringField('model'),
                ])
        exhausted_metric.set(
                below_minimum,
                fields={
                        'pool': self.pool,
                        'board': model,
                        'model': model,
                },
        )
        if below_minimum:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, num_ineligible,
            )
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        else:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        # Every broken DUT needs a replacement; the change in pool
        # size is added on top of that (and may be negative).
        return len(self.broken_hosts) + (target_total - self.total_hosts)

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be given back in order to
        reduce this pool's supply.  Broken DUTs are preferred over
        working ones.

        `num_broken` is the number of broken DUTs to be left in the
        pool.  If it is at least the number of broken DUTs actually
        in the pool, the returned list is empty.  If it is negative,
        its magnitude is the number of working DUTs to be returned in
        addition to all the broken ones.

        @param num_broken  Total number of broken DUTs to be left in
                           this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 299 | |
| 300 | |
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.  That is,
    the hosts move out of `spare_pool` and into `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The transfer is logged and the `duts_moved` metric is updated
    even when `hosts` is empty (the metric is then incremented by 0).

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object to which the hosts
                        will be added.
    @param spare_pool   The `_DUTPool` object from which the hosts
                        are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            'DUTs transferred between pools',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need to
            # leave it here until we do the extra work Monarch requires to
            # delete a field.
            field_spec=[
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
                    ts_mon.StringField('source_pool'),
                    ts_mon.StringField('target_pool'),
            ]
    ).increment_by(
            len(hosts),
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            # The old pool label is dropped before the new one is added.
            host.remove_labels(removals)
            host.add_labels(additions)
        else:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))
| 357 | |
| 358 | |
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    The target size of the pool comes from `arguments.total`,
    `arguments.grow` or `arguments.shrink`; with none of them given,
    the current size is kept.  Working DUTs from the spare pool
    (`arguments.spare`) replace broken DUTs in `pool`, and surplus
    DUTs are handed back to the spare pool.

    No DUTs are exchanged when the pool has more than
    `arguments.max_broken` broken DUTs, unless
    `arguments.force_rebalance` is set.

    @param arguments   Parsed command line arguments.
    @param afe         AFE object to be used for the changes.
    @param pool        Pool of the model to be balanced.
    @param labels      Restrict the balancing operation within DUTs
                       that have these labels.
    @param start_time  Start time for HostJobHistory objects in
                       the DUT pools.
    @param end_time    End time for HostJobHistory objects in the
                       DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)
    num_broken = len(main_pool.broken_hosts)

    # An explicit total wins; otherwise adjust the current pool size.
    if arguments.total is not None:
        target_total = arguments.total
    else:
        target_total = main_pool.total_hosts
        if arguments.grow:
            target_total += arguments.grow
        elif arguments.shrink:
            target_total -= arguments.shrink

    # `shortfall` is the number of broken DUTs that must stay in the
    # main pool because no spare can replace them.  When negative, its
    # magnitude is the number of working DUTs to hand back.
    spares_needed = main_pool.calculate_spares_needed(target_total)
    spare_duts = []
    shortfall = spares_needed
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        shortfall -= len(spare_duts)

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  num_broken, len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        num_spares = len(spare_duts)
        if spares_needed > num_spares:
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, num_spares)
        elif shortfall >= 0:
            num_surplus = len(surplus_duts)
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      num_surplus, num_broken - num_surplus)
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      num_broken - shortfall, -shortfall)

    if num_broken > arguments.max_broken and not arguments.force_rebalance:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, num_broken)
        for line in ('Please investigate this model to for a bug ',
                     'that is bricking devices. Once you have finished your ',
                     'investigation, you can force a rebalance with ',
                     '--force-rebalance'):
            _log_error(line)
        # Emptying both lists turns the exchanges below into no-ops.
        spare_duts = []
        surplus_duts = []

    if arguments.verbose and not (spare_duts or surplus_duts):
        _log_info(arguments.dry_run, 'No exchange required.')

    # Both exchanges are always invoked, even with empty lists.
    # First give the surplus back to the spare pool, then pull the
    # replacement spares into the main pool.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)
| 455 | |
| 456 | |
def _too_many_broken(inventory, pool, args):
    """Check whether too many models in a pool have broken DUTs.

    The check is skipped (and reports False) when
    `args.force_rebalance` is set, or when `args.all_models` is set
    together with `args.max_broken_models == 0`.

    When `args.max_broken_models` is None, the limit defaults to
    `_MAX_BROKEN_DEFAULT_RATIO` times the number of models in the
    pool, truncated to an integer.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken DUTs
            in `pool` exceeds the limit, False otherwise.
    """
    skip_check = (args.force_rebalance or
                  (args.all_models and args.max_broken_models == 0))
    if skip_check:
        return False

    limit = args.max_broken_models
    if limit is None:
        num_models = len(inventory.get_pool_models(pool))
        limit = int(_MAX_BROKEN_DEFAULT_RATIO * num_models)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, limit)

    broken_models = []
    for model, counts in inventory.iteritems():
        if counts.get_broken(pool) != 0:
            broken_models.append(model)
    num_broken_models = len(broken_models)
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 num_broken_models, pool, limit)
    for name in sorted(broken_models):
        _log_message(name)
    return num_broken_models > limit
Kevin Cheng | cf0ad2b | 2016-04-19 14:51:39 -0700 | [diff] [blame] | 489 | |
| 490 | |
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, two cross-option constraints are checked.  A
    violation is reported through `ArgumentParser.error()`, which
    prints a usage message and exits:
      * Individual MODEL arguments cannot be combined with
        `--all-models`.
      * When POOL is `_ALL_CRITICAL_POOLS`, `--spare` must be left at
        `_SPARE_DEFAULT`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
        prog=argv[0],
        description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )

    # At most one of --total/--grow/--shrink may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
        '--all-models',
        action='store_true',
        help='Rebalance all managed models. This will do a very expensive '
             'check to see how many models have at least one broken DUT. '
             'To bypass that check, set --max-broken-models to 0.',
    )
    # The threshold check refuses to balance only when the number of
    # models with broken DUTs is strictly greater than COUNT, so the
    # help text says "at most".
    parser.add_argument(
        '--max-broken-models', default=None, type=int, metavar='COUNT',
        help='Only rebalance all models if number of models with broken '
             'DUTs in the specified pool is at most COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # The failure is a spare pool that differs from the default, so
        # the message names the only value that is accepted.
        parser.error('Cannot specify --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    return arguments
| 586 | |
| 587 | |
def infer_balancer_targets(afe, arguments, pools):
    """Translate the parsed arguments into a list of balancing targets.

    For every pool in `pools`:
      * Without `--all-models`, one target is produced per model named
        on the command line; the labels also carry the sku when
        `--sku` was given.
      * With `--all-models`, a lab inventory is taken and checked with
        `_too_many_broken()`.  If the check trips, an error is logged
        and no targets are added for that pool; otherwise one target
        is produced per model the inventory reports for the pool.  In
        both cases the outcome is recorded in the `unchanged_pools`
        metric and logged.

    @param afe        AFE object to be used for taking inventory.
    @param arguments  Parsed command line arguments.
    @param pools      The list of pools to balance.

    @returns A list of (pool, labels) tuples to be balanced.

    """
    targets = []

    for pool in pools:
        if not arguments.all_models:
            # Explicit models from the command line.
            for model_name in arguments.models:
                mapping = labellib.LabelsMapping()
                mapping['model'] = model_name
                if arguments.sku:
                    mapping['sku'] = arguments.sku
                targets.append((pool, mapping.getlabels()))
            continue

        inventory = lab_inventory.get_inventory(afe)
        quarantined = _too_many_broken(inventory, pool, arguments)
        if not quarantined:
            for model_name in inventory.get_pool_models(pool):
                mapping = labellib.LabelsMapping()
                mapping['model'] = model_name
                targets.append((pool, mapping.getlabels()))
        else:
            _log_error('Refusing to balance all models for %s pool, '
                       'too many models with at least 1 broken DUT '
                       'detected.', pool)

        quarantine_metric = metrics.Boolean(
            'chromeos/autotest/balance_pools/unchanged_pools')
        quarantine_metric.set(quarantined, fields={'pool': pool})
        _log_message('Pool %s quarantine status: %s', pool, quarantined)

    return targets
Jacob Kopczynski | c6e483e | 2017-08-25 17:28:35 -0700 | [diff] [blame] | 626 | |
| 627 | |
def main(argv):
    """Run the pool balancer as a command line program.

    Parses `argv`, sets up metrics reporting when `--production` was
    given (a no-op context manager is used otherwise), works out the
    (pool, labels) targets to balance, and balances them through a
    pool of 8 worker processes.  A `KeyboardInterrupt` raised while
    the workers run is swallowed.

    @param argv Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    metrics_manager = (
        site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
        if arguments.production
        else site_utils.TrivialContextManager())

    with metrics_manager, \
            metrics.SuccessCounter('chromeos/autotest/balance_pools/runs'):
        # The time window handed to _balance_model() is the 24 hours
        # ending now.
        end_time = time.time()
        start_time = end_time - 24 * 60 * 60
        afe = frontend.AFE(server=arguments.web)

        def balance_target(pool, labels):
            """Balance one (pool, labels) target.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]

        balancer_targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(
                balance_target,
                balancer_targets,
                processes=8,
            )
        except KeyboardInterrupt:
            # Ignored on purpose: an interrupt ends the run without a
            # traceback.
            pass
J. Richard Barnette | 91d5681 | 2015-04-21 10:22:31 -0700 | [diff] [blame] | 670 | |
| 671 | |
# Script entry point: only runs when the file is executed directly, and
# passes the full argument vector (including the command name) to main().
if __name__ == '__main__':
    main(sys.argv)