[autotest] Convert `balance_pool` to Python. This adds a new site_utils/balance_pools.py to replace contrib/balance_pool. The basic features of the two programs are the same; however, the command line syntax of the two programs is not compatible. BUG=None TEST=test w/ -n, plus run for real against cautotest Change-Id: I30cbe5b89e9e0aae49cf92f09ab983c9a17f859f Reviewed-on: https://chromium-review.googlesource.com/267363 Reviewed-by: Dan Shi <dshi@chromium.org> Tested-by: Richard Barnette <jrbarnette@chromium.org> Commit-Queue: Richard Barnette <jrbarnette@chromium.org>

commit: 91d56813e47f97c6b7b633c26c65e83a2d1f285a [log] [tgz]
author: J. Richard Barnette <jrbarnette@chromium.org> Tue Apr 21 10:22:31 2015 -0700
committer: ChromeOS Commit Bot <chromeos-commit-bot@chromium.org> Thu Apr 30 02:31:08 2015 +0000
tree: a8df2527512dd3504471648a743bcb11319ad6f1
parent: 3d0590adb50cd0af0c53007c64eeaa6cddfbf5e9 [diff] [blame]
diff --git a/site_utils/balance_pools.py b/site_utils/balance_pools.py
new file mode 100755
index 0000000..84b8777
--- /dev/null
+++ b/site_utils/balance_pools.py

@@ -0,0 +1,506 @@
+#!/usr/bin/env python
+# Copyright 2015 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Adjust pool balances to cover DUT shortfalls.
+
+This command takes all broken DUTs in a specific pool for specific
+boards and swaps them with working DUTs taken from a selected pool
+of spares.  The command is meant primarily for replacing broken DUTs
+in critical pools like BVT or CQ, but it can also be used to adjust
+pool sizes, or to create or remove pools.
+
+usage:  balance_pools.py [ options ] POOL BOARD [ BOARD ... ]
+
+positional arguments:
+  POOL                  Name of the pool to balance
+  BOARD                 Names of boards to balance
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -t COUNT, --total COUNT
+                        Set the number of DUTs in the pool to the specified
+                        count for every BOARD
+  -a COUNT, --grow COUNT
+                        Add the specified number of DUTs to the pool for every
+                        BOARD
+  -d COUNT, --shrink COUNT
+                        Remove the specified number of DUTs from the pool for
+                        every BOARD
+  -s POOL, --spare POOL
+                        Pool from which to draw replacement spares (default:
+                        pool:suites)
+  -n, --dry-run         Report actions to take in the form of shell commands
+
+
+The command attempts to remove all broken DUTs from the target POOL
+for every BOARD, and replace them with enough working DUTs taken
+from the spare pool to bring the strength of POOL to the requested
+total COUNT.
+
+If no COUNT options are supplied (i.e. there are no --total, --grow,
+or --shrink options), the command will maintain the current totals of
+DUTs for every BOARD in the target POOL.
+
+If not enough working spares are available, broken DUTs may be left
+in the pool to keep the pool at the target COUNT.
+
+When reducing pool size, working DUTs will be returned after broken
+DUTs, if it's necessary to achieve the target COUNT.
+
+"""
+
+
+import argparse
+import sys
+import time
+
+import common
+from autotest_lib.server import frontend
+from autotest_lib.site_utils import status_history
+from autotest_lib.site_utils.suite_scheduler import constants
+
+
+# Label prefix used in this module both to build a pool's AFE label
+# from its name and to recognize which of a host's labels are pool
+# labels.
+_POOL_PREFIX = constants.Labels.POOL_PREFIX
+
+
+def _log_message(message, *args):
+    """Write one unadorned line to stdout.
+
+    The text is %-formatted with `args` only when `args` is
+    non-empty; otherwise `message` is written exactly as given, so
+    a literal `%` is safe in an argument-free call.
+
+    @param message  Text to be logged, or a %-style format string
+                    when `args` are supplied.
+    @param args     Optional format arguments for `message`.
+
+    """
+    text = message % args if args else message
+    sys.stdout.write('%s\n' % text)
+
+
+def _log_info(dry_run, message, *args):
+    """Write an informational line to stdout, dry-run aware.
+
+    For a dry run the line is emitted as a shell comment (prefixed
+    with `# `), so that it can sit alongside the shell commands
+    that a dry run reports.  Otherwise the line is written without
+    adornment.
+
+    If `*args` are supplied, the message will be formatted using
+    the arguments.
+
+    @param dry_run  Whether the logging is for a dry run or for
+                    actual execution.
+    @param message  Message to be logged, possibly after formatting.
+    @param args     Format arguments.  If empty, the message is logged
+                    without formatting.
+
+    """
+    if not dry_run:
+        _log_message(message, *args)
+        return
+    _log_message('# ' + message, *args)
+
+
+def _log_error(message, *args):
+    """Write one line to stderr, marked as an error.
+
+    The line is prefixed with `ERROR: `.  The text is %-formatted
+    with `args` only when `args` is non-empty; otherwise `message`
+    is used exactly as given.
+
+    @param message  Text to be logged, or a %-style format string
+                    when `args` are supplied.
+    @param args     Optional format arguments for `message`.
+
+    """
+    text = message % args if args else message
+    sys.stderr.write('ERROR: %s\n' % text)
+
+
+class _DUTPool(object):
+    """Information about a pool of DUTs for a given board.
+
+    This class collects information about all DUTs for a given
+    board and pool pair, and divides them into three categories:
+      + Working - the DUT is working for testing, and not locked.
+      + Broken - the DUT is unable to run tests, or it is locked.
+      + Ineligible - the DUT is not available to be removed from
+          this pool.  The DUT may be either working or broken.
+
+    DUTs with more than one pool: label are ineligible for exchange
+    during balancing.  This is done for the sake of chameleon hosts,
+    which must always be assigned to pool:suites.  These DUTs are
+    always marked with pool:chameleon to prevent their reassignment.
+
+    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
+    the `chameleon` label) is a hack that should be eliminated.
+
+    _DUTPool instances are used to track both main pools that need
+    to be resupplied with working DUTs and spare pools that supply
+    those DUTs.
+
+    @property board               Name of the board associated with
+                                  this pool of DUTs.
+    @property pool                Name of the pool associated with
+                                  this pool of DUTs.
+    @property _working_hosts      The list of this pool's working
+                                  DUTs.
+    @property _broken_hosts       The list of this pool's broken
+                                  DUTs.
+    @property _ineligible_hosts   The list of this pool's ineligible
+                                  DUTs.
+    @property _labels             A list of labels that identify a DUT
+                                  as part of this pool.
+    @property _total_hosts        The total number of hosts in pool.
+
+    """
+
+
+    def __init__(self, afe, board, pool, start_time, end_time):
+        """Look up and categorize the DUTs for `board` in `pool`.
+
+        @param afe         AFE object used to query host histories.
+        @param board       Name of the board for this pool.
+        @param pool        Name of the pool, without the pool label
+                           prefix.
+        @param start_time  Start time for the HostJobHistory query.
+        @param end_time    End time for the HostJobHistory query.
+
+        """
+        self.board = board
+        self.pool = pool
+        self._working_hosts = []
+        self._broken_hosts = []
+        self._ineligible_hosts = []
+        self._labels = [_POOL_PREFIX + self.pool]
+        self._total_hosts = self._get_hosts(afe, start_time, end_time)
+
+
+    def _get_hosts(self, afe, start_time, end_time):
+        """Fetch this pool's DUTs and sort them into categories.
+
+        Query the job histories for this object's board and pool,
+        and append each history's host to exactly one of the
+        working, broken, or ineligible lists.
+
+        @param afe         AFE object passed to the history query.
+        @param start_time  Start time passed to the history query.
+        @param end_time    End time passed to the history query.
+
+        @return The number of histories returned by the query.
+
+        """
+        all_histories = (
+            status_history.HostJobHistory.get_multiple_histories(
+                    afe, start_time, end_time,
+                    board=self.board, pool=self.pool))
+        for h in all_histories:
+            host = h.host
+            host_pools = [l for l in host.labels
+                          if l.startswith(_POOL_PREFIX)]
+            # Only hosts carrying exactly one pool label may be
+            # exchanged; anything else stays where it is.
+            if len(host_pools) != 1:
+                self._ineligible_hosts.append(host)
+            else:
+                # Only the first element of the diagnosis result is
+                # needed to classify the host.
+                diag = h.last_diagnosis()[0]
+                if (diag == status_history.WORKING and
+                        not host.locked):
+                    self._working_hosts.append(host)
+                else:
+                    self._broken_hosts.append(host)
+        return len(all_histories)
+
+
+    @property
+    def pool_labels(self):
+        """Return the AFE labels that identify this pool.
+
+        The returned labels are the labels that must be removed
+        to remove a DUT from the pool, or added to add a DUT.
+
+        @return A list of AFE labels suitable for AFE.add_labels()
+                or AFE.remove_labels().
+
+        """
+        return self._labels
+
+
+    def calculate_inventory(self, dry_run):
+        """Calculate and log how many DUTs are in this pool.
+
+        Return the total number of DUTs in the pool across all three
+        categories (working, broken, and ineligible).  As a side
+        effect, log the totals.
+
+        @param dry_run Whether the logging is for a dry run or for
+                       actual execution.
+
+        @return The total number of DUTs in this pool.
+
+        """
+        _log_info(dry_run, 'Balancing %s %s pool:',
+                  self.board, self.pool)
+        _log_info(dry_run,
+                  'Total %d DUTs, %d working, %d broken, %d reserved.',
+                  self._total_hosts, len(self._working_hosts),
+                  len(self._broken_hosts), len(self._ineligible_hosts))
+        return self._total_hosts
+
+
+    def calculate_spares_needed(self, dry_run, target_total):
+        """Calculate and log the spares needed to achieve a target.
+
+        Return how many working spares are needed to achieve the
+        given `target_total` with all DUTs working.  Log the
+        adjustments entailed.
+
+        The spares count may be positive or negative.  Positive
+        values indicate spares are needed to replace broken DUTs in
+        order to reach the target; negative numbers indicate that
+        no spares are needed, and that a corresponding number of
+        working devices can be returned.
+
+        If the new target total would require returning ineligible
+        DUTs, an error is logged, and the target total is adjusted
+        so that those DUTs are not exchanged.
+
+        @param dry_run       Whether the logging is for a dry run or
+                             for actual execution.
+        @param target_total  The new target pool size.
+
+        @return The number of spares needed.
+
+        """
+        num_ineligible = len(self._ineligible_hosts)
+        if target_total < num_ineligible:
+            _log_error('%s %s pool: Target of %d is below '
+                       'minimum of %d DUTs.',
+                       self.board, self.pool,
+                       target_total, num_ineligible)
+            _log_error('Adjusting target to %d DUTs.', num_ineligible)
+            target_total = num_ineligible
+        adjustment = target_total - self._total_hosts
+        if adjustment > 0:
+            add_msg = 'grow pool by %d DUTs' % adjustment
+        elif adjustment < 0:
+            add_msg = 'shrink pool by %d DUTs' % -adjustment
+        else:
+            add_msg = 'no change to pool size'
+        _log_info(dry_run, 'Target is %d working DUTs; %s.',
+                  target_total, add_msg)
+        # Every broken DUT needs a replacement, plus or minus the
+        # change in pool size.
+        return len(self._broken_hosts) + adjustment
+
+
+    def allocate_working_spares(self, dry_run, num_requested):
+        """Allocate and log a list of DUTs that can be used as spares.
+
+        Return a list of up to `num_requested` hosts from this
+        pool's list of working hosts.  Log details about this pool's
+        working spares.
+
+        If the requested number of DUTs exceeds the supply, log an
+        error, and return as many working devices as possible.  A
+        request of zero or less returns an empty list.
+
+        @param dry_run       Whether the logging is for a dry run or
+                             for actual execution.
+        @param num_requested Total number of DUTs to allocate from
+                             this pool's working DUTs.
+
+        @return A list of spare DUTs.
+
+        """
+        num_available = len(self._working_hosts)
+        _log_info(dry_run,
+                  '%s %s pool has %d spares available.',
+                  self.board, self.pool, num_available)
+        if num_requested > num_available:
+            _log_error('Not enough spares: need %d, only have %d.',
+                       num_requested, num_available)
+        # Clamp at zero:  a negative slice bound would mean "all but
+        # the last N hosts" rather than "no hosts".
+        return self._working_hosts[:max(num_requested, 0)]
+
+
+    def allocate_surplus(self, dry_run, num_broken):
+        """Allocate and log a list of DUTs that can be returned as surplus.
+
+        Return a list of devices that can be returned in order to
+        reduce this pool's supply.  Broken DUTs will be preferred
+        over working ones.  Log information about the DUTs to be
+        returned.
+
+        The `num_broken` parameter indicates the number of broken
+        DUTs to be left in the pool.  If this number exceeds the
+        number of broken DUTs actually in the pool, the returned
+        list will be empty.  If this number is negative, it
+        indicates a number of working DUTs to be returned in
+        addition to all broken ones.
+
+        @param dry_run       Whether the logging is for a dry run or
+                             for actual execution.
+        @param num_broken    Total number of broken DUTs to be left in
+                             this pool.
+
+        @return A list of DUTs to be returned as surplus.
+
+        """
+        if num_broken >= 0:
+            surplus = self._broken_hosts[num_broken:]
+            _log_info(dry_run,
+                      '%s %s pool will return %d broken DUTs, '
+                      'leaving %d still in the pool.',
+                      self.board, self.pool,
+                      len(surplus),
+                      len(self._broken_hosts) - len(surplus))
+            return surplus
+        else:
+            # -num_broken is the count of working DUTs to give back
+            # on top of every broken one.  Report the sizes of the
+            # lists actually returned, so the log cannot overstate
+            # the transfer if fewer working DUTs exist.
+            working = self._working_hosts[:-num_broken]
+            surplus = self._broken_hosts + working
+            _log_info(dry_run,
+                      '%s %s pool will return %d surplus DUTs, '
+                      'including %d working DUTs.',
+                      self.board, self.pool,
+                      len(surplus),
+                      len(working))
+            return surplus
+
+
+def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
+    """Reassign a list of DUTs from one pool to another.
+
+    For all the given hosts, remove all labels associated with
+    `spare_pool`, and add the labels for `target_pool`.  Log the
+    action.
+
+    If `dry_run` is true, perform no changes, but log the `atest`
+    commands needed to accomplish the necessary label changes.
+
+    Despite the parameter names, the direction of the transfer is
+    determined only by argument position:  hosts always move out of
+    `spare_pool` and into `target_pool`.
+
+    @param dry_run       Whether the logging is for a dry run or
+                         for actual execution.
+    @param hosts         List of DUTs (AFE hosts) to be reassigned.
+                         If empty, nothing is done or logged.
+    @param target_pool   The `_DUTPool` object to which the hosts
+                         will be added.
+    @param spare_pool    The `_DUTPool` object from which the hosts
+                         are drawn.
+
+    """
+    if not hosts:
+        return
+    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
+              len(hosts), spare_pool.pool, target_pool.pool)
+    additions = target_pool.pool_labels
+    removals = spare_pool.pool_labels
+    for host in hosts:
+        if not dry_run:
+            _log_message('Updating host: %s.', host.hostname)
+            host.remove_labels(removals)
+            host.add_labels(additions)
+        else:
+            # Dry run:  emit the equivalent shell commands instead
+            # of touching the host.
+            _log_message('atest label remove -m %s %s',
+                         host.hostname, ' '.join(removals))
+            _log_message('atest label add -m %s %s',
+                         host.hostname, ' '.join(additions))
+
+
+def _balance_board(arguments, afe, board, start_time, end_time):
+    """Balance one board as requested by command line arguments.
+
+    Determine the target size of the main pool from the --total,
+    --grow, or --shrink option (or keep the current size if none
+    was given), then exchange DUTs between the main pool and the
+    spare pool to meet that target.  Whether changes are applied or
+    merely reported is determined by `arguments.dry_run`.
+
+    @param arguments     Parsed command line arguments.
+    @param afe           AFE object to be used for the changes.
+    @param board         Board to be balanced.
+    @param start_time    Start time for HostJobHistory objects in
+                         the DUT pools.
+    @param end_time      End time for HostJobHistory objects in the
+                         DUT pools.
+
+    """
+    spare_pool = _DUTPool(afe, board, arguments.spare,
+                          start_time, end_time)
+    main_pool = _DUTPool(afe, board, arguments.pool,
+                         start_time, end_time)
+
+    target_total = main_pool.calculate_inventory(arguments.dry_run)
+    if arguments.total is not None:
+        target_total = arguments.total
+    elif arguments.grow is not None:
+        target_total += arguments.grow
+    elif arguments.shrink is not None:
+        target_total -= arguments.shrink
+
+    spares_needed = main_pool.calculate_spares_needed(
+            arguments.dry_run, target_total)
+    if spares_needed > 0:
+        spare_duts = spare_pool.allocate_working_spares(
+                arguments.dry_run, spares_needed)
+        # Spares that could not be supplied; that many broken DUTs
+        # must stay in the main pool.
+        shortfall = spares_needed - len(spare_duts)
+    else:
+        # Zero or negative:  no spares are taken, and a negative
+        # value tells allocate_surplus() how many working DUTs to
+        # give back.
+        spare_duts = []
+        shortfall = spares_needed
+
+    surplus_duts = main_pool.allocate_surplus(
+            arguments.dry_run, shortfall)
+    if not spare_duts and not surplus_duts:
+        _log_info(arguments.dry_run, 'No exchange required.')
+        return
+
+    # Surplus DUTs move from the main pool into the spare pool...
+    _exchange_labels(arguments.dry_run, surplus_duts,
+                     spare_pool, main_pool)
+    # ...and working spares move from the spare pool into the main
+    # pool.
+    _exchange_labels(arguments.dry_run, spare_duts,
+                     main_pool, spare_pool)
+
+
+def _parse_command(argv):
+    """Build this command's argument parser and parse `argv`.
+
+    The --total, --grow, and --shrink options are mutually
+    exclusive; each takes an integer COUNT and defaults to `None`.
+
+    @param argv Standard command line argument vector; `argv[0]` is
+                used as the program name, and the remaining
+                elements are parsed.
+
+    @return Result returned by `ArgumentParser.parse_args()`.
+
+    """
+    parser = argparse.ArgumentParser(
+            prog=argv[0],
+            description='Balance pool shortages from spares on reserve')
+
+    # The three COUNT options differ only in their flags and help.
+    count_options = (
+        ('-t', '--total',
+         'Set the number of DUTs in the pool to the specified '
+         'count for every BOARD'),
+        ('-a', '--grow',
+         'Add the specified number of DUTs to the pool for every '
+         'BOARD'),
+        ('-d', '--shrink',
+         'Remove the specified number of DUTs from the pool for '
+         'every BOARD'),
+    )
+    count_group = parser.add_mutually_exclusive_group()
+    for short_flag, long_flag, help_text in count_options:
+        count_group.add_argument(short_flag, long_flag, type=int,
+                                 metavar='COUNT', default=None,
+                                 help=help_text)
+
+    parser.add_argument(
+            '-s', '--spare', default='suites', metavar='POOL',
+            help='Pool from which to draw replacement spares '
+                 '(default: pool:suites)')
+    parser.add_argument(
+            '-n', '--dry-run', action='store_true',
+            help='Report actions to take in the form of shell '
+                 'commands')
+
+    parser.add_argument(
+            'pool', metavar='POOL',
+            help='Name of the pool to balance')
+    parser.add_argument(
+            'boards', nargs='+', metavar='BOARD',
+            help='Names of boards to balance')
+
+    return parser.parse_args(argv[1:])
+
+
+def main(argv):
+    """Standard main routine.
+
+    Parse the command line, then balance each requested board in
+    turn, using host history from the 24 hours ending now.  The
+    output for consecutive boards is separated by a blank line.  A
+    keyboard interrupt ends the run quietly.
+
+    @param argv  Command line arguments including `sys.argv[0]`.
+
+    """
+    arguments = _parse_command(argv)
+    seconds_per_day = 24 * 60 * 60
+    end_time = time.time()
+    start_time = end_time - seconds_per_day
+
+    try:
+        afe = frontend.AFE(server=None)
+        for index, board in enumerate(arguments.boards):
+            # Blank separator before every board except the first.
+            if index:
+                _log_message('')
+            _balance_board(arguments, afe, board, start_time, end_time)
+    except KeyboardInterrupt:
+        pass
+
+
+# Run the command only when executed as a script, passing the full
+# argument vector (including the program name) to main().
+if __name__ == '__main__':
+    main(sys.argv)
commit	91d56813e47f97c6b7b633c26c65e83a2d1f285a	[log] [tgz]
author	J. Richard Barnette <jrbarnette@chromium.org>	Tue Apr 21 10:22:31 2015 -0700
committer	ChromeOS Commit Bot <chromeos-commit-bot@chromium.org>	Thu Apr 30 02:31:08 2015 +0000
tree	a8df2527512dd3504471648a743bcb11319ad6f1
parent	3d0590adb50cd0af0c53007c64eeaa6cddfbf5e9 [diff] [blame]