Add metrics to balancer, remove board indices.

Boolean metrics for boards and pools depleted of capacity and for pools which are quarantined.
Counter metric for moved DUTs parameterized by board, source pool, and target pool.
No longer assign indices to boards while balancing them.

BUG=chromium:746067
TEST=ran 'balance_pool --all-boards suites'

Change-Id: Ic42ddd99b45b7ee5d6cbf40dde5b566dcce9f828
Reviewed-on: https://chromium-review.googlesource.com/639692
Commit-Ready: Jacob Kopczynski <jkop@chromium.org>
Tested-by: Jacob Kopczynski <jkop@chromium.org>
Reviewed-by: Paul Hobbs <phobbs@google.com>

commit: c6e483e1d707f473d54b85e6f4531b300b3f6b29 [log] [tgz]
author: Jacob Kopczynski <jkop@chromium.org> Fri Aug 25 17:28:35 2017 -0700
committer: chrome-bot <chrome-bot@chromium.org> Thu Oct 05 21:24:42 2017 -0700
tree: 9b4790202d4c7b869123a85c568a4536f3dcf9da
parent: 0a2ea5790eee1427336114f18aa4288520ccd910 [diff]
diff --git a/site_utils/balance_pools.py b/site_utils/balance_pools.py
index c9ca4ed..c7a9b40 100755
--- a/site_utils/balance_pools.py
+++ b/site_utils/balance_pools.py

@@ -58,12 +58,18 @@
 
 import common
 from autotest_lib.server import frontend
+from autotest_lib.server import site_utils
 from autotest_lib.server.lib import status_history
 from autotest_lib.site_utils import lab_inventory
 from autotest_lib.site_utils.suite_scheduler import constants
-
+from chromite.lib import metrics
 from chromite.lib import parallel
 
+try:
+  from infra_libs import ts_mon
+except (ImportError, RuntimeError):
+  import mock
+  ts_mon = mock.Mock()
 
 _POOL_PREFIX = constants.Labels.POOL_PREFIX
 # This is the ratio of all boards we should calculate the default max number of
@@ -236,7 +242,8 @@
 
         """
         num_ineligible = len(self.ineligible_hosts)
-        if target_total < num_ineligible:
+        spares_needed = target_total >= num_ineligible
+        if not spares_needed:
             _log_error('%s %s pool: Target of %d is below '
                        'minimum of %d DUTs.',
                        self.board, self.pool,
@@ -244,6 +251,14 @@
             _log_error('Adjusting target to %d DUTs.', num_ineligible)
             target_total = num_ineligible
         adjustment = target_total - self.total_hosts
+        metrics.Boolean(
+            'chromeos/autotest/balance_pools/exhausted_pools',
+            "True for each pool/board which requests more DUTs than supplied",
+            field_spec=[
+                ts_mon.StringField('pool'), ts_mon.StringField('board')]).set(
+                    not spares_needed,
+                    fields={'pool': self.pool, 'board': self.board}
+                )
         return len(self.broken_hosts) + adjustment
 
     def allocate_surplus(self, num_broken):
@@ -296,6 +311,16 @@
         return
     _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
               len(hosts), spare_pool.pool, target_pool.pool)
+    metrics.Counter(
+        'chromeos/autotest/balance_pools/duts_moved',
+        "DUTs transferred between pools",
+        field_spec=[ts_mon.StringField('board'),
+                    ts_mon.StringField('source_pool'),
+                    ts_mon.StringField('target_pool')]
+    ).increment_by(len(hosts),
+                   fields={'board': target_pool.board,
+                           'source_pool': spare_pool.pool,
+                           'target_pool': target_pool.pool})
     additions = target_pool.pool_labels
     removals = spare_pool.pool_labels
     for host in hosts:
@@ -441,7 +466,6 @@
                   '%s pool',
                   max_broken_boards, pool)
 
-
     broken_boards = [board for board, counts in inventory.items()
                      if counts.get_broken(pool) != 0]
     broken_boards.sort()
@@ -544,55 +568,74 @@
     return arguments
 
 
-def main(argv):
-    """Standard main routine.
+def specify_balance_args(afe, arguments, pools):
+    """Take some arguments and translate them to a list of boards to balance
 
-    @param argv  Command line arguments including `sys.argv[0]`.
+    Args:
+    @param afe           AFE object to be used for taking inventory.
+    @param arguments     Parsed command line arguments.
+    @param pools         The list of pools to balance.
+
+    @returns    a list of (board, pool) pairs to be balanced
 
     """
-    def balancer(i, board, pool):
-      """Balance the specified board.
-
-      @param i The index of the board.
-      @param board The board name.
-      @param pool The pool to rebalance for the board.
-      """
-      if i > 0:
-          _log_message('')
-      _balance_board(arguments, afe, board, pool, start_time, end_time)
-
-    arguments = _parse_command(argv)
-    end_time = time.time()
-    start_time = end_time - 24 * 60 * 60
-    afe = frontend.AFE(server=None)
-    boards = arguments.boards
-    pools = (lab_inventory.CRITICAL_POOLS
-             if arguments.pool == _ALL_CRITICAL_POOLS
-             else [arguments.pool])
     board_info = []
+    boards = arguments.boards
     if arguments.all_boards:
         inventory = lab_inventory.get_inventory(afe)
         for pool in pools:
-            if _too_many_broken_boards(inventory, pool, arguments):
+            quarantine = _too_many_broken_boards(inventory, pool, arguments)
+            if quarantine:
                 _log_error('Refusing to balance all boards for %s pool, '
                            'too many boards with at least 1 broken DUT '
                            'detected.', pool)
             else:
                 boards_in_pool = inventory.get_managed_boards(pool=pool)
                 current_len_board_info = len(board_info)
-                board_info.extend([(i + current_len_board_info, board, pool)
-                                   for i, board in enumerate(boards_in_pool)])
+                board_info.extend([(board, pool) for board in boards_in_pool])
+            metrics.Boolean(
+                'chromeos/autotest/balance_pools/unchanged_pools').set(
+                    quarantine, fields={'pool': pool})
     else:
-        # We have specified boards with a specified pool, setup the args to the
-        # balancer properly.
+        # We have specified boards with a specified pool, setup the args to
+        # the balancer properly.
         for pool in pools:
             current_len_board_info = len(board_info)
-            board_info.extend([(i + current_len_board_info, board, pool)
-                               for i, board in enumerate(boards)])
-    try:
-        parallel.RunTasksInProcessPool(balancer, board_info, processes=8)
-    except KeyboardInterrupt:
-        pass
+            board_info.extend([(board, pool) for board in boards])
+    return board_info
+
+
+def main(argv):
+    """Standard main routine.
+
+    @param argv  Command line arguments including `sys.argv[0]`.
+
+    """
+    def balancer(board, pool):
+      """Balance the specified board.
+
+      @param board The board name.
+      @param pool The pool to rebalance for the board.
+      """
+      _balance_board(arguments, afe, board, pool, start_time, end_time)
+      _log_message('')
+
+    with site_utils.SetupTsMonGlobalState('balance_pools',
+                                          short_lived=True,
+                                          auto_flush=False):
+        arguments = _parse_command(argv)
+        end_time = time.time()
+        start_time = end_time - 24 * 60 * 60
+        afe = frontend.AFE(server=None)
+        pools = (lab_inventory.CRITICAL_POOLS
+                 if arguments.pool == _ALL_CRITICAL_POOLS
+                 else [arguments.pool])
+        board_info = specify_balance_args(afe, arguments, pools)
+        try:
+            parallel.RunTasksInProcessPool(balancer, board_info, processes=8)
+        except KeyboardInterrupt:
+            pass
+        metrics.Flush()
 
 
 if __name__ == '__main__':
commit	c6e483e1d707f473d54b85e6f4531b300b3f6b29	[log] [tgz]
author	Jacob Kopczynski <jkop@chromium.org>	Fri Aug 25 17:28:35 2017 -0700
committer	chrome-bot <chrome-bot@chromium.org>	Thu Oct 05 21:24:42 2017 -0700
tree	9b4790202d4c7b869123a85c568a4536f3dcf9da
parent	0a2ea5790eee1427336114f18aa4288520ccd910 [diff]