Add metrics to balancer, remove board indices.
Boolean metrics for boards and pools depleted of capacity and for pools which
are quarantined.
Counter metric for moved DUTs parameterized by board, source pool, and target
pool.
No longer assign indices to boards while balancing them.
BUG=chromium:746067
TEST=ran 'balance_pool --all-boards suites'
Change-Id: Ic42ddd99b45b7ee5d6cbf40dde5b566dcce9f828
Reviewed-on: https://chromium-review.googlesource.com/639692
Commit-Ready: Jacob Kopczynski <jkop@chromium.org>
Tested-by: Jacob Kopczynski <jkop@chromium.org>
Reviewed-by: Paul Hobbs <phobbs@google.com>
diff --git a/site_utils/balance_pools.py b/site_utils/balance_pools.py
index c9ca4ed..c7a9b40 100755
--- a/site_utils/balance_pools.py
+++ b/site_utils/balance_pools.py
@@ -58,12 +58,18 @@
import common
from autotest_lib.server import frontend
+from autotest_lib.server import site_utils
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import lab_inventory
from autotest_lib.site_utils.suite_scheduler import constants
-
+from chromite.lib import metrics
from chromite.lib import parallel
+try:
+ from infra_libs import ts_mon
+except (ImportError, RuntimeError):
+ import mock
+ ts_mon = mock.Mock()
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all boards we should calculate the default max number of
@@ -236,7 +242,8 @@
"""
num_ineligible = len(self.ineligible_hosts)
- if target_total < num_ineligible:
+ spares_needed = target_total >= num_ineligible
+ if not spares_needed:
_log_error('%s %s pool: Target of %d is below '
'minimum of %d DUTs.',
self.board, self.pool,
@@ -244,6 +251,14 @@
_log_error('Adjusting target to %d DUTs.', num_ineligible)
target_total = num_ineligible
adjustment = target_total - self.total_hosts
+ metrics.Boolean(
+ 'chromeos/autotest/balance_pools/exhausted_pools',
+ "True for each pool/board which requests more DUTs than supplied",
+ field_spec=[
+ ts_mon.StringField('pool'), ts_mon.StringField('board')]).set(
+ not spares_needed,
+ fields={'pool': self.pool, 'board': self.board}
+ )
return len(self.broken_hosts) + adjustment
def allocate_surplus(self, num_broken):
@@ -296,6 +311,16 @@
return
_log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
len(hosts), spare_pool.pool, target_pool.pool)
+ metrics.Counter(
+ 'chromeos/autotest/balance_pools/duts_moved',
+ "DUTs transferred between pools",
+ field_spec=[ts_mon.StringField('board'),
+ ts_mon.StringField('source_pool'),
+ ts_mon.StringField('target_pool')]
+ ).increment_by(len(hosts),
+ fields={'board': target_pool.board,
+ 'source_pool': spare_pool.pool,
+ 'target_pool': target_pool.pool})
additions = target_pool.pool_labels
removals = spare_pool.pool_labels
for host in hosts:
@@ -441,7 +466,6 @@
'%s pool',
max_broken_boards, pool)
-
broken_boards = [board for board, counts in inventory.items()
if counts.get_broken(pool) != 0]
broken_boards.sort()
@@ -544,55 +568,74 @@
return arguments
-def main(argv):
- """Standard main routine.
+def specify_balance_args(afe, arguments, pools):
+ """Take some arguments and translate them to a list of boards to balance
- @param argv Command line arguments including `sys.argv[0]`.
+ Args:
+ @param afe AFE object to be used for taking inventory.
+ @param arguments Parsed command line arguments.
+ @param pools The list of pools to balance.
+
+ @returns a list of (board, pool) pairs to be balanced
"""
- def balancer(i, board, pool):
- """Balance the specified board.
-
- @param i The index of the board.
- @param board The board name.
- @param pool The pool to rebalance for the board.
- """
- if i > 0:
- _log_message('')
- _balance_board(arguments, afe, board, pool, start_time, end_time)
-
- arguments = _parse_command(argv)
- end_time = time.time()
- start_time = end_time - 24 * 60 * 60
- afe = frontend.AFE(server=None)
- boards = arguments.boards
- pools = (lab_inventory.CRITICAL_POOLS
- if arguments.pool == _ALL_CRITICAL_POOLS
- else [arguments.pool])
board_info = []
+ boards = arguments.boards
if arguments.all_boards:
inventory = lab_inventory.get_inventory(afe)
for pool in pools:
- if _too_many_broken_boards(inventory, pool, arguments):
+ quarantine = _too_many_broken_boards(inventory, pool, arguments)
+ if quarantine:
_log_error('Refusing to balance all boards for %s pool, '
'too many boards with at least 1 broken DUT '
'detected.', pool)
else:
boards_in_pool = inventory.get_managed_boards(pool=pool)
current_len_board_info = len(board_info)
- board_info.extend([(i + current_len_board_info, board, pool)
- for i, board in enumerate(boards_in_pool)])
+ board_info.extend([(board, pool) for board in boards_in_pool])
+ metrics.Boolean(
+ 'chromeos/autotest/balance_pools/unchanged_pools').set(
+ quarantine, fields={'pool': pool})
else:
- # We have specified boards with a specified pool, setup the args to the
- # balancer properly.
+ # We have specified boards with a specified pool, setup the args to
+ # the balancer properly.
for pool in pools:
current_len_board_info = len(board_info)
- board_info.extend([(i + current_len_board_info, board, pool)
- for i, board in enumerate(boards)])
- try:
- parallel.RunTasksInProcessPool(balancer, board_info, processes=8)
- except KeyboardInterrupt:
- pass
+ board_info.extend([(board, pool) for board in boards])
+ return board_info
+
+
+def main(argv):
+ """Standard main routine.
+
+ @param argv Command line arguments including `sys.argv[0]`.
+
+ """
+ def balancer(board, pool):
+ """Balance the specified board.
+
+ @param board The board name.
+ @param pool The pool to rebalance for the board.
+ """
+ _balance_board(arguments, afe, board, pool, start_time, end_time)
+ _log_message('')
+
+ with site_utils.SetupTsMonGlobalState('balance_pools',
+ short_lived=True,
+ auto_flush=False):
+ arguments = _parse_command(argv)
+ end_time = time.time()
+ start_time = end_time - 24 * 60 * 60
+ afe = frontend.AFE(server=None)
+ pools = (lab_inventory.CRITICAL_POOLS
+ if arguments.pool == _ALL_CRITICAL_POOLS
+ else [arguments.pool])
+ board_info = specify_balance_args(afe, arguments, pools)
+ try:
+ parallel.RunTasksInProcessPool(balancer, board_info, processes=8)
+ except KeyboardInterrupt:
+ pass
+ metrics.Flush()
if __name__ == '__main__':