Aviv Keshet | a43072a | 2018-04-16 16:29:42 -0700 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright 2018 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """A simple service to monitor DUT statuses from master db/afe.""" |
| 7 | import collections |
| 8 | import logging |
| 9 | import sys |
| 10 | import time |
| 11 | |
| 12 | import common |
| 13 | from autotest_lib.server import constants |
| 14 | from autotest_lib.server import frontend |
| 15 | from chromite.lib import metrics |
| 16 | from chromite.lib import ts_mon_config |
| 17 | |
| 18 | from infra_libs import ts_mon |
| 19 | |
| 20 | |
# Immutable, hashable key identifying one group of DUTs that are counted
# together: same board, model, pool, lock state and status.
DutCountBucket = collections.namedtuple(
    'DutCountBucket',
    'board model pool is_locked status',
)
| 28 | |
| 29 | |
def _get_bucket_for_host(host):
    """Compute the counting bucket that |host| falls into.

    The board, model and pool are derived from the host's labels. A pool
    listed in constants.Pools.MANAGED_POOLS is reported with a 'managed:'
    prefix, and a falsy host status is reported as '[None]'.

    Args:
        host: A Host object as returned by afe. Its labels, status and
              locked attributes are read.

    Returns:
        A DutCountBucket instance describing the bucket for this host.
    """
    label_prefixes = constants.Labels
    board_name = _get_unique_label(host.labels, label_prefixes.BOARD_PREFIX)
    model_name = _get_unique_label(host.labels, label_prefixes.MODEL_PREFIX)
    pool_name = _get_unique_label(host.labels, label_prefixes.POOL_PREFIX)

    is_managed = pool_name in constants.Pools.MANAGED_POOLS
    reported_pool = 'managed:' + pool_name if is_managed else pool_name

    reported_status = host.status or '[None]'
    locked = host.locked

    return DutCountBucket(
        board=board_name,
        model=model_name,
        pool=reported_pool,
        is_locked=locked,
        status=reported_status,
    )
| 47 | |
| 48 | |
| 49 | def _get_unique_label(labels, prefix): |
| 50 | """Return the labels for a given prefix, with prefix stripped. |
| 51 | |
| 52 | If prefixed label does not occur, return '[None]' |
| 53 | If prefixed label occurs multiply, return '[Multiple]' |
| 54 | |
| 55 | _get_unique_label(['foo:1', 'foo:2', 'bar1'], 'foo:') -> '[Multiple]' |
| 56 | |
| 57 | _get_unique_label(['foo:1', 'bar2', 'baz3'], 'foo:') -> '1' |
| 58 | |
| 59 | _get_prefixed_labels(['bar1', 'baz1'], 'foo:') -> '[None]' |
| 60 | """ |
| 61 | ls = [l[len(prefix):] for l in labels if l.startswith(prefix)] |
| 62 | if not ls: |
| 63 | return '[None]' |
| 64 | elif len(ls) == 1: |
| 65 | return ls[0] |
| 66 | else: |
| 67 | return '[Multiple]' |
| 68 | |
| 69 | |
def main(argv):
    """Entry point for dut_mon.

    Runs forever: on every pass it fetches all hosts from the AFE, groups
    them into DutCountBucket buckets, publishes the per-bucket host count
    through the dut_count gauge, increments the tick counter and then
    sleeps for two minutes.

    Args:
        argv: Command line arguments. Currently unused.
    """
    logging.getLogger().setLevel(logging.INFO)

    with ts_mon_config.SetupTsMonGlobalState('dut_mon', indirect=True):
        afe = frontend.AFE()
        counters = collections.defaultdict(int)

        field_spec = [ts_mon.StringField('board'),
                      ts_mon.StringField('model'),
                      ts_mon.StringField('pool'),
                      ts_mon.BooleanField('is_locked'),
                      ts_mon.StringField('status'),
                      ]
        dut_count = metrics.Gauge('chromeos/autotest/dut_mon/dut_count',
                                  description='The number of duts in a given '
                                              'state and bucket.',
                                  field_spec=field_spec)
        tick_count = metrics.Counter('chromeos/autotest/dut_mon/tick',
                                     description='Tick counter of dut_mon.')

        while True:
            # Note: We reset all counters to zero in each loop rather than
            # creating a new defaultdict, because we want to ensure that any
            # gauges that were previously set to a nonzero value by this
            # process get set back to zero if necessary.
            for bucket in counters:
                counters[bucket] = 0

            logging.info('Fetching all hosts.')
            hosts = afe.get_hosts()
            logging.info('Fetched %s hosts.', len(hosts))
            for host in hosts:
                counters[_get_bucket_for_host(host)] += 1

            # items() works on both Python 2 and 3, unlike iteritems().
            for bucket, count in counters.items():
                logging.info('%s %s', bucket, count)
                # _asdict() is the documented way to turn a namedtuple into
                # a field-name -> value mapping; namedtuple instances do not
                # expose __dict__ on current Python 3 interpreters.
                dut_count.set(count, fields=bucket._asdict())

            tick_count.increment()
            logging.info('Sleeping for 2 minutes.')
            time.sleep(120)
| 113 | |
| 114 | |
# Only start the monitoring loop when executed as a script, so that importing
# this module has no side effects.
if __name__ == '__main__':
    main(sys.argv)