Chris Ching | df9a8ae | 2017-05-10 00:46:01 -0600 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright 2017 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """Standalone service to monitor AFE servers and report to ts_mon""" |
| 7 | import sys |
| 8 | import time |
| 9 | import multiprocessing |
| 10 | import urllib2 |
| 11 | |
| 12 | import common |
| 13 | from autotest_lib.client.common_lib import global_config |
| 14 | from autotest_lib.frontend.afe.json_rpc import proxy |
| 15 | from autotest_lib.server import frontend |
| 16 | from chromite.lib import commandline |
| 17 | from chromite.lib import cros_logging as logging |
| 18 | from chromite.lib import metrics |
| 19 | from chromite.lib import ts_mon_config |
| 20 | |
# ts_mon metric paths; every metric of this service lives under METRIC_ROOT.
METRIC_ROOT = 'chromeos/autotest/blackbox/afe_rpc'
# Timer metric recorded around each rpc call.
METRIC_RPC_CALL_DURATIONS = METRIC_ROOT + '/rpc_call_durations'
# Counter bumped once per completed polling pass over all servers.
METRIC_TICK = METRIC_ROOT + '/tick'
# Counter bumped when monitoring a server raises an exception.
METRIC_MONITOR_ERROR = METRIC_ROOT + '/afe_monitor_error'

# Exception types treated as known rpc failures, mapped to the label stored
# in the 'failure_reason' metric field. Types not listed here are reported
# as 'Unknown' and re-raised by AfeMonitor.run_cmd.
FAILURE_REASONS = {proxy.JSONRPCException: 'JSONRPCException'}
| 29 | |
| 30 | |
def afe_rpc_call(hostname):
    """Run one round of monitoring rpc calls against a single server.

    An exception raised while the monitor runs is counted in the
    monitor-error metric and logged with its traceback; it is not
    re-raised. The monitor itself is constructed outside the try block,
    so a failure during construction propagates to the caller.

    @param hostname: server's hostname to poll
    """
    monitor = AfeMonitor(hostname)
    try:
        monitor.run()
    except Exception as err:
        error_counter = metrics.Counter(METRIC_MONITOR_ERROR)
        error_counter.increment(fields={'target_hostname': hostname})
        logging.exception(err)
Chris Ching | df9a8ae | 2017-05-10 00:46:01 -0600 | [diff] [blame] | 43 | |
| 44 | |
class RpcFlightRecorder(object):
    """Monitors a list of AFE servers by polling each one periodically."""

    def __init__(self, servers, poll_period=60):
        """
        @param servers: list of afe services to monitor
        @param poll_period: frequency to poll all services, in seconds
        """
        # Kept as a set, so a hostname given twice is only polled once.
        self._servers = set(servers)
        self._poll_period = poll_period
        # Worker processes that run the per-server rpc checks.
        self._pool = multiprocessing.Pool(processes=20)


    def poll_servers(self):
        """Blocking function that polls all servers; it never returns.

        Each pass runs afe_rpc_call for every server through the process
        pool, increments the tick counter, then sleeps for whatever is
        left of the poll period.
        """
        while True:
            start_time = time.time()
            # Pass the value as a logging argument (as the rest of this
            # file does) so the logger only interpolates the message when
            # a debug record is actually emitted.
            logging.debug('Starting Server Polling: %s',
                          ', '.join(self._servers))

            self._pool.map(afe_rpc_call, self._servers)

            logging.debug('Finished Server Polling')

            metrics.Counter(METRIC_TICK).increment()

            # If the pass took longer than the poll period, wait_time is
            # not positive and the next pass starts immediately.
            wait_time = (start_time + self._poll_period) - time.time()
            if wait_time > 0:
                time.sleep(wait_time)
| 73 | |
| 74 | |
class AfeMonitor(object):
    """Object that runs rpc calls against the given afe frontend"""

    def __init__(self, hostname):
        """
        @param hostname: hostname of server to monitor, string
        """
        self._hostname = hostname
        self._afe = frontend.AFE(server=self._hostname)
        # Base fields attached to every metric emitted for this server.
        self._metric_fields = {'target_hostname': self._hostname}


    def run_cmd(self, cmd):
        """Runs rpc command and log metrics

        The call is timed with the METRIC_RPC_CALL_DURATIONS timer. The
        timer fields start as the base fields plus 'command',
        'success' (False) and 'failure_reason' (''), and are updated
        according to the outcome of the call.

        @param cmd: string of rpc command to send

        @raises Exception: a failure that is neither a urllib2.HTTPError
                nor one of the types in FAILURE_REASONS is logged and then
                re-raised.
        """
        metric_fields = self._metric_fields.copy()
        metric_fields['command'] = cmd
        metric_fields['success'] = False
        metric_fields['failure_reason'] = ''

        # Give the timer the per-call fields built above. Passing only the
        # base fields would drop 'command' and the failure defaults from
        # the recorded metric.
        with metrics.SecondsTimer(METRIC_RPC_CALL_DURATIONS,
                                  fields=metric_fields) as f:
            try:
                result = self._afe.run(cmd)
                f['success'] = True
                logging.debug("%s:%s:result = %s", self._hostname,
                              cmd, result)
                logging.info("%s:%s:success", self._hostname, cmd)
            except urllib2.HTTPError as e:
                f['failure_reason'] = 'HTTPError:%d' % e.code
                logging.warning("%s:%s:failed - %s", self._hostname, cmd,
                                f['failure_reason'])
            except Exception as e:
                f['failure_reason'] = FAILURE_REASONS.get(type(e), 'Unknown')
                logging.warning("%s:%s:failed - %s",
                                self._hostname,
                                cmd,
                                f['failure_reason'])
                # Only known failure types are swallowed; anything else
                # is propagated to the caller after being recorded.
                if type(e) not in FAILURE_REASONS:
                    raise


    def run(self):
        """Tests the server by issuing the 'get_motd' rpc command"""
        self.run_cmd('get_motd')
| 122 | |
| 123 | |
def get_parser():
    """Build the command line parser for this service.

    @returns: commandline.ArgumentParser accepting the repeatable
              -a/--afe option and the integer -p/--poll-period option
    """
    arg_parser = commandline.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
            '-a', '--afe', action='append', default=[],
            help='Autotest FrontEnd server to monitor')
    arg_parser.add_argument(
            '-p', '--poll-period', type=int, default=60,
            help='Frequency to poll AFE servers')
    return arg_parser
| 134 | |
| 135 | |
def main(argv):
    """Parse the command line, then poll the selected AFE servers.

    @param argv: commandline arguments passed; argv[0] is skipped
    """
    options = get_parser().parse_args(argv[1:])

    # No --afe given: fall back to the hostname from the global config.
    if not options.afe:
        default_afe = global_config.global_config.get_config_value(
                'SERVER', 'global_afe_hostname', default='cautotest')
        options.afe = [default_afe]

    with ts_mon_config.SetupTsMonGlobalState('rpc_flight_recorder',
                                             indirect=True):
        recorder = RpcFlightRecorder(options.afe,
                                     poll_period=options.poll_period)
        recorder.poll_servers()
| 154 | |
| 155 | |
# Script entry point: hand the full argv (including argv[0]) to main().
if __name__ == '__main__':
    main(sys.argv)