Chris Ching | df9a8ae | 2017-05-10 00:46:01 -0600 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright 2017 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """Standalone service to monitor AFE servers and report to ts_mon""" |
| 7 | import sys |
| 8 | import time |
| 9 | import multiprocessing |
| 10 | import urllib2 |
| 11 | |
| 12 | import common |
| 13 | from autotest_lib.client.common_lib import global_config |
| 14 | from autotest_lib.frontend.afe.json_rpc import proxy |
| 15 | from autotest_lib.server import frontend |
| 16 | from chromite.lib import commandline |
| 17 | from chromite.lib import cros_logging as logging |
| 18 | from chromite.lib import metrics |
| 19 | from chromite.lib import ts_mon_config |
| 20 | |
# Common prefix shared by every metric name defined below.
METRIC_ROOT = 'chromeos/autotest/blackbox/afe_rpc'
# Name given to the SecondsTimer that wraps each rpc call in
# AfeMonitor.run_cmd.
METRIC_RPC_CALL_DURATIONS = METRIC_ROOT + '/rpc_call_durations'
# Counter incremented once per completed pass of
# RpcFlightRecorder.poll_servers.
METRIC_TICK = METRIC_ROOT + '/tick'
# Counter incremented by afe_rpc_call when AfeMonitor.run raises.
METRIC_MONITOR_ERROR = METRIC_ROOT + '/afe_monitor_error'

# Maps exception types to the string stored in the 'failure_reason' metric
# field. AfeMonitor.run_cmd looks exceptions up here by their exact type and
# re-raises any exception whose type is not a key.
FAILURE_REASONS = {
    proxy.JSONRPCException: 'JSONRPCException',
}
| 29 | |
| 30 | |
def afe_rpc_call(hostname):
    """Run one round of monitoring rpc calls against a single server.

    An Exception raised by the monitor's run() is counted on the
    METRIC_MONITOR_ERROR counter (tagged with the hostname) and logged; it is
    not re-raised. Constructing the monitor happens outside the try block, so
    errors from the constructor are not handled here.

    @param hostname: server's hostname to poll
    """
    monitor = AfeMonitor(hostname)
    try:
        monitor.run()
    except Exception as err:
        error_counter = metrics.Counter(METRIC_MONITOR_ERROR)
        error_counter.increment(fields={'target_hostname': hostname})
        logging.exception(err)
Chris Ching | df9a8ae | 2017-05-10 00:46:01 -0600 | [diff] [blame] | 43 | |
| 44 | |
class RpcFlightRecorder(object):
    """Monitors a list of AFE"""

    def __init__(self, servers, poll_period=60):
        """
        @param servers: list of afe services to monitor; duplicates are
                        dropped because the names are stored as a set
        @param poll_period: frequency to poll all services, in seconds
        """
        self._servers = set(servers)
        self._poll_period = poll_period
        self._pool = multiprocessing.Pool(processes=20)


    def poll_servers(self):
        """Blocking function that polls all servers, forever.

        Each pass maps afe_rpc_call over every server through the process
        pool, increments the METRIC_TICK counter, then sleeps for whatever
        remains of the poll period. This method never returns.
        """
        while True:
            start_time = time.time()
            # Pass the argument to the logger instead of pre-formatting with
            # '%', so the message is only built when debug logging is on.
            logging.debug('Starting Server Polling: %s',
                          ', '.join(self._servers))

            self._pool.map(afe_rpc_call, self._servers)

            logging.debug('Finished Server Polling')

            metrics.Counter(METRIC_TICK).increment()

            # If the pass took longer than the poll period, wait_time is not
            # positive and the next pass starts immediately.
            wait_time = (start_time + self._poll_period) - time.time()
            if wait_time > 0:
                time.sleep(wait_time)
| 73 | |
Chris Ching | 43fdebc | 2017-06-15 11:05:38 -0600 | [diff] [blame^] | 74 | def _failed(fields, msg_str, reason, err=None): |
| 75 | """Mark current run failed |
| 76 | |
| 77 | @param fields, ts_mon fields to mark as failed |
| 78 | @param msg_str, message string to be filled |
| 79 | @param reason: why it failed |
| 80 | @param err: optional error to log more debug info |
| 81 | """ |
| 82 | fields['success'] = False |
| 83 | fields['failure_reason'] = reason |
| 84 | logging.warning("%s failed - %s", msg_str, reason) |
| 85 | if err: |
| 86 | logging.debug("%s fail_err - %s", msg_str, str(err)) |
Chris Ching | df9a8ae | 2017-05-10 00:46:01 -0600 | [diff] [blame] | 87 | |
class AfeMonitor(object):
    """Issues monitoring rpc calls to one afe frontend and records metrics"""

    def __init__(self, hostname):
        """
        @param hostname: hostname of server to monitor, string
        """
        self._hostname = hostname
        self._afe = frontend.AFE(server=hostname)
        self._metric_fields = {'target_hostname': hostname}


    def run_cmd(self, cmd, expected=None):
        """Send one rpc command and record its duration and outcome.

        The call is wrapped in a SecondsTimer named METRIC_RPC_CALL_DURATIONS.
        A failure is recorded through _failed() when the response differs from
        expected, when an HTTPError is raised, or when any other Exception is
        raised. Exceptions whose exact type is not a key of FAILURE_REASONS
        are re-raised after being recorded.

        @param cmd: string of rpc command to send
        @param expected: expected result of rpc; None disables the comparison
        """
        timer_fields = dict(self._metric_fields,
                            command=cmd,
                            success=True,
                            failure_reason='')

        # NOTE(review): 'f' is whatever SecondsTimer yields; the code below
        # treats it as the mutable fields mapping for this measurement.
        with metrics.SecondsTimer(METRIC_RPC_CALL_DURATIONS,
                                  fields=timer_fields) as f:
            msg_str = "%s:%s" % (self._hostname, cmd)

            try:
                result = self._afe.run(cmd)
                logging.debug("%s result = %s", msg_str, result)
                mismatch = expected is not None and expected != result
                if mismatch:
                    _failed(f, msg_str, 'IncorrectResponse')

            except urllib2.HTTPError as e:
                http_reason = 'HTTPError:%d' % e.code
                _failed(f, msg_str, http_reason)

            except Exception as e:
                # The lookup uses the exact exception type, so a subclass of
                # a listed type is reported as 'Unknown' and re-raised.
                err_type = type(e)
                reason = FAILURE_REASONS.get(err_type, 'Unknown')
                _failed(f, msg_str, reason, err=e)

                if err_type not in FAILURE_REASONS:
                    raise

            if f['success']:
                logging.info("%s success", msg_str)


    def run(self):
        """Run the fixed set of rpc checks against the server.

        Sends 'get_server_time' with no expected value, then 'ping_db'
        expecting [True].
        """
        self.run_cmd('get_server_time')
        self.run_cmd('ping_db', [True])
Chris Ching | df9a8ae | 2017-05-10 00:46:01 -0600 | [diff] [blame] | 141 | |
| 142 | |
def get_parser():
    """Build the command line parser for this service.

    The parser accepts -a/--afe (repeatable, collected into a list) and
    -p/--poll-period (int, default 60).

    @returns: the configured commandline.ArgumentParser
    """
    arg_specs = (
        (('-a', '--afe'),
         dict(action='append', default=[],
              help='Autotest FrontEnd server to monitor')),
        (('-p', '--poll-period'),
         dict(type=int, default=60,
              help='Frequency to poll AFE servers')),
    )

    parser = commandline.ArgumentParser(description=__doc__)
    for flags, kwargs in arg_specs:
        parser.add_argument(*flags, **kwargs)
    return parser
| 153 | |
| 154 | |
def main(argv):
    """Parse the command line and poll the selected AFE servers forever.

    When no --afe option is given, the single server named by the
    'global_afe_hostname' value of the 'SERVER' config section is used,
    with 'cautotest' as the default.

    @param argv: commandline arguments passed; argv[0] is skipped
    """
    options = get_parser().parse_args(argv[1:])

    # The config lookup only happens when no server was given explicitly.
    servers = options.afe or [
        global_config.global_config.get_config_value(
            'SERVER', 'global_afe_hostname', default='cautotest')]

    with ts_mon_config.SetupTsMonGlobalState('rpc_flight_recorder',
                                             indirect=True):
        recorder = RpcFlightRecorder(servers,
                                     poll_period=options.poll_period)
        recorder.poll_servers()
| 173 | |
| 174 | |
# Run the monitor only when executed as a script; main() receives the full
# argv and skips the program name itself.
if __name__ == '__main__':
    main(sys.argv)