blob: bda878d3968b6286d2d84acdc0b8daae47874a83 [file] [log] [blame]
Chris Chingdf9a8ae2017-05-10 00:46:01 -06001#!/usr/bin/env python
2# Copyright 2017 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Standalone service to monitor AFE servers and report to ts_mon"""
7import sys
8import time
9import multiprocessing
10import urllib2
11
12import common
13from autotest_lib.client.common_lib import global_config
14from autotest_lib.frontend.afe.json_rpc import proxy
15from autotest_lib.server import frontend
16from chromite.lib import commandline
17from chromite.lib import cros_logging as logging
18from chromite.lib import metrics
19from chromite.lib import ts_mon_config
20
# Common ts_mon path prefix for every metric reported by this service.
METRIC_ROOT = 'chromeos/autotest/blackbox/afe_rpc'
# Timer metric: duration of each individual rpc call (see AfeMonitor.run_cmd).
METRIC_RPC_CALL_DURATIONS = '/'.join((METRIC_ROOT, 'rpc_call_durations'))
# Counter metric: incremented once per completed polling round.
METRIC_TICK = '/'.join((METRIC_ROOT, 'tick'))
# Counter metric: incremented when a monitor run ends with an exception.
METRIC_MONITOR_ERROR = '/'.join((METRIC_ROOT, 'afe_monitor_error'))
25
# Exception types that AfeMonitor.run_cmd records as a failed call without
# re-raising, mapped to the 'failure_reason' label reported for them.
# run_cmd looks entries up by exact type (type(e)), so subclasses of a listed
# exception are not matched; unmatched exceptions are labelled 'Unknown' and
# re-raised.
FAILURE_REASONS = {proxy.JSONRPCException: 'JSONRPCException'}
29
30
def afe_rpc_call(hostname):
    """Perform one rpc call set on server

    Used as the multiprocessing pool target in RpcFlightRecorder. Any
    exception raised while building or running the monitor is counted in
    METRIC_MONITOR_ERROR and logged with its traceback; it is never propagated
    to the caller.

    @param hostname: server's hostname to poll
    """
    try:
        # Construct the monitor inside the try block: an exception escaping
        # this function would be re-raised by Pool.map in the parent process
        # and end the polling loop without being counted as a monitor error.
        AfeMonitor(hostname).run()
    except Exception as e:
        metrics.Counter(METRIC_MONITOR_ERROR).increment(
                fields={'target_hostname': hostname})
        logging.exception(e)
Chris Chingdf9a8ae2017-05-10 00:46:01 -060043
44
class RpcFlightRecorder(object):
    """Monitors a list of AFE servers by polling them on a fixed period

    Every polling round hands one afe_rpc_call per server to a
    multiprocessing pool and then increments the METRIC_TICK counter.
    """

    def __init__(self, servers, poll_period=60):
        """
        @param servers: list of afe services to monitor
        @param poll_period: frequency to poll all services, in seconds
        """
        # Stored as a set, so a hostname given more than once is polled only
        # once per round.
        self._servers = set(servers)
        self._poll_period = poll_period
        self._pool = multiprocessing.Pool(processes=20)


    def poll_servers(self):
        """Blocking function that polls every configured server

        Loops forever: this method only exits if an exception is raised.
        """
        while True:
            start_time = time.time()
            # Pass the server list as a logging argument so the message is
            # only formatted when debug logging is enabled.
            logging.debug('Starting Server Polling: %s',
                          ', '.join(self._servers))

            # Blocks until every server in this round has been polled.
            self._pool.map(afe_rpc_call, self._servers)

            logging.debug('Finished Server Polling')

            metrics.Counter(METRIC_TICK).increment()

            # Sleep only for what is left of the period; if the round took
            # longer than poll_period the next round starts immediately.
            wait_time = (start_time + self._poll_period) - time.time()
            if wait_time > 0:
                time.sleep(wait_time)
73
Chris Ching43fdebc2017-06-15 11:05:38 -060074def _failed(fields, msg_str, reason, err=None):
75 """Mark current run failed
76
77 @param fields, ts_mon fields to mark as failed
78 @param msg_str, message string to be filled
79 @param reason: why it failed
80 @param err: optional error to log more debug info
81 """
82 fields['success'] = False
83 fields['failure_reason'] = reason
84 logging.warning("%s failed - %s", msg_str, reason)
85 if err:
86 logging.debug("%s fail_err - %s", msg_str, str(err))
Chris Chingdf9a8ae2017-05-10 00:46:01 -060087
class AfeMonitor(object):
    """Issues health-check rpc calls to one AFE frontend and records metrics"""

    def __init__(self, hostname):
        """
        @param hostname: hostname of server to monitor, string
        """
        self._hostname = hostname
        self._afe = frontend.AFE(server=hostname)
        # Base metric fields shared by every call made through this monitor.
        self._metric_fields = {'target_hostname': hostname}


    def run_cmd(self, cmd, expected=None):
        """Send one rpc command, timing it and recording success or failure

        The call is wrapped in a METRIC_RPC_CALL_DURATIONS timer. A result
        that differs from a non-None expected value, an HTTPError, or an
        exception whose exact type is listed in FAILURE_REASONS marks the
        call as failed. Any other exception is marked 'Unknown' and
        re-raised.

        @param cmd: string of rpc command to send
        @param expected: when not None, the value the rpc result must equal
        """
        call_fields = dict(self._metric_fields)
        call_fields.update(command=cmd, success=True, failure_reason='')

        timer = metrics.SecondsTimer(METRIC_RPC_CALL_DURATIONS,
                                     fields=dict(call_fields))
        # live_fields is whatever the timer yields; _failed updates it in
        # place. Presumably these are the fields reported with the measured
        # duration -- confirm against chromite.lib.metrics.
        with timer as live_fields:
            label = "%s:%s" % (self._hostname, cmd)

            try:
                response = self._afe.run(cmd)
                logging.debug("%s result = %s", label, response)
                mismatch = expected is not None and expected != response
                if mismatch:
                    _failed(live_fields, label, 'IncorrectResponse')

            except urllib2.HTTPError as http_err:
                _failed(live_fields, label, 'HTTPError:%d' % http_err.code)

            except Exception as exc:
                exc_type = type(exc)
                _failed(live_fields, label,
                        FAILURE_REASONS.get(exc_type, 'Unknown'), err=exc)

                # Only exception types listed in FAILURE_REASONS are
                # swallowed; everything else goes back to the caller.
                if exc_type not in FAILURE_REASONS:
                    raise

            if live_fields['success']:
                logging.info("%s success", label)


    def run(self):
        """Run the fixed set of health-check rpcs against the server

        Returns nothing; each outcome is reported by run_cmd.
        """
        # (rpc command, expected result); None means the result is not checked.
        checks = (
            ('get_server_time', None),
            ('ping_db', [True]),
        )
        for rpc_name, expected_result in checks:
            self.run_cmd(rpc_name, expected_result)
Chris Chingdf9a8ae2017-05-10 00:46:01 -0600141
142
def get_parser():
    """Build the command line parser for this service

    @returns: a commandline.ArgumentParser with the repeatable -a/--afe
              option and the integer -p/--poll-period option registered
    """
    arg_parser = commandline.ArgumentParser(description=__doc__)

    # (flags, add_argument keyword settings) for every supported option.
    option_specs = (
        (('-a', '--afe'),
         dict(action='append', default=[],
              help='Autotest FrontEnd server to monitor')),
        (('-p', '--poll-period'),
         dict(type=int, default=60,
              help='Frequency to poll AFE servers')),
    )
    for flags, settings in option_specs:
        arg_parser.add_argument(*flags, **settings)

    return arg_parser
153
154
def main(argv):
    """Parse the command line and poll the selected AFE servers

    When no --afe option is given, the 'global_afe_hostname' value of the
    'SERVER' config section is used, falling back to 'cautotest'. Polling
    runs inside the ts_mon global state context and loops until an
    exception is raised.

    @param argv: commandline arguments passed, program name at index 0
    """
    options = get_parser().parse_args(argv[1:])

    afe_hosts = options.afe
    if not afe_hosts:
        afe_hosts = [global_config.global_config.get_config_value(
                'SERVER', 'global_afe_hostname', default='cautotest')]

    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
            'rpc_flight_recorder', indirect=True)
    with ts_mon_state:
        recorder = RpcFlightRecorder(afe_hosts,
                                     poll_period=options.poll_period)
        recorder.poll_servers()
173
174
# Script entry point: hand the full argv to main(), which skips argv[0].
if __name__ == '__main__':
    main(sys.argv)