blob: 25e8ca8038a6fec2d8050906f9022c2636223669 [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
5
6"""Standalone service to monitor AFE servers and report to ts_mon"""
7import sys
8import time
9import multiprocessing
10import urllib2
11
12import common
13from autotest_lib.client.common_lib import global_config
14from autotest_lib.frontend.afe.json_rpc import proxy
15from autotest_lib.server import frontend
16from chromite.lib import commandline
17from chromite.lib import cros_logging as logging
18from chromite.lib import metrics
19from chromite.lib import ts_mon_config
20
# Common prefix shared by every metric name defined in this module.
METRIC_ROOT = 'chromeos/autotest/blackbox/afe_rpc'

# Name of the timer metric wrapped around each rpc call.
METRIC_RPC_CALL_DURATIONS = METRIC_ROOT + '/rpc_call_durations'

# Name of the counter incremented once per completed polling pass.
METRIC_TICK = METRIC_ROOT + '/tick'

# Name of the counter incremented when a monitor run raises.
METRIC_MONITOR_ERROR = METRIC_ROOT + '/afe_monitor_error'
25
# Maps an exception class to the string reported as the 'failure_reason'
# metric field. Lookups are done on the exact type of the raised exception.
FAILURE_REASONS = {proxy.JSONRPCException: 'JSONRPCException'}
29
30
def afe_rpc_call(hostname):
    """Perform one rpc call set on server

    Any exception raised while building or running the monitor is counted
    under METRIC_MONITOR_ERROR and logged; it is never re-raised.

    @param hostname: server's hostname to poll
    """
    try:
        # Build the monitor inside the try block: this function is mapped
        # over a multiprocessing pool, and an exception escaping from here
        # would be re-raised by Pool.map in the polling process.
        afe_monitor = AfeMonitor(hostname)
        afe_monitor.run()
    except Exception as e:
        metrics.Counter(METRIC_MONITOR_ERROR).increment(
                fields={'target_hostname': hostname})
        logging.exception(e)
Chris Chingdf9a8ae2017-05-10 00:46:01 -060043
44
class RpcFlightRecorder(object):
    """Monitors a list of AFE"""
    def __init__(self, servers, poll_period=60):
        """
        @param servers: list of afe services to monitor
        @param poll_period: frequency to poll all services, in seconds
        """
        # Kept as a set, so a hostname listed twice is polled only once.
        self._servers = set(servers)
        self._poll_period = poll_period
        self._pool = multiprocessing.Pool(processes=20)


    def poll_servers(self):
        """Blocking function that polls all servers and shards

        Loops forever. Each pass maps afe_rpc_call over every server using
        the process pool, increments the tick counter, then sleeps for
        whatever remains of the poll period.
        """
        while True:
            start_time = time.time()
            # Pass the joined names as a logging argument rather than
            # pre-formatting the message with '%'.
            logging.debug('Starting Server Polling: %s',
                          ', '.join(self._servers))

            self._pool.map(afe_rpc_call, self._servers)

            logging.debug('Finished Server Polling')

            metrics.Counter(METRIC_TICK).increment()

            # Sleep only for the part of the period the polling pass did not
            # already consume; skip the sleep if the pass overran it.
            wait_time = (start_time + self._poll_period) - time.time()
            if wait_time > 0:
                time.sleep(wait_time)
73
74
class AfeMonitor(object):
    """Object that runs rpc calls against the given afe frontend"""

    def __init__(self, hostname):
        """
        @param hostname: hostname of server to monitor, string
        """
        self._hostname = hostname
        self._afe = frontend.AFE(server=self._hostname)
        # Base metric fields attached to every metric from this monitor.
        self._metric_fields = {'target_hostname': self._hostname}


    def run_cmd(self, cmd):
        """Runs rpc command and log metrics

        The call is timed under METRIC_RPC_CALL_DURATIONS with the fields
        target_hostname, command, success and failure_reason.

        @param cmd: string of rpc command to send

        @raises Exception: any exception other than urllib2.HTTPError whose
                exact type is not a key of FAILURE_REASONS is re-raised
                after being logged.
        """
        # Start from pessimistic defaults so that every field is present
        # whichever branch below runs.
        metric_fields = self._metric_fields.copy()
        metric_fields['command'] = cmd
        metric_fields['success'] = False
        metric_fields['failure_reason'] = ''

        # Hand the timer the full field set built above (not just the base
        # fields) so 'command' and the defaults are part of the metric.
        # NOTE(review): 'f' is presumably the fields dict the timer reports
        # on exit - confirm against chromite.lib.metrics.SecondsTimer.
        with metrics.SecondsTimer(METRIC_RPC_CALL_DURATIONS,
                                  fields=metric_fields) as f:
            try:
                result = self._afe.run(cmd)
                f['success'] = True
                logging.debug("%s:%s:result = %s", self._hostname,
                              cmd, result)
                logging.info("%s:%s:success", self._hostname, cmd)
            except urllib2.HTTPError as e:
                f['failure_reason'] = 'HTTPError:%d' % e.code
                logging.warning("%s:%s:failed - %s", self._hostname, cmd,
                                f['failure_reason'])
            except Exception as e:
                f['failure_reason'] = FAILURE_REASONS.get(type(e), 'Unknown')
                logging.warning("%s:%s:failed - %s",
                                self._hostname,
                                cmd,
                                f['failure_reason'])
                # Only failure types listed in FAILURE_REASONS are swallowed;
                # anything else is passed on to the caller.
                if type(e) not in FAILURE_REASONS:
                    raise


    def run(self):
        """Runs the rpc checks against the server (currently 'get_motd')"""
        self.run_cmd('get_motd')
122
123
def get_parser():
    """Builds the command line parser for this service

    @returns: parser accepting a repeatable -a/--afe option and an integer
            -p/--poll-period option
    """
    arg_parser = commandline.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
            '-a', '--afe', action='append', default=[],
            help='Autotest FrontEnd server to monitor')
    arg_parser.add_argument(
            '-p', '--poll-period', type=int, default=60,
            help='Frequency to poll AFE servers')
    return arg_parser
134
135
def main(argv):
    """Parses the command line and polls the selected AFE servers

    @param argv: commandline arguments passed; argv[0] is skipped
    """
    options = get_parser().parse_args(argv[1:])

    # With no --afe given, fall back to the configured global AFE hostname.
    servers = options.afe
    if not servers:
        servers = [global_config.global_config.get_config_value(
                'SERVER', 'global_afe_hostname', default='cautotest')]

    with ts_mon_config.SetupTsMonGlobalState('rpc_flight_recorder',
                                             indirect=True):
        recorder = RpcFlightRecorder(servers,
                                     poll_period=options.poll_period)
        recorder.poll_servers()
154
155
# Script entry point: hand the full argument vector to main().
if __name__ == '__main__':
    main(sys.argv)