blob: dd842cba29a5ffa8d156f3199fd3839471bb8a3a [file] [log] [blame]
Alex Miller12ad87e2014-01-22 11:19:46 -08001#!/usr/bin/env python
2
Alex Millerc38b20e2014-01-22 14:41:01 -08003import collections
Alex Miller12ad87e2014-01-22 11:19:46 -08004import multiprocessing.pool
Alex Millerc38b20e2014-01-22 14:41:01 -08005import subprocess
6import sys
Alex Miller12ad87e2014-01-22 11:19:46 -08007
8import common
9from autotest_lib.server import frontend
10from autotest_lib.server import utils
11
12
# Names of the AFE servers that main() queries, one after another, for
# servo-labelled hosts whose status is 'Repair Failed'.
SERVERS = ['cautotest', 'cautotest-cq']
14
15
def ssh_command(host, cmd):
    """Build the argv list that runs `cmd` as root on `host` over ssh.

    The options disable password authentication and host-key checking, allow
    a single connection attempt with a one second connect timeout, and run
    ssh in quiet mode.

    Args:
        host: Hostname to connect to; 'root@' is prepended to it.
        cmd: Remote command line, passed to ssh as one argument after '--'.

    Returns:
        A list of strings suitable for the subprocess functions.
    """
    options = [
        '-o PasswordAuthentication=no',
        '-o ConnectTimeout=1',
        '-o ConnectionAttempts=1',
        '-o StrictHostKeyChecking=no',
        '-q',
    ]
    target = 'root@' + host
    return ['ssh'] + options + [target, '--', cmd]
20
21
class Result(object):
    """Mutable record of the health checks performed on one servo host.

    Attributes are plain values that the checking code fills in as it goes:
    host   -- hostname that was examined
    ping   -- result of the ping check
    ssh    -- result of the ssh check
    servod -- result of the servod process check
    logs   -- captured tail of the servod log
    """

    def __init__(self, host, ping, ssh, servod, logs):
        self.host, self.ping, self.ssh = host, ping, ssh
        self.servod, self.logs = servod, logs
29
30
def log_result(result):
    """Print a human-readable summary of one host's check results.

    Reporting stops at the first failed stage: a host that does not ping is
    reported as PING = DOWN only, and a host that pings but refuses ssh is
    reported as SSH = DOWN without any servod or log details.

    Args:
        result: Object with host, ping, ssh, servod and logs attributes.
    """
    print("Examining %s ..." % result.host)

    if not result.ping:
        print(" PING = DOWN\n")
        return
    print(" PING = UP")

    if not result.ssh:
        print(" SSH = DOWN\n")
        return
    print(" SSH = UP")

    servod_state = 'UP' if result.servod else 'DOWN'
    print(" SERVOD = %s" % (servod_state,))
    print(" LOGS = \n%s" % (result.logs,))
48
49
def check_servo(servo):
    """Probe one servo host and return a Result describing what was found.

    The checks run in order and stop early: if the ping fails only `ping`
    is set, and if a trivial ssh command fails only `ping` and `ssh` are
    set. Otherwise `servod` records whether `pgrep servod` printed anything
    and `logs` holds the output of `tail /var/log/servod.log` (an empty
    string when that command fails).

    Args:
        servo: Hostname of the servo host to examine.

    Returns:
        A Result whose unreached fields are left as None.
    """
    def remote_output(cmd):
        # Output of `cmd` run on the servo over ssh, or None when the
        # command exits with a non-zero status.
        try:
            return subprocess.check_output(ssh_command(servo, cmd))
        except subprocess.CalledProcessError:
            return None

    status = Result(servo, None, None, None, None)

    status.ping = (utils.ping(servo, tries=5, deadline=5) == 0)
    if not status.ping:
        return status

    status.ssh = remote_output("true") is not None
    if not status.ssh:
        return status

    pgrep_out = remote_output("pgrep servod")
    status.servod = pgrep_out is not None and pgrep_out != ""

    log_tail = remote_output("tail /var/log/servod.log")
    status.logs = "" if log_tail is None else log_tail

    return status
81
Alex Miller12ad87e2014-01-22 11:19:46 -080082
def redeploy_hdctools(host):
    """Run the hdctools beaglebone deploy script on `host` over ssh.

    The remote command's stderr is merged into its stdout; the captured
    output is discarded.

    Args:
        host: Hostname to run the deploy script on.

    Returns:
        True if the command exited with status zero, False otherwise.
    """
    deploy_argv = ssh_command(
            host, "/home/chromeos-test/hdctools/beaglebone/deploy")
    try:
        subprocess.check_output(deploy_argv, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError:
        return False
    return True
92
93
def install_package(package):
    """Return a repair function that installs `package` and starts servod.

    The returned callable takes a hostname, runs `apt-get install <package>`
    followed by `start servod` on it over ssh, and returns True only if both
    commands exit with status zero. Its __name__ is set to
    'install_package(<package>)' so callers can report which fix was used.

    Args:
        package: Name of the package to install on the remote host.

    Returns:
        A one-argument function mapping a hostname to True/False.
    """
    def installer(host):
        remote_steps = ("apt-get install %s" % package, "start servod")
        try:
            # The second step is skipped if the first one fails.
            for step in remote_steps:
                subprocess.check_output(ssh_command(host, step),
                                        stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            return False
        return True

    installer.__name__ = "install_package(%s)" % package
    return installer
109
110
def manual_intervention(reason):
    """Return a placeholder repair function for problems needing a human.

    The returned callable ignores its argument and always returns False.
    Its __name__ is set to 'MANUAL(<reason>)' so the reason shows up wherever
    the repair method's name is reported.

    Args:
        reason: Short description of why manual work is required.

    Returns:
        A one-argument function that always returns False.
    """
    def needs_human(_host):
        return False

    needs_human.__name__ = 'MANUAL(%s)' % reason
    return needs_human
117
118
# Outcome of one attempted repair: the host it was run against, the name of
# the repair function that was chosen, and the value that function returned.
Fix = collections.namedtuple('Fix', ['host', 'method', 'success'])
120
121# I don't know if these failures are one-time or repeating, so I'm adding code
122# here for now. If these are seen and fixed by this frequently, then this code
123# should be moved into servo_host's repair()
def diagnose_failure(r):
    """Pick a repair for a checked host, run it, and report the outcome.

    The diagnosis is based on the host's reachability and on known error
    messages in its servod log. When several conditions match, the one
    tested first below takes precedence.

    Args:
        r: Check result with host, ping, ssh and logs attributes.

    Returns:
        A Fix naming the repair that was run and whether it succeeded, or
        None when no known failure was recognised.
    """
    def logged(text):
        # True when there is log output and it contains `text`.
        return bool(r.logs) and text in r.logs

    if logged('discover_servo - No servos found'):
        method = manual_intervention("beaglebone doesn't see servo")
    elif logged('No usb device connected to servo'):
        method = manual_intervention("servo doesn't see USB drive")
    elif not (r.ping and r.ssh):
        method = manual_intervention('servo is unresponsive on network')
    elif logged('ImportError: No module named serial'):
        method = install_package('python-serial')
    elif logged('ImportError: Entry point'):
        method = redeploy_hdctools
    else:
        return None

    return Fix(r.host, method.__name__, method(r.host))
146
147
148def main():
Alex Miller12ad87e2014-01-22 11:19:46 -0800149 pool = multiprocessing.pool.ThreadPool()
Alex Millerc38b20e2014-01-22 14:41:01 -0800150 all_results = []
151
152 for server in SERVERS:
153 afe = frontend.AFE(server=server)
154 hosts = afe.run('get_hosts', multiple_labels=['servo'],
155 status='Repair Failed')
156 servos = [h['hostname']+'-servo.cros' for h in hosts]
157
158 results = pool.imap_unordered(check_servo, servos)
159 for result in results:
160 log_result(result)
161 all_results.append(result)
162
163 # fix 'em if you can?
164 fixes = filter(None, pool.imap_unordered(diagnose_failure, all_results))
165 for fix in fixes:
166 print ("Fixing %(host)s via %(method)s resulted in %(success)s" %
167 dict(host=fix.host, method=fix.method,
168 success='SUCCESS' if fix.success else 'FAILURE'))
169 return 0
170
# When executed as a script, run main() and use its return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())