blob: 5c3130095a779ae18bed28d71fe68ca058951501 [file] [log] [blame]
Alex Miller6faf3f12013-11-11 10:50:54 -08001#!/usr/bin/python
2
3"""
Poll server-status on cautotest to watch for RPCs taking longer than 10s. For
each one, ssh around to figure out the command line of the process that
caused the RPC, so that one can track down what is generating the expensive
RPC load.
8"""
9
10try:
11 from bs4 import BeautifulSoup
12except ImportError:
13 print 'Run `apt-get install python-bs4`'
14 raise
15
16import time
17import subprocess
18import multiprocessing
19
20import common
21import requests
22
23
def check_cautotest():
    """Scrape cautotest's server-status page for long-running requests.

    Fetches http://cautotest/server-status and walks the rows of the first
    table on the page.

    Returns:
        A list of (cols[1], cols[3], cols[5]) string tuples, one per row whose
        fourth cell is 'W', whose sixth cell is an integer greater than 10 and
        whose second cell is not '-'. Presumably these cells are the Apache
        scoreboard's PID, mode and seconds-since-request columns -- confirm
        against the server-status layout in use. An empty list is returned
        when the page contains no table.
    """
    page = requests.get('http://cautotest/server-status').text
    soup = BeautifulSoup(page)
    table = soup.table
    if table is None:
        # E.g. an error page came back; treat it as "nothing slow right now"
        # instead of crashing the polling loop with an AttributeError.
        return []

    pids = []
    for row in table.findAll('tr'):
        cols = [x.text.strip() for x in row.findAll('td')]
        # Rows without <td> cells yield an empty list; any row too short to
        # carry the cells indexed below is skipped as well.
        if len(cols) < 6:
            continue
        if cols[3] != 'W' or cols[1] == '-':
            continue
        try:
            seconds = int(cols[5])
        except ValueError:
            # A non-numeric cell should not take the whole monitor down.
            continue
        if seconds > 10:
            pids.append((cols[1], cols[3], cols[5]))
    return pids
35
def pull_cautotest_info(proc_id):
    """Look up the remote end of an established connection on cautotest.

    Runs `sudo lsof -i` on cautotest through the `become` command, filtered
    with grep for `proc_id` and for ESTABLISHED, then parses the first
    matching line.

    Args:
        proc_id: Value interpolated into the grep pattern; the caller passes
            the process id taken from the server-status row.

    Returns:
        The list produced by splitting the text after '->' in the ninth
        whitespace-separated field on ':' (the caller unpacks it as
        host, port), or None if the command fails or the output cannot be
        parsed.
    """
    try:
        command = ('become chromeos-test@cautotest -- '
                   '"sudo lsof -i | grep -e %s | grep -e ESTABLISHED"'
                   % proc_id)
        output = subprocess.check_output(command, shell=True)
        # Ninth field looks like "<local>-><remote>"; keep the remote half.
        name_field = output.split()[8]
        remote_end = name_field.split('->')[1]
        return remote_end.split(':')
    except Exception:
        # Best effort: any failure simply means "no remote info".
        return None
45
def strace_cautotest(proc_id):
    """Capture the first 20 lines of strace output for a cautotest process.

    Runs `sudo strace -s 500 -p <proc_id>` on cautotest through the `become`
    command, with stderr merged into stdout and piped through `head -n 20`.

    Args:
        proc_id: Process id to attach strace to.

    Returns:
        The command's output, or an empty string if the command exits with a
        non-zero status.
    """
    command = ('become chromeos-test@cautotest -- '
               '"sudo strace -s 500 -p %s 2>&1 | head -n 20"' % proc_id)
    try:
        return subprocess.check_output(command, shell=True)
    except subprocess.CalledProcessError:
        return ""
54
def pull_drone_info(host, port):
    """Find the command line of the process on `host` that is using `port`.

    Runs `sudo lsof -i` on `host` through the `become` command, filtered with
    grep for ":<port>" and ESTABLISHED, takes the second whitespace-separated
    field of the output as a process id, and reads /proc/<pid>/cmdline on
    that host.

    Args:
        host: Machine to inspect.
        port: Port number to search the lsof output for.

    Returns:
        The raw contents of the process's cmdline file, or an empty string if
        any step fails.
    """
    try:
        lsof_command = ('become chromeos-test@%s -- '
                        '"sudo lsof -i | grep -e :%s | grep -e ESTABLISHED"'
                        % (host, port))
        lsof_output = subprocess.check_output(lsof_command, shell=True)
        # The process id is taken from the second field of the first match.
        remote_pid = lsof_output.split()[1]
        cat_command = ('become chromeos-test@%s -- '
                       '"cat /proc/%s/cmdline"' % (host, remote_pid))
        return subprocess.check_output(cat_command, shell=True)
    except Exception:
        # Best effort: report an empty command line on any failure.
        return ''
66
def pull_all_data(pid, queue):
    """Gather everything known about one slow request and enqueue it.

    Used as the target of a worker process. Exactly one item is put on
    `queue`: a result tuple on success, or None if anything raised.

    Args:
        pid: Tuple as produced by check_cautotest(); its first element is the
            process id to investigate.
        queue: Queue on which to put
            (pid, remote_info, drone_info, straced).
    """
    try:
        server_pid = pid[0]
        remote_info = pull_cautotest_info(server_pid)
        # Only chase the remote end when the lookup found one.
        drone_info = pull_drone_info(*remote_info) if remote_info else None
        straced = strace_cautotest(server_pid)
        result = (pid, remote_info, drone_info, straced)
        queue.put(result)
    except Exception:
        # Always answer so the consumer is not left waiting on this worker.
        queue.put(None)
78
def print_data(x):
    """Print a report for one slow request and ring the terminal bell.

    Uses the single-argument call form of print, which produces the same
    output under Python 2's print statement and is also valid on Python 3.

    Args:
        x: A (pid, remote_info, drone_info, straced) tuple as queued by
            pull_all_data(); `pid` must itself be a 3-tuple, since it fills
            the three %s slots of the header line.
    """
    (pid, remote_info, drone_info, straced) = x
    print("*** %s stuck in %s for %s secs" % pid)
    print(remote_info)
    print(drone_info)
    print(straced)
    # ASCII BEL: audible alert so a slow RPC gets noticed.
    print('\a')
86
def main():
    """Poll cautotest forever, reporting every slow request found per pass.

    Each pass starts one worker process per entry returned by
    check_cautotest(), prints the results as they arrive, tears the workers
    down, and then sleeps for 5 seconds.
    """
    while True:
        queue = multiprocessing.Queue()
        processes = []
        for pid in check_cautotest():
            proc = multiprocessing.Process(target=pull_all_data,
                                           args=(pid, queue))
            proc.start()
            processes.append(proc)
        # Every worker puts exactly one item (a result tuple or None), so
        # read as many items as there are workers.
        for _ in processes:
            x = queue.get()
            if x:
                print_data(x)
        for proc in processes:
            proc.terminate()
            # Reap the child so terminated workers do not linger as zombies.
            proc.join()
        time.sleep(5)


# Guard the entry point: multiprocessing may re-import this module in child
# processes, and an unguarded top-level loop would then run in every child.
if __name__ == '__main__':
    main()