blob: 086a3a13e3154c9858ba6ff6178e72ffedf9f88d [file] [log] [blame]
Torne (Richard Coles)2a99a7e2013-03-28 15:31:22 +00001#!/usr/bin/env python
2#
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Helper script to shard build bot steps and save results to disk.
8
9Our buildbot infrastructure requires each slave to run steps serially.
10This is sub-optimal for android, where these steps can run independently on
11multiple connected devices.
12
13The buildbots will run this script multiple times per cycle:
- First: all steps listed in the -s file will be executed in parallel using all
15connected devices. Step results will be pickled to disk. Each step has a unique
16name. The result code will be ignored if the step name is listed in
17--flaky_steps.
18The buildbot will treat this step as a regular step, and will not process any
19graph data.
20
21- Then, with -p STEP_NAME: at this stage, we'll simply print the file with the
22step results previously saved. The buildbot will then process the graph data
23accordingly.
24
25The JSON steps file contains a dictionary in the format:
26{
27 "step_name_foo": "script_to_execute foo",
28 "step_name_bar": "script_to_execute bar"
29}
30
The JSON flaky steps file contains a list of step names whose results should
be ignored:
33[
34 "step_name_foo",
35 "step_name_bar"
36]
37
Note that script_to_execute must accept at least the following
options:
40 --device: the serial number to be passed to all adb commands.
41 --keep_test_server_ports: indicates it's being run as a shard, and shouldn't
42 reset test server port allocation.
43"""
44
45
46import datetime
47import json
48import logging
49import multiprocessing
50import optparse
51import pexpect
52import pickle
53import os
54import signal
55import shutil
56import sys
Ben Murdochbb1529c2013-08-08 10:24:53 +010057import time
Torne (Richard Coles)2a99a7e2013-03-28 15:31:22 +000058
59from pylib import android_commands
60from pylib import cmd_helper
61from pylib import constants
Ben Murdoch7dbb3d52013-07-17 14:55:54 +010062from pylib import forwarder
Torne (Richard Coles)2a99a7e2013-03-28 15:31:22 +000063from pylib import ports
64
65
# Directory holding the pickled step results, one file per step name.
_OUTPUT_DIR = os.path.join(
    constants.DIR_SOURCE_ROOT, 'out', 'step_results')
Torne (Richard Coles)2a99a7e2013-03-28 15:31:22 +000067
68
def _SaveResult(result):
  """Pickles a step result into _OUTPUT_DIR, in a file named after the step.

  Args:
    result: dict describing a finished step; its 'name' entry is used as the
        file name.
  """
  result_path = os.path.join(_OUTPUT_DIR, result['name'])
  with open(result_path, 'w') as result_file:
    pickle.dump(result, result_file)
72
73
def _RunStepsPerDevice(steps):
  """Runs the given steps one after another and saves each result to disk.

  Args:
    steps: list of dicts, each holding the keys 'name', 'cmd', 'device' and
        'is_flaky'.

  Returns:
    A list with one result dict per step, holding 'name', 'output',
    'exit_code', 'total_time' (in seconds) and 'device'.
  """
  results = []
  for step in steps:
    start_time = datetime.datetime.now()
    print('Starting %s: %s %s at %s' % (step['name'], step['cmd'],
                                        start_time, step['device']))
    output, exit_code = pexpect.run(
        step['cmd'], cwd=os.path.abspath(constants.DIR_SOURCE_ROOT),
        withexitstatus=True, logfile=sys.stdout, timeout=1800,
        env=os.environ)
    # pexpect hands back None instead of an integer status when the command
    # did not exit on its own (for instance when it was killed after the
    # timeout expired). That must count as a failure, not as a clean exit.
    if exit_code is None:
      exit_code = -1
    end_time = datetime.datetime.now()
    exit_msg = '%s %s' % (exit_code,
                          '(ignored, flaky step)' if step['is_flaky'] else '')
    print('Finished %s: %s %s %s at %s' % (step['name'], exit_msg, step['cmd'],
                                           end_time, step['device']))
    # Flaky steps are reported above with their real status, but are always
    # recorded as successful.
    if step['is_flaky']:
      exit_code = 0
    result = {'name': step['name'],
              'output': output,
              'exit_code': exit_code,
              'total_time': (end_time - start_time).seconds,
              'device': step['device']}
    _SaveResult(result)
    results.append(result)
  return results
100
101
def _ShardSteps(steps, flaky_steps, devices):
  """Builds, for each device, the list of step parameters it should run.

  Step names are sorted before being split into contiguous chunks, so the same
  inputs always yield the same step-to-device assignment.

  Args:
    steps: dict mapping step name to the command to execute.
    flaky_steps: collection of step names whose exit code is to be ignored.
    devices: non-empty list of device serial numbers.

  Returns:
    A list with one entry per device (in the order of |devices|); each entry
    is a list of dicts with the keys 'name', 'device', 'is_flaky' and 'cmd'.
  """
  step_names = sorted(steps)
  num_devices = len(devices)
  # Ceiling division, so that every step is assigned to some device.
  shard_size = (len(step_names) + num_devices - 1) // num_devices
  all_params = []
  for i, device in enumerate(devices):
    shard = step_names[i * shard_size:(i + 1) * shard_size]
    all_params.append([{'name': name,
                        'device': device,
                        'is_flaky': name in flaky_steps,
                        'cmd': steps[name] + ' --device ' + device +
                               ' --keep_test_server_ports'}
                       for name in shard])
  return all_params


def _RunShardedSteps(steps, flaky_steps, devices):
  """Runs all steps in parallel, one worker process per device.

  Any previous contents of _OUTPUT_DIR are removed first; each step result is
  then saved there by the workers. A summary is printed once all workers are
  done.

  Args:
    steps: non-empty dict mapping step name to the command to execute.
    flaky_steps: collection of step names whose exit code is to be ignored.
    devices: non-empty list of device serial numbers.

  Returns:
    Always 0; see the comment above the return statement.
  """
  assert steps
  assert devices, 'No devices connected?'
  if os.path.exists(_OUTPUT_DIR):
    # Guard against wiping an unexpected directory.
    assert '/step_results' in _OUTPUT_DIR
    shutil.rmtree(_OUTPUT_DIR)
  os.makedirs(_OUTPUT_DIR)
  all_params = _ShardSteps(steps, flaky_steps, devices)
  print('Start sharding (note: output is not synchronized...)')
  print('*' * 80)
  start_time = datetime.datetime.now()
  pool = multiprocessing.Pool(processes=len(devices))
  async_results = pool.map_async(_RunStepsPerDevice, all_params)
  # NOTE(review): the huge timeout is presumably there so that the wait stays
  # interruptible on Python 2 -- confirm before simplifying to pool.map().
  results_per_device = async_results.get(999999)
  end_time = datetime.datetime.now()
  print('*' * 80)
  print('Finished sharding.')
  print('Summary')
  total_time = 0
  for results in results_per_device:
    for result in results:
      print('%s : exit_code=%d in %d secs at %s' %
            (result['name'], result['exit_code'], result['total_time'],
             result['device']))
      total_time += result['total_time']
  print('Step time: %d secs' % ((end_time - start_time).seconds))
  print('Bots time: %d secs' % total_time)
  # No exit_code for the sharding step: the later per-step invocations that
  # print the saved results (-p / --print_results) return the corresponding
  # exit_code.
  return 0
145
146
def _PrintStepOutput(step_name):
  """Prints the saved output of a step and returns its saved exit code.

  Args:
    step_name: name of the step, which is also the name of its result file
        inside _OUTPUT_DIR.

  Returns:
    The 'exit_code' stored in the result file, or 1 when there is no such file.
  """
  result_path = os.path.join(_OUTPUT_DIR, step_name)
  if os.path.exists(result_path):
    # The result file is a pickle; it is expected to have been produced by an
    # earlier run of this same script.
    with open(result_path, 'r') as result_file:
      saved = pickle.load(result_file)
    print(saved['output'])
    return saved['exit_code']
  print('%s %s' % ('File not found ', result_path))
  return 1
156
157
def _PrintAllStepsOutput(steps):
  """Prints the saved output of every step listed in a JSON steps file.

  Args:
    steps: path to a JSON file whose top-level keys are the step names.

  Returns:
    The bitwise OR of the values returned for the individual steps, so it is 0
    only if every one of them returned 0.
  """
  with open(steps, 'r') as steps_file:
    steps_by_name = json.load(steps_file)
  combined_code = 0
  for name in steps_by_name:
    combined_code = combined_code | _PrintStepOutput(name)
  return combined_code
165
166
def _KillPendingServers():
  """Kills leftover server processes, then restarts adb and waits for devices.

  Processes matching 'lighttpd' or 'web-page-replay' are sent SIGQUIT, over
  five passes. Failures to kill a process are logged and otherwise ignored.
  """
  for _ in range(5):
    for server in ['lighttpd', 'web-page-replay']:
      pgrep_output = cmd_helper.GetCmdOutput(['pgrep', '-f', server])
      for line in pgrep_output.split('\n'):
        pid = line.strip()
        if not pid:
          continue
        try:
          logging.warning('Killing %s %s', server, pid)
          os.kill(int(pid), signal.SIGQUIT)
        except Exception as e:
          # Best effort only: report the failure and carry on.
          logging.warning('Failed killing %s %s %s', server, pid, e)
  # Restart the adb server with full tracing enabled.
  os.environ['ADB_TRACE'] = 'all'
  for adb_command in ('kill-server', 'start-server', 'root'):
    cmd_helper.RunCmd(['adb', adb_command])
  # Poll for attached devices, sleeping 1, 2, 4 and then 8 seconds between
  # checks; give up silently after the last sleep.
  for delay in (1, 2, 4, 8):
    if android_commands.GetAttachedDevices():
      break
    time.sleep(delay)
Torne (Richard Coles)2a99a7e2013-03-28 15:31:22 +0000190
191
def main(argv):
  """Parses the command line and either prints saved results or runs the steps.

  With -p or -P only previously saved results are printed. Otherwise leftover
  servers are killed, test server port allocation is reset and the steps from
  the --steps file are run, sharded over the attached devices.

  Args:
    argv: the full command line, as in sys.argv.

  Returns:
    The exit code for the process.

  Raises:
    Exception: if the test server port allocation cannot be reset.
  """
  parser = optparse.OptionParser()
  parser.add_option('-s', '--steps',
                    help='A JSON file containing all the steps to be '
                         'sharded.')
  parser.add_option('--flaky_steps',
                    help='A JSON file containing steps that are flaky and '
                         'will have its exit code ignored.')
  parser.add_option('-p', '--print_results',
                    help='Only prints the results for the previously '
                         'executed step, do not run it again.')
  parser.add_option('-P', '--print_all',
                    help='Only prints the results for the previously '
                         'executed steps, do not run them again.')
  options, _ = parser.parse_args(argv)
  if options.print_results:
    return _PrintStepOutput(options.print_results)
  if options.print_all:
    return _PrintAllStepsOutput(options.print_all)

  # Fail fast, before touching any server or device state, instead of crashing
  # later on when trying to open a missing steps file name.
  if not options.steps:
    parser.error('--steps is required unless -p or -P is given.')

  # At this point, we should kill everything that may have been left over from
  # previous runs.
  _KillPendingServers()

  forwarder.Forwarder.UseMultiprocessing()

  # Reset the test port allocation. It's important to do it before starting
  # to dispatch any step.
  if not ports.ResetTestServerPortAllocation():
    raise Exception('Failed to reset test server port.')

  # Sort the devices so that we'll try to always run a step in the same device.
  devices = sorted(android_commands.GetAttachedDevices())
  if not devices:
    print('You must attach a device')
    return 1

  with open(options.steps, 'r') as f:
    steps = json.load(f)
  flaky_steps = []
  if options.flaky_steps:
    with open(options.flaky_steps, 'r') as f:
      flaky_steps = json.load(f)
  return _RunShardedSteps(steps, flaky_steps, devices)
236
237
if __name__ == '__main__':
  # Use the value returned by main() as the process exit status.
  exit_status = main(sys.argv)
  sys.exit(exit_status)