Torne (Richard Coles) | 2a99a7e | 2013-03-28 15:31:22 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # |
| 3 | # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 4 | # Use of this source code is governed by a BSD-style license that can be |
| 5 | # found in the LICENSE file. |
| 6 | |
| 7 | """Helper script to shard build bot steps and save results to disk. |
| 8 | |
| 9 | Our buildbot infrastructure requires each slave to run steps serially. |
| 10 | This is sub-optimal for android, where these steps can run independently on |
| 11 | multiple connected devices. |
| 12 | |
| 13 | The buildbots will run this script multiple times per cycle: |
- First: all steps listed in the -s file will be executed in parallel using all
| 15 | connected devices. Step results will be pickled to disk. Each step has a unique |
| 16 | name. The result code will be ignored if the step name is listed in |
| 17 | --flaky_steps. |
| 18 | The buildbot will treat this step as a regular step, and will not process any |
| 19 | graph data. |
| 20 | |
| 21 | - Then, with -p STEP_NAME: at this stage, we'll simply print the file with the |
| 22 | step results previously saved. The buildbot will then process the graph data |
| 23 | accordingly. |
| 24 | |
| 25 | The JSON steps file contains a dictionary in the format: |
| 26 | { |
| 27 | "step_name_foo": "script_to_execute foo", |
| 28 | "step_name_bar": "script_to_execute bar" |
| 29 | } |
| 30 | |
The JSON flaky steps file contains a list of step names whose results should
| 32 | be ignored: |
| 33 | [ |
| 34 | "step_name_foo", |
| 35 | "step_name_bar" |
| 36 | ] |
| 37 | |
Note that script_to_execute must accept at least the following
| 39 | options: |
| 40 | --device: the serial number to be passed to all adb commands. |
| 41 | --keep_test_server_ports: indicates it's being run as a shard, and shouldn't |
| 42 | reset test server port allocation. |
| 43 | """ |
| 44 | |
| 45 | |
| 46 | import datetime |
| 47 | import json |
| 48 | import logging |
| 49 | import multiprocessing |
| 50 | import optparse |
| 51 | import pexpect |
| 52 | import pickle |
| 53 | import os |
| 54 | import signal |
| 55 | import shutil |
| 56 | import sys |
Ben Murdoch | bb1529c | 2013-08-08 10:24:53 +0100 | [diff] [blame^] | 57 | import time |
Torne (Richard Coles) | 2a99a7e | 2013-03-28 15:31:22 +0000 | [diff] [blame] | 58 | |
| 59 | from pylib import android_commands |
| 60 | from pylib import cmd_helper |
| 61 | from pylib import constants |
Ben Murdoch | 7dbb3d5 | 2013-07-17 14:55:54 +0100 | [diff] [blame] | 62 | from pylib import forwarder |
Torne (Richard Coles) | 2a99a7e | 2013-03-28 15:31:22 +0000 | [diff] [blame] | 63 | from pylib import ports |
| 64 | |
| 65 | |
# Directory holding one pickled result file per step, named after the step.
# It is wiped and recreated at the start of every sharded run.
_OUTPUT_DIR = os.path.join(constants.DIR_SOURCE_ROOT, 'out', 'step_results')
Torne (Richard Coles) | 2a99a7e | 2013-03-28 15:31:22 +0000 | [diff] [blame] | 67 | |
| 68 | |
def _SaveResult(result):
  """Pickles a step result into _OUTPUT_DIR, in a file named after the step.

  Args:
    result: A dict describing a finished step; its 'name' entry is used as
        the file name.
  """
  result_path = os.path.join(_OUTPUT_DIR, result['name'])
  with open(result_path, 'w') as result_file:
    pickle.dump(result, result_file)
| 72 | |
| 73 | |
def _RunStepsPerDevice(steps):
  """Runs the given steps one after another and saves each result to disk.

  Args:
    steps: A list of dicts with the keys 'name', 'cmd', 'device' and
        'is_flaky', as built by _RunShardedSteps.

  Returns:
    A list with one result dict per step (keys 'name', 'output', 'exit_code',
    'total_time' and 'device'), in execution order.
  """
  collected = []
  for step in steps:
    name = step['name']
    cmd = step['cmd']
    device = step['device']
    flaky = step['is_flaky']

    began = datetime.datetime.now()
    print('Starting %s: %s %s at %s' % (name, cmd, began, device))
    output, exit_code = pexpect.run(
        cmd, cwd=os.path.abspath(constants.DIR_SOURCE_ROOT),
        withexitstatus=True, logfile=sys.stdout, timeout=1800,
        env=os.environ)
    # NOTE(review): a falsy exit status (including None) is normalised to 0.
    # Presumably pexpect reports None when the command timed out or was
    # killed by a signal, which would make such steps look successful --
    # confirm whether that is intended.
    if not exit_code:
      exit_code = 0
    ended = datetime.datetime.now()

    if flaky:
      flaky_note = '(ignored, flaky step)'
    else:
      flaky_note = ''
    exit_msg = '%s %s' % (exit_code, flaky_note)
    print('Finished %s: %s %s %s at %s' % (name, exit_msg, cmd, ended, device))

    # The real exit code is only logged above for flaky steps; the stored
    # result always reports success for them.
    if flaky:
      exit_code = 0

    elapsed = ended - began
    result = {
        'name': name,
        'output': output,
        'exit_code': exit_code,
        'total_time': elapsed.seconds,
        'device': device,
    }
    _SaveResult(result)
    collected.append(result)
  return collected
| 100 | |
| 101 | |
def _RunShardedSteps(steps, flaky_steps, devices):
  """Splits the steps across devices, runs the shards in parallel, and prints
  a summary.

  Steps are assigned to devices in sorted-name order, in contiguous chunks of
  ceil(len(steps) / len(devices)), so that for a given set of steps and
  devices each step always lands on the same device.

  Args:
    steps: A dict mapping step name to the command line to execute.
    flaky_steps: A collection of step names whose exit code is ignored.
    devices: A list of device serial numbers; one worker process is started
        per device.

  Returns:
    Always 0. Per-step exit codes are stored with each saved result and are
    reported later by _PrintStepOutput.
  """
  assert steps
  assert devices, 'No devices connected?'
  if os.path.exists(_OUTPUT_DIR):
    # Sanity check before recursively deleting the directory.
    assert '/step_results' in _OUTPUT_DIR
    shutil.rmtree(_OUTPUT_DIR)
  if not os.path.exists(_OUTPUT_DIR):
    os.makedirs(_OUTPUT_DIR)

  # Shard over the sorted names rather than raw dict order, so the
  # step-to-device assignment is deterministic.
  step_names = sorted(steps.keys())
  num_devices = len(devices)
  # Ceiling division; '//' keeps the result an integer.
  shard_size = (len(step_names) + num_devices - 1) // num_devices
  all_params = []
  for i, device in enumerate(devices):
    shard = step_names[i * shard_size:(i + 1) * shard_size]
    all_params.append([{'name': s,
                        'device': device,
                        'is_flaky': s in flaky_steps,
                        'cmd': steps[s] + ' --device ' + device +
                               ' --keep_test_server_ports'}
                       for s in shard])

  print('Start sharding (note: output is not synchronized...)')
  print('*' * 80)
  start_time = datetime.datetime.now()
  pool = multiprocessing.Pool(processes=num_devices)
  async_results = pool.map_async(_RunStepsPerDevice, all_params)
  # A (very large) explicit timeout is kept from the original code instead of
  # a bare get().
  results_per_device = async_results.get(999999)
  # All work has been collected: release the worker processes.
  pool.close()
  pool.join()
  end_time = datetime.datetime.now()
  print('*' * 80)
  print('Finished sharding.')
  print('Summary')
  total_time = 0
  for results in results_per_device:
    for result in results:
      print('%s : exit_code=%d in %d secs at %s' %
            (result['name'], result['exit_code'], result['total_time'],
             result['device']))
      total_time += result['total_time']
  print('Step time: %d secs' % ((end_time - start_time).seconds))
  print('Bots time: %d secs' % total_time)
  # No exit_code for the sharding step: the individual _PrintResults step
  # will return the corresponding exit_code.
  return 0
| 145 | |
| 146 | |
def _PrintStepOutput(step_name):
  """Prints the saved output of a previously executed step.

  Args:
    step_name: Name of the step; also the name of its result file inside
        _OUTPUT_DIR.

  Returns:
    The exit code stored with the step's result, or 1 when no result file
    exists for the step.
  """
  result_path = os.path.join(_OUTPUT_DIR, step_name)
  if not os.path.exists(result_path):
    print('%s %s' % ('File not found ', result_path))
    return 1
  with open(result_path, 'r') as result_file:
    pickled = result_file.read()
  # NOTE: unpickling executes whatever the file encodes; this is only
  # appropriate because the file is expected to come from _SaveResult.
  saved = pickle.loads(pickled)
  print(saved['output'])
  return saved['exit_code']
| 156 | |
| 157 | |
def _PrintAllStepsOutput(steps):
  """Prints the saved output of every step listed in a JSON steps file.

  Args:
    steps: Path to the JSON steps file; its keys are the step names.

  Returns:
    The bitwise OR of the exit codes returned for the individual steps, so
    the result is non-zero if any step's exit code is non-zero.
  """
  with open(steps, 'r') as steps_file:
    step_map = json.load(steps_file)
  combined_exit_code = 0
  for name in step_map:
    combined_exit_code |= _PrintStepOutput(name)
  return combined_exit_code
| 165 | |
| 166 | |
def _KillPendingServers():
  """Kills leftover test servers, restarts adb, and waits for a device.

  Scans five times for processes whose command line matches 'lighttpd' or
  'web-page-replay' (via pgrep -f) and sends each of them SIGQUIT. Failures
  to kill a process are logged and otherwise ignored. The adb server is then
  restarted with ADB_TRACE=all and 'adb root' is issued, after which the
  function waits, with increasing sleeps of 1, 2, 4 and 8 seconds, until at
  least one device is attached or the sleeps are used up.
  """
  server_names = ['lighttpd', 'web-page-replay']
  for _ in range(5):
    for server in server_names:
      pgrep_output = cmd_helper.GetCmdOutput(['pgrep', '-f', server])
      stripped = (line.strip() for line in pgrep_output.split('\n'))
      for pid in [entry for entry in stripped if entry]:
        try:
          logging.warning('Killing %s %s', server, pid)
          os.kill(int(pid), signal.SIGQUIT)
        except Exception as e:
          logging.warning('Failed killing %s %s %s', server, pid, e)

  # Restart the adb server with full tracing enabled.
  # NOTE(review): the original comment also mentioned redirecting stderr to
  # stdout so the tracing does not confuse higher layers; nothing here does
  # that -- presumably it happens elsewhere, confirm.
  os.environ['ADB_TRACE'] = 'all'
  for adb_args in (['kill-server'], ['start-server'], ['root']):
    cmd_helper.RunCmd(['adb'] + adb_args)

  # Check for devices before each sleep; give up after the 8 second sleep.
  for delay in (1, 2, 4, 8):
    if android_commands.GetAttachedDevices():
      break
    time.sleep(delay)
Torne (Richard Coles) | 2a99a7e | 2013-03-28 15:31:22 +0000 | [diff] [blame] | 190 | |
| 191 | |
def main(argv):
  """Entry point: either prints saved step results or runs the sharded steps.

  With -p/--print_results or -P/--print_all, only previously saved results
  are printed. Otherwise leftover servers are killed, adb is restarted, the
  test server port allocation is reset, and the steps from --steps are
  sharded across all attached devices.

  Args:
    argv: The full argument vector, including the program name.

  Returns:
    The process exit code: the stored exit code(s) in print mode, 1 when no
    device is attached, otherwise the result of _RunShardedSteps.

  Raises:
    Exception: If the test server port allocation cannot be reset.
    SystemExit: Via the option parser, when --steps is missing in run mode.
  """
  parser = optparse.OptionParser()
  parser.add_option('-s', '--steps',
                    help='A JSON file containing all the steps to be '
                         'sharded.')
  parser.add_option('--flaky_steps',
                    help='A JSON file containing steps that are flaky and '
                         'will have its exit code ignored.')
  parser.add_option('-p', '--print_results',
                    help='Only prints the results for the previously '
                         'executed step, do not run it again.')
  parser.add_option('-P', '--print_all',
                    help='Only prints the results for the previously '
                         'executed steps, do not run them again.')
  options, _ = parser.parse_args(argv)
  if options.print_results:
    return _PrintStepOutput(options.print_results)
  if options.print_all:
    return _PrintAllStepsOutput(options.print_all)

  # Fail fast, before killing servers and restarting adb, instead of
  # crashing later when trying to open a missing steps file path.
  if not options.steps:
    parser.error('--steps is required unless --print_results or --print_all '
                 'is given.')

  # At this point, we should kill everything that may have been left over from
  # previous runs.
  _KillPendingServers()

  forwarder.Forwarder.UseMultiprocessing()

  # Reset the test port allocation. It's important to do it before starting
  # to dispatch any step.
  if not ports.ResetTestServerPortAllocation():
    raise Exception('Failed to reset test server port.')

  # Sort the devices so that we'll try to always run a step in the same device.
  devices = sorted(android_commands.GetAttachedDevices())
  if not devices:
    print('You must attach a device')
    return 1

  with open(options.steps, 'r') as f:
    steps = json.load(f)
  flaky_steps = []
  if options.flaky_steps:
    with open(options.flaky_steps, 'r') as f:
      flaky_steps = json.load(f)
  return _RunShardedSteps(steps, flaky_steps, devices)
| 236 | |
| 237 | |
# Run main() with the full argument vector and use its return value as the
# process exit status.
if __name__ == '__main__':
  exit_status = main(sys.argv)
  sys.exit(exit_status)