Blame - build/android/bb_run_sharded_steps.py - fp2-dev/platform/external/chromium_org

blob: 086a3a13e3154c9858ba6ff6178e72ffedf9f88d [file] [log] [blame]

Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	1	#!/usr/bin/env python
				2	#
				3	# Copyright (c) 2012 The Chromium Authors. All rights reserved.
				4	# Use of this source code is governed by a BSD-style license that can be
				5	# found in the LICENSE file.
				6
				7	"""Helper script to shard build bot steps and save results to disk.
				8
				9	Our buildbot infrastructure requires each slave to run steps serially.
				10	This is sub-optimal for android, where these steps can run independently on
				11	multiple connected devices.
				12
				13	The buildbots will run this script multiple times per cycle:
				14	- First: all steps listed in -s will be executed in parallel using all
				15	connected devices. Step results will be pickled to disk. Each step has a unique
				16	name. The result code will be ignored if the step name is listed in
				17	--flaky_steps.
				18	The buildbot will treat this step as a regular step, and will not process any
				19	graph data.
				20
				21	- Then, with -p STEP_NAME: at this stage, we'll simply print the file with the
				22	step results previously saved. The buildbot will then process the graph data
				23	accordingly.
				24
				25	The JSON steps file contains a dictionary in the format:
				26	{
				27	"step_name_foo": "script_to_execute foo",
				28	"step_name_bar": "script_to_execute bar"
				29	}
				30
				31	The JSON flaky steps file contains a list of step names whose results should
				32	be ignored:
				33	[
				34	"step_name_foo",
				35	"step_name_bar"
				36	]
				37
				38	Note that script_to_execute must accept at least the following
				39	options:
				40	--device: the serial number to be passed to all adb commands.
				41	--keep_test_server_ports: indicates it's being run as a shard, and shouldn't
				42	reset test server port allocation.
				43	"""
				44
				45
				46	import datetime
				47	import json
				48	import logging
				49	import multiprocessing
				50	import optparse
				51	import pexpect
				52	import pickle
				53	import os
				54	import signal
				55	import shutil
				56	import sys
Ben Murdoch	bb1529c	2013-08-08 10:24:53 +0100	[diff] [blame^]	57	import time
Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	58
				59	from pylib import android_commands
				60	from pylib import cmd_helper
				61	from pylib import constants
Ben Murdoch	7dbb3d5	2013-07-17 14:55:54 +0100	[diff] [blame]	62	from pylib import forwarder
Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	63	from pylib import ports
				64
				65
# Directory holding one pickled result file per step, named after the step.
_OUTPUT_DIR = os.path.join(
    constants.DIR_SOURCE_ROOT, 'out', 'step_results')
Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	67
				68
				69	def _SaveResult(result):
				70	with file(os.path.join(_OUTPUT_DIR, result['name']), 'w') as f:
				71	f.write(pickle.dumps(result))
				72
				73
				74	def _RunStepsPerDevice(steps):
				75	results = []
				76	for step in steps:
				77	start_time = datetime.datetime.now()
				78	print 'Starting %s: %s %s at %s' % (step['name'], step['cmd'],
				79	start_time, step['device'])
				80	output, exit_code = pexpect.run(
Torne (Richard Coles)	868fa2f	2013-06-11 10:57:03 +0100	[diff] [blame]	81	step['cmd'], cwd=os.path.abspath(constants.DIR_SOURCE_ROOT),
Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	82	withexitstatus=True, logfile=sys.stdout, timeout=1800,
				83	env=os.environ)
				84	exit_code = exit_code or 0
				85	end_time = datetime.datetime.now()
				86	exit_msg = '%s %s' % (exit_code,
				87	'(ignored, flaky step)' if step['is_flaky'] else '')
				88	print 'Finished %s: %s %s %s at %s' % (step['name'], exit_msg, step['cmd'],
				89	end_time, step['device'])
				90	if step['is_flaky']:
				91	exit_code = 0
				92	result = {'name': step['name'],
				93	'output': output,
				94	'exit_code': exit_code,
				95	'total_time': (end_time - start_time).seconds,
				96	'device': step['device']}
				97	_SaveResult(result)
				98	results += [result]
				99	return results
				100
				101
				102	def _RunShardedSteps(steps, flaky_steps, devices):
				103	assert steps
				104	assert devices, 'No devices connected?'
				105	if os.path.exists(_OUTPUT_DIR):
				106	assert '/step_results' in _OUTPUT_DIR
				107	shutil.rmtree(_OUTPUT_DIR)
				108	if not os.path.exists(_OUTPUT_DIR):
				109	os.makedirs(_OUTPUT_DIR)
				110	step_names = sorted(steps.keys())
				111	all_params = []
				112	num_devices = len(devices)
				113	shard_size = (len(steps) + num_devices - 1) / num_devices
				114	for i, device in enumerate(devices):
				115	steps_per_device = []
				116	for s in steps.keys()[i * shard_size:(i + 1) * shard_size]:
				117	steps_per_device += [{'name': s,
				118	'device': device,
				119	'is_flaky': s in flaky_steps,
				120	'cmd': steps[s] + ' --device ' + device +
				121	' --keep_test_server_ports'}]
				122	all_params += [steps_per_device]
				123	print 'Start sharding (note: output is not synchronized...)'
				124	print '' 80
				125	start_time = datetime.datetime.now()
				126	pool = multiprocessing.Pool(processes=num_devices)
				127	async_results = pool.map_async(_RunStepsPerDevice, all_params)
				128	results_per_device = async_results.get(999999)
				129	end_time = datetime.datetime.now()
				130	print '' 80
				131	print 'Finished sharding.'
				132	print 'Summary'
				133	total_time = 0
				134	for results in results_per_device:
				135	for result in results:
				136	print('%s : exit_code=%d in %d secs at %s' %
				137	(result['name'], result['exit_code'], result['total_time'],
				138	result['device']))
				139	total_time += result['total_time']
				140	print 'Step time: %d secs' % ((end_time - start_time).seconds)
				141	print 'Bots time: %d secs' % total_time
				142	# No exit_code for the sharding step: the individual _PrintResults step
				143	# will return the corresponding exit_code.
				144	return 0
				145
				146
				147	def _PrintStepOutput(step_name):
				148	file_name = os.path.join(_OUTPUT_DIR, step_name)
				149	if not os.path.exists(file_name):
				150	print 'File not found ', file_name
				151	return 1
				152	with file(file_name, 'r') as f:
				153	result = pickle.loads(f.read())
				154	print result['output']
				155	return result['exit_code']
				156
				157
def _PrintAllStepsOutput(steps):
  """Prints the saved output of every step named in a JSON steps file.

  Args:
    steps: Path to a JSON file whose top-level keys are the step names.

  Returns:
    The bitwise OR of the exit codes of all the steps, which is 0 only when
    every step reported 0.
  """
  with open(steps, 'r') as f:
    steps_by_name = json.load(f)
  ret = 0
  for step_name in steps_by_name:
    ret |= _PrintStepOutput(step_name)
  return ret
				165
				166
def _KillPendingServers():
  """Kills leftover helper servers, restarts adb and waits for a device.

  Processes matching 'lighttpd' or 'web-page-replay' are sent SIGQUIT over
  five passes; failures to kill are logged and otherwise ignored. The adb
  server is then restarted with tracing enabled, and the function waits with
  increasing delays until a device shows up or the delays are used up.
  """
  # NOTE(review): the restart below is assumed to run once, after all the
  # kill passes, rather than once per pass - confirm against the original
  # indentation.
  for _ in range(5):
    for server in ('lighttpd', 'web-page-replay'):
      pgrep_output = cmd_helper.GetCmdOutput(['pgrep', '-f', server])
      for line in pgrep_output.split('\n'):
        pid = line.strip()
        if not pid:
          continue
        try:
          logging.warning('Killing %s %s', server, pid)
          os.kill(int(pid), signal.SIGQUIT)
        except Exception as e:
          logging.warning('Failed killing %s %s %s', server, pid, e)

  # Restart the adb server with full trace, and redirect stderr to stdout
  # so the extra tracing won't confuse higher up layers.
  os.environ['ADB_TRACE'] = 'all'
  for adb_command in ('kill-server', 'start-server', 'root'):
    cmd_helper.RunCmd(['adb', adb_command])

  # Poll for devices, sleeping 1, 2, 4 and 8 seconds between checks, then
  # give up silently.
  for delay in (1, 2, 4, 8):
    if android_commands.GetAttachedDevices():
      break
    time.sleep(delay)
Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	190
				191
				192	def main(argv):
				193	parser = optparse.OptionParser()
				194	parser.add_option('-s', '--steps',
				195	help='A JSON file containing all the steps to be '
				196	'sharded.')
				197	parser.add_option('--flaky_steps',
				198	help='A JSON file containing steps that are flaky and '
				199	'will have its exit code ignored.')
				200	parser.add_option('-p', '--print_results',
				201	help='Only prints the results for the previously '
				202	'executed step, do not run it again.')
Ben Murdoch	eb525c5	2013-07-10 11:40:50 +0100	[diff] [blame]	203	parser.add_option('-P', '--print_all',
				204	help='Only prints the results for the previously '
				205	'executed steps, do not run them again.')
Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	206	options, urls = parser.parse_args(argv)
				207	if options.print_results:
				208	return _PrintStepOutput(options.print_results)
Ben Murdoch	eb525c5	2013-07-10 11:40:50 +0100	[diff] [blame]	209	if options.print_all:
				210	return _PrintAllStepsOutput(options.print_all)
Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	211
				212	# At this point, we should kill everything that may have been left over from
				213	# previous runs.
				214	_KillPendingServers()
				215
Torne (Richard Coles)	a36e592	2013-08-05 13:57:33 +0100	[diff] [blame]	216	forwarder.Forwarder.UseMultiprocessing()
				217
Torne (Richard Coles)	2a99a7e	2013-03-28 15:31:22 +0000	[diff] [blame]	218	# Reset the test port allocation. It's important to do it before starting
				219	# to dispatch any step.
				220	if not ports.ResetTestServerPortAllocation():
				221	raise Exception('Failed to reset test server port.')
				222
				223	# Sort the devices so that we'll try to always run a step in the same device.
				224	devices = sorted(android_commands.GetAttachedDevices())
				225	if not devices:
				226	print 'You must attach a device'
				227	return 1
				228
				229	with file(options.steps, 'r') as f:
				230	steps = json.load(f)
				231	flaky_steps = []
				232	if options.flaky_steps:
				233	with file(options.flaky_steps, 'r') as f:
				234	flaky_steps = json.load(f)
				235	return _RunShardedSteps(steps, flaky_steps, devices)
				236
				237
				238	if __name__ == '__main__':
				239	sys.exit(main(sys.argv))