Blame - site_utils/test_push.py - platform/external/autotest

2013-07-25 15:08:48 -0700

[diff] [blame]

#!/usr/bin/python

#

# Use of this source code is governed by a BSD-style license that can be

5

# found in the LICENSE file.

6

7

"""Tool to validate code in prod branch before pushing to lab.

8

9

The script runs the push_to_prod suite to verify that code in the prod branch is ready to be

10

pushed. Link to design document:

11

https://docs.google.com/a/google.com/document/d/1JMz0xS3fZRSHMpFkkKAL_rxsdbNZomhHbC3B8L71uuI/edit

12

13

To verify whether the prod branch can be pushed to the lab, run the following command on the

Shuqian Zhao

bb030ff

2017-09-21 17:36:13 -0700

[diff] [blame]

14

chromeos-staging-master2.hot server:

Michael Liang

52d9f1f

2014-06-17 15:01:24 -0700

[diff] [blame]

15

/usr/local/autotest/site_utils/test_push.py -e someone@company.com

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

16

Shuqian Zhao

2016-09-21 11:02:15 -0700

[diff] [blame]

17

The script uses latest gandof stable build as test build by default.

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

"""

import argparse

Shuqian Zhao

1f311c0

2016-09-01 19:30:54 -0700

[diff] [blame]

22

import ast

Shuqian Zhao

2016-10-25 13:31:06 -0700

[diff] [blame]

23

from contextlib import contextmanager

Shuqian Zhao

2018-01-31 11:53:34 -0800

[diff] [blame]

24

import datetime

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

25

import getpass

Dan Shi

2015-04-07 17:37:09 -0700

[diff] [blame]

26

import multiprocessing

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

import os

import re

import subprocess

import sys

Dan Shi

2015-04-07 17:37:09 -0700

[diff] [blame]

31

import time

32

import traceback

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

33

import urllib2

34

35

import common

Dan Shi

2014-05-09 15:18:15 -0700

[diff] [blame]

36

try:

37

from autotest_lib.frontend import setup_django_environment

38

from autotest_lib.frontend.afe import models

Shuqian Zhao

2016-09-12 10:42:03 -0700

[diff] [blame]

39

from autotest_lib.frontend.afe import rpc_utils

Dan Shi

2014-05-09 15:18:15 -0700

[diff] [blame]

40

except ImportError:

41

# Unittest may not have Django database configured and will fail to import.

42

pass

Dan Shi

5fa602c

2015-03-26 17:54:13 -0700

[diff] [blame]

43

from autotest_lib.client.common_lib import global_config

Shuqian Zhao

2016-09-12 10:42:03 -0700

[diff] [blame]

44

from autotest_lib.client.common_lib import priorities

Shuqian Zhao

f239b31

2017-12-05 16:45:02 -0800

[diff] [blame]

45

from autotest_lib.client.common_lib.cros import retry

Prathmesh Prabhu

cd246f5

2018-01-03 13:45:48 -0800

[diff] [blame]

46

from autotest_lib.frontend.afe import rpc_client_lib

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

47

from autotest_lib.server import constants

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

48

from autotest_lib.server import site_utils

Shuqian Zhao

2016-09-12 10:42:03 -0700

[diff] [blame]

49

from autotest_lib.server import utils

Dan Shi

2014-12-22 16:25:05 -0800

[diff] [blame]

50

from autotest_lib.server.cros import provision

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

51

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

52

Shuqian Zhao

2017-05-30 12:56:57 -0700

[diff] [blame]

53

try:

54

from chromite.lib import metrics

55

from chromite.lib import ts_mon_config

56

except ImportError:

57

metrics = site_utils.metrics_mock

58

ts_mon_config = site_utils.metrics_mock

59

Shuqian Zhao

2016-10-25 13:31:06 -0700

[diff] [blame]

60

# Root directory of the autotest checkout, as exposed by the `common` module.
AUTOTEST_DIR = common.autotest_dir
# Handle on the global configuration object.
CONFIG = global_config.global_config

# Retrying RPC clients for the AFE and TKO frontends.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.5, delay_sec=2)
TKO = frontend_wrappers.RetryingTKO(timeout_min=0.1, delay_sec=10)

Dan Shi

2016-02-03 11:37:02 -0800

[diff] [blame]

65

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

66

MAIL_FROM = 'chromeos-test@google.com'
# Matches a build version such as R54-8743.25.0. Written as a raw string so
# that the regex escapes are not interpreted as string escapes.
BUILD_REGEX = r'R[\d]+-[\d]+\.[\d]+\.[\d]+'
# Script used to kick off suites; do_run_suite() looks for it in this file's
# own directory.
RUN_SUITE_COMMAND = 'run_suite.py'
PUSH_TO_PROD_SUITE = 'push_to_prod'
DUMMY_SUITE = 'dummy'
# Default value of the --timeout_min option (minutes).
DEFAULT_TIMEOUT_MIN_FOR_SUITE_JOB = 30

Shuqian Zhao

1286166

2016-08-31 19:23:17 -0700

[diff] [blame]

72

# Image storage server, read from the CROS section of the global config.
IMAGE_BUCKET = CONFIG.get_config_value('CROS', 'image_storage_server')
# (board, required DUT count) pairs; parse_arguments() turns this into the
# default dict for --num_duts.
DEFAULT_NUM_DUTS = (
    ('gandof', 4),
    ('quawks', 2),
)

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

77

Fang Deng

6dddf60

2014-04-17 17:01:47 -0700

[diff] [blame]

78

# Matches the run_suite output line that announces the suite job; group 1 is
# the suite job id. Raw strings keep the regex escapes intact.
SUITE_JOB_START_INFO_REGEX = (r'^.*Created suite job:.*'
                              r'tab_id=view_job&object_id=(\d+)$')

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

80

81

# Expected status of each test in the push_to_prod suite. Keys are regular
# expressions that verify_test_results() searches for in the test names.
EXPECTED_TEST_RESULTS = {
    '^SERVER_JOB$': 'GOOD',
    # This is related to dummy_Fail/control.dependency.
    'dummy_Fail.dependency$': 'TEST_NA',
    'login_LoginSuccess.*': 'GOOD',
    'provision_AutoUpdate.double': 'GOOD',
    'dummy_Pass.*': 'GOOD',
    'dummy_Fail.Fail$': 'FAIL',
    'dummy_Fail.RetryFail$': 'FAIL',
    'dummy_Fail.RetrySuccess': 'GOOD',
    'dummy_Fail.Error$': 'ERROR',
    'dummy_Fail.Warn$': 'WARN',
    'dummy_Fail.NAError$': 'TEST_NA',
    'dummy_Fail.Crash$': 'GOOD',
    'autotest_SyncCount$': 'GOOD',
}

97

Jakob Juelich

2014-10-10 14:08:05 -0700

[diff] [blame]

98

# Expected status of each test in the dummy suite, keyed by test name regular
# expression.
EXPECTED_TEST_RESULTS_DUMMY = {
    '^SERVER_JOB$': 'GOOD',
    'dummy_Pass.*': 'GOOD',
    'dummy_Fail.Fail': 'FAIL',
    'dummy_Fail.Warn': 'WARN',
    'dummy_Fail.Crash': 'GOOD',
    'dummy_Fail.Error': 'ERROR',
    'dummy_Fail.NAError': 'TEST_NA',
}

105

Shuqian Zhao

2016-09-12 10:42:03 -0700

[diff] [blame]

106

# Expected results of the powerwash job scheduled by
# powerwash_dut_to_test_repair(), keyed by test name regular expression.
EXPECTED_TEST_RESULTS_POWERWASH = {
    'platform_Powerwash': 'GOOD',
    'SERVER_JOB': 'GOOD',
}

108

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

109

# Server hostname and log URL template, both read from the global config;
# verify_test_results() combines them into the link to a job's logs.
URL_HOST = CONFIG.get_config_value('SERVER', 'hostname', type=str)
URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)

111

Dan Shi

dc9eb17

2014-12-09 16:05:02 -0800

[diff] [blame]

112

# Tests that may legitimately be absent from the results. Every entry must
# state why; entries are dropped from the "missing tests" report.
IGNORE_MISSING_TESTS = [
    # For latest build, npo_test_delta does not exist.
    'autoupdate_EndToEndTest.npo_test_delta.*',
    # For trybot build, nmo_test_delta does not exist.
    'autoupdate_EndToEndTest.nmo_test_delta.*',
    # Older build does not have login_LoginSuccess test in push_to_prod suite.
    # TODO(dshi): Remove following lines after R41 is stable.
    'login_LoginSuccess',
]

122

Aviv Keshet

2017-11-08 13:25:01 -0800

[diff] [blame]

123

# Multiprocessing proxy objects that are used to share data between background

124

# suite-running processes and main process. The multiprocessing-compatible

125

# versions are initialized in _main.

126

_run_suite_output = []

127

_all_suite_ids = []

128

Shuqian Zhao

2016-10-25 13:31:06 -0700

[diff] [blame]

129

# Maps the name of each updated repo to its path on disk.
UPDATED_REPOS = {
    'autotest': AUTOTEST_DIR,
    'chromite': '%s/site-packages/chromite/' % AUTOTEST_DIR,
}
# Account used to run the prod-next push commands.
PUSH_USER = 'chromeos-test-lab'

# Default value of the --service_respawn_limit option.
DEFAULT_SERVICE_RESPAWN_LIMIT = 2

135

136

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

137

class TestPushException(Exception):
    """Raised when the test push (validation of the prod branch) fails."""

140

Shuqian Zhao

f239b31

2017-12-05 16:45:02 -0800

[diff] [blame]

141

@retry.retry(TestPushException, timeout_min=5, delay_sec=30)
def check_dut_inventory(required_num_duts, pool):
    """Check DUT inventory for each board in the pool specified.

    Asks the AFE for unlocked hosts in 'Ready' status, keeps the ones that
    carry the pool label, and compares the per-platform count against
    required_num_duts.

    NOTE(review): the retry decorator presumably re-runs this check while it
    raises TestPushException (timeout_min=5, delay_sec=30) - confirm against
    retry.retry.

    @param required_num_duts: a dict specifying the number of DUT each platform
                              requires in order to finish push tests.
    @param pool: the pool used by test_push.
    @raise TestPushException: if number of DUTs are less than the requirement.
    """
    # Imported here so the function does not rely on a file-level import.
    import collections

    print('Checking DUT inventory...')
    pool_label = constants.Labels.POOL_PREFIX + pool
    hosts = AFE.run('get_hosts', status='Ready', locked=False)
    # One pass over the hosts; the previous code called list.count() once per
    # host, which is quadratic in the number of hosts.
    current_inventory = collections.Counter(
            host['platform'] for host in hosts
            if pool_label in host.get('labels', []))

    shortages = []
    for platform, req_num in required_num_duts.items():
        curr_num = current_inventory.get(platform, 0)
        if curr_num < req_num:
            shortages.append(
                    '\nRequire %d %s DUTs in pool: %s, only %d are Ready'
                    ' now' % (req_num, platform, pool, curr_num))
    if shortages:
        raise TestPushException('Not enough DUTs to run push tests. %s' %
                                ''.join(shortages))

165

Dan Shi

5ba5d2e

2014-05-09 13:47:00 -0700

[diff] [blame]

166

Shuqian Zhao

2016-09-12 10:42:03 -0700

[diff] [blame]

167

def powerwash_dut_to_test_repair(hostname, timeout):
    """Powerwash dut to test repair workflow.

    Schedules the platform_Powerwash server test on the DUT at SUPER
    priority, polls TKO every 10 seconds until test statuses appear for the
    job, checks them against EXPECTED_TEST_RESULTS_POWERWASH and finally asks
    the AFE to reverify the host.

    @param hostname: hostname of the dut.
    @param timeout: seconds of the powerwash test to hit timeout.
    @raise TestPushException: if DUT fail to run the test.
    """
    powerwash_test = models.Test.objects.get(name='platform_Powerwash')
    control_file = utils.read_file(
            os.path.join(common.autotest_dir, powerwash_test.path))
    job_id = rpc_utils.create_job_common(
            'powerwash', priority=priorities.Priority.SUPER,
            control_type='Server', control_file=control_file,
            hosts=[hostname])

    deadline = time.time() + timeout
    while True:
        if TKO.get_job_test_statuses_from_db(job_id):
            break
        if time.time() >= deadline:
            # Out of time: abort the job before reporting the failure.
            AFE.run('abort_host_queue_entries', job=job_id)
            raise TestPushException(
                    'Powerwash test on %s timeout after %ds, abort it.' %
                    (hostname, timeout))
        time.sleep(10)

    verify_test_results(job_id, EXPECTED_TEST_RESULTS_POWERWASH)
    # Kick off verify, verify will fail and a repair should be triggered.
    AFE.reverify_hosts(hostnames=[hostname])

Kevin Cheng

2015-12-11 09:45:57 -0800

[diff] [blame]

191

192

Shuqian Zhao

06deae0

2017-02-28 09:55:59 -0800

[diff] [blame]

193

def reverify_all_push_duts():
    """Ask the AFE to reverify every host it returns from get_hosts()."""
    print('Reverifying all DUTs.')
    hostnames = []
    for host in AFE.get_hosts():
        hostnames.append(host.hostname)
    AFE.reverify_hosts(hostnames=hostnames)

198

199

Richard Barnette

2018-04-20 15:11:54 -0700

[diff] [blame]

200

def parse_arguments(argv):
    """Parse arguments for test_push tool.

    When --build or --shard_build is not given, it is filled in with the
    image name that the AFE stable version map reports for the matching
    board.

    @param argv   Argument vector, as for `sys.argv`, including the
                  command name in `argv[0]`.
    @return: Parsed arguments.

    """
    parser = argparse.ArgumentParser(prog=argv[0])
    parser.add_argument('-b', '--board', dest='board', default='gandof',
                        help='Default is gandof.')
    parser.add_argument('-sb', '--shard_board', dest='shard_board',
                        default='quawks',
                        help='Default is quawks.')
    parser.add_argument('-i', '--build', dest='build', default=None,
                        help='Default is the latest stable build of given '
                             'board. Must be a stable build, otherwise AU test '
                             'will fail. (ex: gandof-release/R54-8743.25.0)')
    parser.add_argument('-si', '--shard_build', dest='shard_build', default=None,
                        help='Default is the latest stable build of given '
                             'board. Must be a stable build, otherwise AU test '
                             'will fail.')
    parser.add_argument('-p', '--pool', dest='pool', default='bvt')
    parser.add_argument('-t', '--timeout_min', dest='timeout_min', type=int,
                        default=DEFAULT_TIMEOUT_MIN_FOR_SUITE_JOB,
                        help='Time in mins to wait before abort the jobs we '
                             'are waiting on. Only for the asynchronous suites '
                             'triggered by create_and_return flag.')
    parser.add_argument('-ud', '--num_duts', dest='num_duts',
                        default=dict(DEFAULT_NUM_DUTS),
                        type=ast.literal_eval,
                        help="Python dict literal that specifies the required"
                        " number of DUTs for each board. E.g {'gandof':4}")
    parser.add_argument('-c', '--continue_on_failure', action='store_true',
                        dest='continue_on_failure',
                        help='All tests continue to run when there is failure')
    parser.add_argument('-sl', '--service_respawn_limit', type=int,
                        default=DEFAULT_SERVICE_RESPAWN_LIMIT,
                        help='If a service crashes more than this, the test '
                             'push is considered failed.')

    arguments = parser.parse_args(argv[1:])

    # Get latest stable build as default build. The version map is only
    # fetched when at least one build actually has to be defaulted.
    if not (arguments.build and arguments.shard_build):
        version_map = AFE.get_stable_version_map(AFE.CROS_IMAGE_TYPE)
        if not arguments.build:
            arguments.build = version_map.get_image_name(arguments.board)
        if not arguments.shard_build:
            arguments.shard_build = version_map.get_image_name(
                    arguments.shard_board)
    return arguments

Shuqian Zhao

2015-08-06 09:46:22 -0700

[diff] [blame]

253

def do_run_suite(suite_name, arguments, use_shard=False,
                 create_and_return=False):
    """Call run_suite to run a suite job, and return the suite job id.

    The script waits the suite job to finish before returning the suite job id.
    Also it will echo the run_suite output to stdout.

    Every echoed line is also appended to _run_suite_output, and the suite
    job id parsed from the output is appended to _all_suite_ids.

    @param suite_name: Name of a suite, e.g., dummy.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.

    @return: Suite job ID.
    @raise TestPushException: if no suite job id could be parsed from the
                              run_suite output, or if an asynchronous suite
                              does not finish within arguments.timeout_min.

    """
    if use_shard:
        board = arguments.shard_board
        build = arguments.shard_build
    else:
        board = arguments.board
        build = arguments.build

    # Remove cros-version label to force provision.
    hosts = AFE.get_hosts(label=constants.Labels.BOARD_PREFIX+board,
                          locked=False)
    for host in hosts:
        labels_to_remove = [
                label for label in host.labels
                if label.startswith(provision.CROS_VERSION_PREFIX)]
        if labels_to_remove:
            AFE.run('host_remove_labels', id=host.id, labels=labels_to_remove)

        # Test repair work flow on shards, powerwash test will timeout after 7m.
        # NOTE(review): kept inside the per-host loop because it uses the loop
        # variable `host` - confirm against the upstream file.
        if use_shard and not create_and_return:
            powerwash_dut_to_test_repair(host.hostname, timeout=420)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    cmd = [os.path.join(current_dir, RUN_SUITE_COMMAND),
           '-s', suite_name,
           '-b', board,
           '-i', build,
           '-p', arguments.pool,
           '--minimum_duts', str(arguments.num_duts[board])]
    if create_and_return:
        cmd += ['-c']

    suite_job_id = None

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    try:
        while True:
            line = proc.stdout.readline()

            # Break when run_suite process completed.
            if not line and proc.poll() is not None:
                break
            print(line.rstrip())
            _run_suite_output.append(line.rstrip())

            if not suite_job_id:
                m = re.match(SUITE_JOB_START_INFO_REGEX, line)
                if m and m.group(1):
                    suite_job_id = int(m.group(1))
                    _all_suite_ids.append(suite_job_id)
    finally:
        # Release the pipe instead of leaving it to garbage collection.
        proc.stdout.close()

    if not suite_job_id:
        raise TestPushException('Failed to retrieve suite job ID.')

    # If create_and_return specified, wait for the suite to finish.
    if create_and_return:
        end = time.time() + arguments.timeout_min * 60
        while not AFE.get_jobs(id=suite_job_id, finished=True):
            if time.time() < end:
                time.sleep(10)
            else:
                AFE.run('abort_host_queue_entries', job=suite_job_id)
                raise TestPushException(
                        'Asynchronous suite triggered by create_and_return '
                        'flag has timed out after %d mins. Aborting it.' %
                        arguments.timeout_min)

    print('Suite job %s is completed.' % suite_job_id)
    return suite_job_id

Dan Shi

2014-05-09 15:18:15 -0700

[diff] [blame]

340

def check_dut_image(build, suite_job_id):
    """Confirm all DUTs used for the suite are imaged to expected build.

    Looks up the child jobs of the suite job, takes the host of the first
    host queue entry of each child job, and compares the build the AFE
    reports for each such host with the expected build.

    @param build: Expected build to be imaged.
    @param suite_job_id: job ID of the suite job.
    @raise TestPushException: If a DUT does not have expected build imaged.
    """
    print('Checking image installed in DUTs...')
    child_jobs = models.Job.objects.filter(parent_job_id=suite_job_id)
    child_job_ids = [child.id for child in child_jobs]
    first_hqes = [models.HostQueueEntry.objects.filter(job_id=child_id)[0]
                  for child_id in child_job_ids]
    hostnames = set()
    for entry in first_hqes:
        hostnames.add(entry.host.hostname)

    for hostname in hostnames:
        found_build = site_utils.get_build_from_afe(hostname, AFE)
        if found_build == build:
            continue
        raise TestPushException('DUT is not imaged properly. Host %s has '
                                'build %s, while build %s is expected.' %
                                (hostname, found_build, build))

Dan Shi

2014-05-09 15:18:15 -0700

[diff] [blame]

359

360

Shuqian Zhao

2015-08-06 09:46:22 -0700

[diff] [blame]

361

def test_suite(suite_name, expected_results, arguments, use_shard=False,
               create_and_return=False):
    """Run a suite through run_suite and check what it produced.

    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    """
    suite_job_id = do_run_suite(suite_name, arguments, use_shard,
                                create_and_return)

    build_expected = arguments.build
    # Confirm all DUTs used for the suite are imaged to expected build.
    # hqe.host_id for jobs running in shard is not synced back to master db,
    # therefore, skip verifying dut build for jobs running in shard.
    if use_shard is False or not use_shard:
        check_dut_image(build_expected, suite_job_id)

    # Verify test results are the expected results.
    verify_test_results(suite_job_id, expected_results)

384

385

386

def verify_test_results(job_id, expected_results):
    """Verify the test results with the expected results.

    Reports three kinds of problems: tests whose status differs from the
    expected one, tests that ran but match no expected pattern, and expected
    patterns that matched no test (minus IGNORE_MISSING_TESTS). It also
    checks that the link to the job's logs can be loaded.

    @param job_id: id of the running jobs. For suite job, it is suite_job_id.
    @param expected_results: A dictionary of test name to test result.
    @raise TestPushException: If verify fails.
    """
    print('Comparing test results...')
    test_views = site_utils.get_test_views_from_tko(job_id, TKO)

    mismatch_errors = []
    extra_test_errors = []
    found_keys = set()

    for test_name, test_status in test_views.items():
        print("%s%s" % (test_name.ljust(30), test_status))
        # platform_InstallTestImage test may exist in old builds.
        if re.search('platform_InstallTestImage_SERVER_JOB$', test_name):
            continue

        matches = [(pattern, expected_status)
                   for pattern, expected_status in expected_results.items()
                   if re.search(pattern, test_name)]
        if not matches:
            extra_test_errors.append(test_name)
            continue
        for pattern, expected_status in matches:
            found_keys.add(pattern)
            if expected_status != test_status:
                mismatch_errors.append(
                        '%s Expected: [%s], Actual: [%s]' %
                        (test_name, expected_status, test_status))

    missing_test_errors = set(expected_results.keys()) - found_keys
    for ignored in IGNORE_MISSING_TESTS:
        missing_test_errors.discard(ignored)

    summary = []
    if mismatch_errors:
        summary.append(('Results of %d test(s) do not match expected '
                        'values:') % len(mismatch_errors))
        summary.extend(mismatch_errors)
        summary.append('\n')

    if extra_test_errors:
        summary.append('%d test(s) are not expected to be run:' %
                       len(extra_test_errors))
        summary.extend(extra_test_errors)
        summary.append('\n')

    if missing_test_errors:
        summary.append('%d test(s) are missing from the results:' %
                       len(missing_test_errors))
        summary.extend(missing_test_errors)
        summary.append('\n')

    # Test link to log can be loaded.
    job_name = '%s-%s' % (job_id, getpass.getuser())
    log_link = URL_PATTERN % (rpc_client_lib.add_protocol(URL_HOST), job_name)
    try:
        urllib2.urlopen(log_link).read()
    except urllib2.URLError:
        summary.append('Failed to load page for link to log: %s.' % log_link)

    if summary:
        raise TestPushException('\n'.join(summary))

453

454

Dan Shi

2015-04-07 17:37:09 -0700

[diff] [blame]

455

def test_suite_wrapper(queue, suite_name, expected_results, arguments,

Richard Barnette

2018-04-25 01:00:27 +0000

[diff] [blame]

456

use_shard=False, create_and_return=False):

Dan Shi

2015-04-07 17:37:09 -0700

[diff] [blame]

457

"""Wrapper to call test_suite. Handle exception and pipe it to parent

458

process.

459

460

@param queue: Queue to save exception to be accessed by parent process.

461

@param suite_name: Name of a suite, e.g., dummy

462

@param expected_results: A dictionary of test name to test result.

463

@param arguments: Arguments for run_suite command.

464

@param use_shard: If true, suite is scheduled for shard board.

Shuqian Zhao

2015-08-06 09:46:22 -0700

[diff] [blame]

465

@param create_and_return: If True, run_suite just creates the suite, print

466

the job id, then finish immediately.

Dan Shi

2015-04-07 17:37:09 -0700

[diff] [blame]

467

"""

468

try:

Shuqian Zhao

2015-08-06 09:46:22 -0700

[diff] [blame]

469

test_suite(suite_name, expected_results, arguments, use_shard,

Richard Barnette

2018-04-25 01:00:27 +0000

[diff] [blame]

470

create_and_return)

Allen Li

2017-11-27 15:33:54 -0800

[diff] [blame]

471

except Exception:

Dan Shi

2015-04-07 17:37:09 -0700

[diff] [blame]

472

# Store the whole exc_info leads to a PicklingError.

473

except_type, except_value, tb = sys.exc_info()

474

queue.put((except_type, except_value, traceback.extract_tb(tb)))

475

476

Dan Shi

2015-04-07 17:37:09 -0700

[diff] [blame]

477

def check_queue(queue):
    """Check the queue for any exception being raised.

    Returns immediately when the queue is empty. Otherwise one entry is
    taken; it is expected to be the (type, value, extracted traceback) tuple
    that test_suite_wrapper puts on the queue. The original stack trace is
    printed and the exception is raised in the calling process.

    @param queue: Queue used to store exception for parent process to access.
    @raise: Any exception found in the queue.
    """
    if queue.empty():
        return
    exc_info = queue.get()
    exc_type, exc_value, exc_frames = exc_info[0], exc_info[1], exc_info[2]
    # Raise the exception with original backtrace. The frames are formatted
    # so the output reads like a normal traceback rather than a list repr.
    print('Original stack trace of the exception:\n%s' %
          ''.join(traceback.format_list(exc_frames)))
    if isinstance(exc_value, BaseException):
        # Re-raise the transported instance itself: rebuilding it with
        # exc_type(exc_value) fails for exception classes whose constructor
        # does not take exactly one argument, hiding the real error.
        raise exc_value
    raise exc_type(exc_value)

489

490

Shuqian Zhao

2016-10-25 13:31:06 -0700

[diff] [blame]

491

def get_head_of_repos(repos):
    """Get HEAD of updated repos, currently are autotest and chromite repos

    Runs `git rev-parse HEAD` in each repo directory. The directory is
    passed to the child process via cwd, so the working directory of this
    process is never changed, and no shell is involved.

    @param repos: a map of repo name to the path of the repo. E.g.
                  {'autotest': '/usr/local/autotest'}
    @return: a map of repo names to the current HEAD of that repo.
    """
    updated_repo_heads = {}
    for repo_name, path_to_repo in repos.items():
        head = subprocess.check_output(
                ['git', 'rev-parse', 'HEAD'],
                # '~' in the configured path is expanded, as before.
                cwd=os.path.expanduser(path_to_repo))
        updated_repo_heads[repo_name] = head.strip()
    return updated_repo_heads

518

519

Shuqian Zhao

80d3271

2016-11-11 16:37:36 -0800

[diff] [blame]

520

def push_prod_next_branch(updated_repo_heads):
    """push prod-next branch to the tested HEAD after all tests pass.

    The push command must be ran as PUSH_USER, since only PUSH_USER has the
    right to push branches.

    @param updated_repo_heads: a map of repo names to tested HEAD of that repo.
    """
    # prod-next branch for every repo is downloaded under PUSH_USER home dir.
    git_steps = ('cd ~/{repo}; git pull; git rebase {hash} prod-next;'
                 'git push origin prod-next')
    # The {repo}/{hash} placeholders survive the %-formatting and are filled
    # in per repo below.
    command_template = "sudo su - %s -c '%s'" % (PUSH_USER, git_steps)

    for repo_name, test_hash in updated_repo_heads.items():
        push_cmd = command_template.format(hash=test_hash, repo=repo_name)
        print('Pushing %s prod-next branch to %s' % (repo_name, test_hash))
        push_output = subprocess.check_output(
                push_cmd, stderr=subprocess.STDOUT, shell=True)
        print(push_output)

Prathmesh Prabhu

2018-01-09 11:38:23 -0800

[diff] [blame]

540

def _run_test_suites(arguments):
    """Start the push_to_prod and dummy suites and wait until both finish.

    Each suite runs test_suite_wrapper in its own process. While either
    process is alive the shared queue is handed to check_queue every 5
    seconds, and once more after both have stopped.

    @param arguments: command line arguments. continue_on_failure is read
                      here; the whole object is forwarded to
                      test_suite_wrapper.
    """
    # Use daemon flag will kill child processes when parent process fails.
    kill_with_parent = not arguments.continue_on_failure
    results = multiprocessing.Queue()

    prod_worker = multiprocessing.Process(
            target=test_suite_wrapper,
            args=(results, PUSH_TO_PROD_SUITE, EXPECTED_TEST_RESULTS,
                  arguments))
    prod_worker.daemon = kill_with_parent
    prod_worker.start()

    # suite test with --create_and_return flag
    async_worker = multiprocessing.Process(
            target=test_suite_wrapper,
            args=(results, DUMMY_SUITE, EXPECTED_TEST_RESULTS_DUMMY,
                  arguments, True, True))
    # This worker is always daemonic, regardless of continue_on_failure.
    async_worker.daemon = True
    async_worker.start()

    workers = (prod_worker, async_worker)
    while any(worker.is_alive() for worker in workers):
        check_queue(results)
        time.sleep(5)
    # Final pass over the queue after both workers have stopped.
    check_queue(results)

    for worker in workers:
        worker.join()

567

568

Shuqian Zhao

2018-01-31 11:53:34 -0800

[diff] [blame]

569

def check_service_crash(respawn_limit, start_time):
    """Raise if scheduler or host_scheduler respawned too often while testing.

    For each service, every file in AUTOTEST_DIR/logs named
    '<service>.log.<YYYY-mm-dd-HH.MM.SS>' whose timestamp falls between
    start_time and now (both inclusive) is counted as one respawn.

    @param respawn_limit: The maximum number of times the service is allowed
                          to be respawned.
    @param start_time: datetime at which the testing push was kicked off.

    @raises TestPushException: if any service was respawned more than
                               respawn_limit times; the message lists every
                               offending service.
    """
    timestamp_format = "%Y-%m-%d-%H.%M.%S"
    log_names = os.listdir('%s/logs/' % AUTOTEST_DIR)
    end_time = datetime.datetime.now()

    failures = []
    for service in ('scheduler', 'host_scheduler'):
        prefix = '%s.log.' % service
        respawn_count = 0
        for name in log_names:
            if not name.startswith(prefix):
                continue
            # The remainder of the file name after the prefix is the
            # timestamp of the log.
            logged_at = datetime.datetime.strptime(name[len(prefix):],
                                                   timestamp_format)
            if start_time <= logged_at <= end_time:
                respawn_count += 1

        if respawn_count > respawn_limit:
            failures.append(
                    '%s has been respawned %s times during testing push at %s. '
                    'It is very likely crashed. Please check!\n' %
                    (service, respawn_count,
                     start_time.strftime("%Y-%m-%d-%H")))

    if failures:
        raise TestPushException(''.join(failures))

606

607

Prathmesh Prabhu

2018-01-09 11:38:23 -0800

[diff] [blame]

608

def _promote_prod_next_refs():
    """Push prod-next to the current HEAD of every repo in UPDATED_REPOS.

    @return: the map of repo names to HEAD hashes that was obtained from
             get_head_of_repos and passed to push_prod_next_branch.
    """
    tested_heads = get_head_of_repos(UPDATED_REPOS)
    push_prod_next_branch(tested_heads)
    return tested_heads

# Message template for a successful test push. It is %-formatted with a dict
# providing the 'updated_repos_msg' key.
_SUCCESS_MSG = """
All tests completed successfully, the prod branch of the following repos is
ready to be pushed to the hash list below.

%(updated_repos_msg)s

Instructions for pushing to prod are available at
https://goto.google.com/autotest-to-prod
"""

Shuqian Zhao

2017-05-30 12:56:57 -0700

[diff] [blame]

626

def _main(arguments):
    """Run the test push and promote the repo branches if it succeeds.

    On any failure the exception is re-raised, after aborting unfinished
    suite jobs unless continue_on_failure is set. The push DUTs are
    reverified on the way out whether or not the run succeeded.

    @param arguments: command line arguments.
    """
    # These are globals used by other functions in this module to communicate
    # back from worker processes.
    global _run_suite_output
    global _all_suite_ids

    # TODO Use chromite.lib.parallel.Manager instead, to workaround the
    # too-long-tmp-path problem.
    manager = multiprocessing.Manager()
    _run_suite_output = manager.list()
    _all_suite_ids = manager.list()

    try:
        start_time = datetime.datetime.now()
        reverify_all_push_duts()
        time.sleep(15)  # Wait for the verify test to start.
        check_dut_inventory(arguments.num_duts, arguments.pool)
        _run_test_suites(arguments)
        check_service_crash(arguments.service_respawn_limit, start_time)
        repo_heads = _promote_prod_next_refs()
        summary = '\n'.join(
                '%s: %s' % (repo, head)
                for repo, head in repo_heads.iteritems())
        print(_SUCCESS_MSG % {'updated_repos_msg': summary})
    except Exception:
        if arguments.continue_on_failure:
            raise
        # Abort running jobs when choose not to continue when there is
        # failure.
        for suite_id in _all_suite_ids:
            if AFE.get_jobs(id=suite_id, finished=False):
                AFE.run('abort_host_queue_entries', job=suite_id)
        raise
    finally:
        # Reverify all the hosts
        reverify_all_push_duts()

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

663

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

664

Shuqian Zhao

2017-05-30 12:56:57 -0700

[diff] [blame]

665

def main():
    """Entry point: parse sys.argv, run the test push, record the outcome.

    The 'chromeos/autotest/test_push/completed' counter is incremented exactly
    once, with its 'success' field telling whether _main returned without
    raising. An exception from _main still propagates to the caller.
    """
    arguments = parse_arguments(sys.argv)
    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
            service_name='test_push', indirect=True)
    with ts_mon_state:
        succeeded = False
        try:
            _main(arguments)
            succeeded = True
        finally:
            completed = metrics.Counter(
                    'chromeos/autotest/test_push/completed')
            completed.increment(fields={'success': succeeded})

677

Shuqian Zhao

2017-05-30 12:56:57 -0700

[diff] [blame]

678

Dan Shi

2013-07-25 15:08:48 -0700

[diff] [blame]

679

if __name__ == '__main__':

Prathmesh Prabhu