blob: 8c21794b399f61735893b5b4ee761cd1d1b35345 [file] [log] [blame]
Dan Shi7e04fa82013-07-25 15:08:48 -07001#!/usr/bin/python
2#
3# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Tool to validate code in prod branch before pushing to lab.
8
The script runs the push_to_prod suite to verify that code in the prod branch
is ready to be pushed. Link to design document:
https://docs.google.com/a/google.com/document/d/1JMz0xS3fZRSHMpFkkKAL_rxsdbNZomhHbC3B8L71uuI/edit

To verify whether the prod branch can be pushed to the lab, run the following
command on the chromeos-staging-master2.hot server:
/usr/local/autotest/site_utils/test_push.py

The script uses the latest gandof stable build as the test build by default.
Dan Shi7e04fa82013-07-25 15:08:48 -070018
19"""
20
21import argparse
Shuqian Zhao1f311c02016-09-01 19:30:54 -070022import ast
Shuqian Zhao0de876d2018-01-31 11:53:34 -080023import datetime
Dan Shi7e04fa82013-07-25 15:08:48 -070024import getpass
Dan Shief1a5c02015-04-07 17:37:09 -070025import multiprocessing
Dan Shi7e04fa82013-07-25 15:08:48 -070026import os
27import re
28import subprocess
29import sys
Dan Shief1a5c02015-04-07 17:37:09 -070030import time
31import traceback
Dan Shi7e04fa82013-07-25 15:08:48 -070032import urllib2
33
34import common
Dan Shia8da7602014-05-09 15:18:15 -070035try:
36 from autotest_lib.frontend import setup_django_environment
37 from autotest_lib.frontend.afe import models
Shuqian Zhao327b6952016-09-12 10:42:03 -070038 from autotest_lib.frontend.afe import rpc_utils
Dan Shia8da7602014-05-09 15:18:15 -070039except ImportError:
40 # Unittest may not have Django database configured and will fail to import.
41 pass
Dan Shi5fa602c2015-03-26 17:54:13 -070042from autotest_lib.client.common_lib import global_config
Shuqian Zhao327b6952016-09-12 10:42:03 -070043from autotest_lib.client.common_lib import priorities
Shuqian Zhaof239b312017-12-05 16:45:02 -080044from autotest_lib.client.common_lib.cros import retry
Prathmesh Prabhucd246f52018-01-03 13:45:48 -080045from autotest_lib.frontend.afe import rpc_client_lib
Xixuan Wu93e646c2017-12-07 18:36:10 -080046from autotest_lib.server import constants
Dan Shi7e04fa82013-07-25 15:08:48 -070047from autotest_lib.server import site_utils
Shuqian Zhao327b6952016-09-12 10:42:03 -070048from autotest_lib.server import utils
Dan Shi47d32882014-12-22 16:25:05 -080049from autotest_lib.server.cros import provision
Dan Shi7e04fa82013-07-25 15:08:48 -070050from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
Xixuan Wu665cfad2018-08-10 10:08:14 -070051from autotest_lib.site_utils import test_push_common
Dan Shi7e04fa82013-07-25 15:08:48 -070052
AUTOTEST_DIR = common.autotest_dir
CONFIG = global_config.global_config

# Module-wide AFE / TKO clients, created with the retry settings given here.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.5, delay_sec=2)
TKO = frontend_wrappers.RetryingTKO(timeout_min=0.1, delay_sec=10)

MAIL_FROM = 'chromeos-test@google.com'
# Raw string so the regex escapes are not interpreted as string escapes.
BUILD_REGEX = r'R[\d]+-[\d]+\.[\d]+\.[\d]+'
RUN_SUITE_COMMAND = 'run_suite.py'
PUSH_TO_PROD_SUITE = 'push_to_prod'
DUMMY_SUITE = 'dummy'
# Default for --timeout_min, in minutes.
DEFAULT_TIMEOUT_MIN_FOR_SUITE_JOB = 30
IMAGE_BUCKET = CONFIG.get_config_value('CROS', 'image_storage_server')
# (board, required number of DUTs) pairs; dict() of this is the default for
# --num_duts.
DEFAULT_NUM_DUTS = (
    ('gandof', 4),
    ('quawks', 2),
)

# Matches the run_suite output line that reports the created suite job;
# group 1 is the suite job id.
SUITE_JOB_START_INFO_REGEX = (r'^.*Created suite job:.*'
                              r'tab_id=view_job&object_id=(\d+)$')
Dan Shi7e04fa82013-07-25 15:08:48 -070073
# Expected test status, keyed by a regular expression matching the test name.
# NOTE(review): the functions in this file pass the copies defined in
# test_push_common rather than these module-level dictionaries.
EXPECTED_TEST_RESULTS = {
    '^SERVER_JOB$': 'GOOD',
    # This is related to dummy_Fail/control.dependency.
    'dummy_Fail.dependency$': 'TEST_NA',
    'login_LoginSuccess.*': 'GOOD',
    'provision_AutoUpdate.double': 'GOOD',
    'dummy_Pass.*': 'GOOD',
    'dummy_Fail.Fail$': 'FAIL',
    'dummy_Fail.Error$': 'ERROR',
    'dummy_Fail.Warn$': 'WARN',
    'dummy_Fail.NAError$': 'TEST_NA',
    'dummy_Fail.Crash$': 'GOOD',
    'autotest_SyncCount$': 'GOOD',
}

# Expected results for the dummy suite.
EXPECTED_TEST_RESULTS_DUMMY = {
    '^SERVER_JOB$': 'GOOD',
    'dummy_Pass.*': 'GOOD',
    'dummy_Fail.Fail': 'FAIL',
    'dummy_Fail.Warn': 'WARN',
    'dummy_Fail.Crash': 'GOOD',
    'dummy_Fail.Error': 'ERROR',
    'dummy_Fail.NAError': 'TEST_NA',
}

# Expected results for the powerwash job.
EXPECTED_TEST_RESULTS_POWERWASH = {
    'platform_Powerwash': 'GOOD',
    'SERVER_JOB': 'GOOD',
}
99
# Pieces used to build the log URL that verify_test_results tries to load.
URL_HOST = CONFIG.get_config_value('SERVER', 'hostname', type=str)
URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)

# Tests that may be extra, missing, or have mismatched results for various
# reasons. Every entry added here must come with an explanation.
_IGNORED_TESTS = [
    # test_push uses a stable image build to test, which is quite behind ToT.
    # The following expectations are correct at ToT, but need to be ignored
    # until the stable image is recent enough.

    # TODO(pprabhu): Remove once R70 is stable.
    'dummy_Fail.RetrySuccess',
    'dummy_Fail.RetryFail',
]

# Shared between the background suite-running processes and the main process.
# These plain lists are placeholders; _main rebinds both names to
# multiprocessing manager lists.
_run_suite_output = []
_all_suite_ids = []

# Default for --service_respawn_limit.
DEFAULT_SERVICE_RESPAWN_LIMIT = 2
122
123
class TestPushException(Exception):
    """Raised when the push-to-prod test run fails."""
127
@retry.retry(TestPushException, timeout_min=5, delay_sec=30)
def check_dut_inventory(required_num_duts, pool):
    """Verify that the pool has enough Ready DUTs of every required platform.

    @param required_num_duts: a dict mapping platform name to the number of
                              DUTs that platform needs for the push tests.
    @param pool: the pool used by test_push.
    @raise TestPushException: if any platform has fewer Ready, unlocked DUTs
                              in the pool than required.
    """
    print('Checking DUT inventory...')
    pool_label = constants.Labels.POOL_PREFIX + pool
    ready_hosts = AFE.run('get_hosts', status='Ready', locked=False)

    # Count the Ready hosts of the pool per platform.
    current_inventory = {}
    for host in ready_hosts:
        if pool_label not in host.get('labels', []):
            continue
        platform = host['platform']
        current_inventory[platform] = current_inventory.get(platform, 0) + 1

    shortages = []
    for platform, req_num in required_num_duts.items():
        curr_num = current_inventory.get(platform, 0)
        if curr_num < req_num:
            shortages.append(
                    '\nRequire %d %s DUTs in pool: %s, only %d are Ready'
                    ' now' % (req_num, platform, pool, curr_num))

    error_msg = ''.join(shortages)
    if error_msg:
        raise TestPushException('Not enough DUTs to run push tests. %s' %
                                error_msg)
152
Dan Shi5ba5d2e2014-05-09 13:47:00 -0700153
def powerwash_dut_to_test_repair(hostname, timeout):
    """Run platform_Powerwash on a DUT to exercise the repair workflow.

    Creates a 'powerwash' job on the host, polls TKO every 10 seconds until
    the job has test statuses, verifies them, then asks the AFE to reverify
    the host.

    @param hostname: hostname of the dut.
    @param timeout: seconds to wait for powerwash results before aborting.
    @raise TestPushException: if the job times out or its results do not
                              match the expected ones.
    """
    powerwash_test = models.Test.objects.get(name='platform_Powerwash')
    control_file = utils.read_file(
            os.path.join(AUTOTEST_DIR, powerwash_test.path))
    job_id = rpc_utils.create_job_common(
            'powerwash',
            priority=priorities.Priority.SUPER,
            control_type='Server',
            control_file=control_file,
            hosts=[hostname])

    deadline = time.time() + timeout
    while True:
        if TKO.get_job_test_statuses_from_db(job_id):
            break
        if time.time() >= deadline:
            AFE.run('abort_host_queue_entries', job=job_id)
            raise TestPushException(
                    'Powerwash test on %s timeout after %ds, abort it.' %
                    (hostname, timeout))
        time.sleep(10)

    verify_test_results(job_id,
                        test_push_common.EXPECTED_TEST_RESULTS_POWERWASH)
    # Kick off verify, verify will fail and a repair should be triggered.
    AFE.reverify_hosts(hostnames=[hostname])
Kevin Cheng6e4c2642015-12-11 09:45:57 -0800179
180
def reverify_all_push_duts():
    """Ask the AFE to reverify every host it returns from get_hosts()."""
    print('Reverifying all DUTs.')
    hostnames = []
    for host in AFE.get_hosts():
        hostnames.append(host.hostname)
    AFE.reverify_hosts(hostnames=hostnames)
186
187
def parse_arguments(argv):
    """Parse arguments for test_push tool.

    When --build or --shard_build is not given, it is filled in from the
    AFE stable version map for the corresponding board.

    @param argv   Argument vector, as for `sys.argv`, including the
                  command name in `argv[0]`.
    @return: Parsed arguments.

    """
    parser = argparse.ArgumentParser(prog=argv[0])
    parser.add_argument('-b', '--board', dest='board', default='gandof',
                        help='Default is gandof.')
    parser.add_argument('-sb', '--shard_board', dest='shard_board',
                        default='quawks',
                        help='Default is quawks.')
    parser.add_argument('-i', '--build', dest='build', default=None,
                        help='Default is the latest stable build of given '
                             'board. Must be a stable build, otherwise AU test '
                             'will fail. (ex: gandof-release/R54-8743.25.0)')
    parser.add_argument('-si', '--shard_build', dest='shard_build',
                        default=None,
                        help='Default is the latest stable build of given '
                             'board. Must be a stable build, otherwise AU test '
                             'will fail.')
    parser.add_argument('-p', '--pool', dest='pool', default='bvt')
    parser.add_argument('-t', '--timeout_min', dest='timeout_min', type=int,
                        default=DEFAULT_TIMEOUT_MIN_FOR_SUITE_JOB,
                        help='Time in mins to wait before abort the jobs we '
                             'are waiting on. Only for the asynchronous suites '
                             'triggered by create_and_return flag.')
    # The value is parsed with ast.literal_eval, so it must be a Python
    # literal; the default is already a dict and is used as-is.
    parser.add_argument('-ud', '--num_duts', dest='num_duts',
                        default=dict(DEFAULT_NUM_DUTS),
                        type=ast.literal_eval,
                        help="Python dict literal that specifies the required"
                        " number of DUTs for each board. E.g {'gandof':4}")
    parser.add_argument('-c', '--continue_on_failure', action='store_true',
                        dest='continue_on_failure',
                        help='All tests continue to run when there is failure')
    parser.add_argument('-sl', '--service_respawn_limit', type=int,
                        default=DEFAULT_SERVICE_RESPAWN_LIMIT,
                        help='If a service crashes more than this, the test '
                             'push is considered failed.')

    arguments = parser.parse_args(argv[1:])

    # Get latest stable build as default build.
    version_map = AFE.get_stable_version_map(AFE.CROS_IMAGE_TYPE)
    if not arguments.build:
        arguments.build = version_map.get_image_name(arguments.board)
    if not arguments.shard_build:
        arguments.shard_build = version_map.get_image_name(
            arguments.shard_board)
    return arguments
239
240
def do_run_suite(suite_name, arguments, use_shard=False,
                 create_and_return=False):
    """Call run_suite to run a suite job, and return the suite job id.

    The run_suite output is echoed to stdout and recorded in
    _run_suite_output. When create_and_return is set, run_suite only creates
    the suite, so this function then polls the AFE until the suite job has
    finished or arguments.timeout_min minutes have passed.

    @param suite_name: Name of a suite, e.g., dummy.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.

    @return: Suite job ID.
    @raise TestPushException: if the suite job id cannot be found in the
            run_suite output, if no DUT is available for the powerwash
            repair test, or if the asynchronous suite times out.

    """
    if use_shard:
        board = arguments.shard_board
        build = arguments.shard_build
    else:
        board = arguments.board
        build = arguments.build

    # Remove cros-version label to force provision.
    hosts = AFE.get_hosts(label=constants.Labels.BOARD_PREFIX+board,
                          locked=False)
    last_host = None
    for host in hosts:
        last_host = host
        labels_to_remove = [
                l for l in host.labels
                if l.startswith(provision.CROS_VERSION_PREFIX)]
        if labels_to_remove:
            AFE.run('host_remove_labels', id=host.id, labels=labels_to_remove)

    # Test repair work flow on shards, powerwash test will timeout after 7m.
    if use_shard and not create_and_return:
        # The powerwash test runs on the last host returned for the board.
        # Without any host there is nothing to powerwash; fail explicitly
        # instead of hitting a NameError on the loop variable.
        if last_host is None:
            raise TestPushException(
                    'No unlocked %s DUT found to run the powerwash repair '
                    'test.' % board)
        powerwash_dut_to_test_repair(last_host.hostname, timeout=420)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    cmd = [os.path.join(current_dir, RUN_SUITE_COMMAND),
           '-s', suite_name,
           '-b', board,
           '-i', build,
           '-p', arguments.pool,
           '--minimum_duts', str(arguments.num_duts[board])]
    if create_and_return:
        cmd += ['-c']

    suite_job_id = None

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    # Read until EOF on the pipe, then reap the process. This avoids
    # spinning on empty reads between EOF and process exit.
    for line in iter(proc.stdout.readline, ''):
        print(line.rstrip())
        _run_suite_output.append(line.rstrip())

        if not suite_job_id:
            m = re.match(SUITE_JOB_START_INFO_REGEX, line)
            if m and m.group(1):
                suite_job_id = int(m.group(1))
                _all_suite_ids.append(suite_job_id)
    proc.stdout.close()
    proc.wait()

    if not suite_job_id:
        raise TestPushException('Failed to retrieve suite job ID.')

    # With create_and_return, run_suite has already exited without waiting,
    # so wait for the suite job to finish here.
    if create_and_return:
        end = time.time() + arguments.timeout_min * 60
        while not AFE.get_jobs(id=suite_job_id, finished=True):
            if time.time() < end:
                time.sleep(10)
            else:
                AFE.run('abort_host_queue_entries', job=suite_job_id)
                raise TestPushException(
                        'Asynchronous suite triggered by create_and_return '
                        'flag has timed out after %d mins. Aborting it.' %
                        arguments.timeout_min)

    print('Suite job %s is completed.' % suite_job_id)
    return suite_job_id
326
327
def check_dut_image(build, suite_job_id):
    """Confirm every DUT used by the suite is imaged with the expected build.

    The DUTs are taken from the first host queue entry of each child job of
    the suite job.

    @param build: Expected build to be imaged.
    @param suite_job_id: job ID of the suite job.
    @raise TestPushException: If a DUT does not have expected build imaged.
    """
    print('Checking image installed in DUTs...')
    child_jobs = models.Job.objects.filter(parent_job_id=suite_job_id)
    hostnames = set()
    for job in child_jobs:
        first_hqe = models.HostQueueEntry.objects.filter(job_id=job.id)[0]
        hostnames.add(first_hqe.host.hostname)

    for hostname in hostnames:
        found_build = site_utils.get_build_from_afe(hostname, AFE)
        if found_build == build:
            continue
        raise TestPushException('DUT is not imaged properly. Host %s has '
                                'build %s, while build %s is expected.' %
                                (hostname, found_build, build))
Dan Shia8da7602014-05-09 15:18:15 -0700347
348
def test_suite(suite_name, expected_results, arguments, use_shard=False,
               create_and_return=False):
    """Run a suite through run_suite and check its outcome.

    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    """
    suite_job_id = do_run_suite(suite_name, arguments, use_shard=use_shard,
                                create_and_return=create_and_return)

    # hqe.host_id for jobs running in shard is not synced back to master db,
    # so the DUT image can only be checked for suites not run on a shard.
    expected_build = arguments.build
    if not use_shard:
        check_dut_image(expected_build, suite_job_id)

    # The test results must match the expected results.
    verify_test_results(suite_job_id, expected_results)
372
373
def verify_test_results(job_id, expected_results):
    """Compare a job's TKO test results with the expected results.

    Also checks that the link to the job's logs can be loaded.

    @param job_id: id of the running jobs. For suite job, it is suite_job_id.
    @param expected_results: A dictionary of test name to test result.
    @raise TestPushException: If verify fails.
    """
    print('Comparing test results...')
    views = site_utils.get_test_views_from_tko(job_id, TKO)
    problems = test_push_common.summarize_push(views, expected_results,
                                               _IGNORED_TESTS)

    # Test link to log can be loaded.
    url_base = rpc_client_lib.add_protocol(URL_HOST)
    job_name = '%s-%s' % (job_id, getpass.getuser())
    log_link = URL_PATTERN % (url_base, job_name)
    try:
        urllib2.urlopen(log_link).read()
    except urllib2.URLError:
        problems.append('Failed to load page for link to log: %s.' % log_link)

    if problems:
        raise TestPushException('\n'.join(problems))
396
def test_suite_wrapper(queue, suite_name, expected_results, arguments,
                       use_shard=False, create_and_return=False):
    """Run test_suite and forward any exception to the parent process.

    @param queue: Queue to save exception to be accessed by parent process.
    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    """
    try:
        test_suite(suite_name, expected_results, arguments, use_shard,
                   create_and_return)
    except Exception:
        # Storing the whole exc_info leads to a PicklingError, so the
        # traceback is replaced by its extracted entries.
        exc_type, exc_value, exc_tb = sys.exc_info()
        forwarded = (exc_type, exc_value, traceback.extract_tb(exc_tb))
        queue.put(forwarded)
417
418
def check_queue(queue):
    """Re-raise an exception that a child process put in the queue, if any.

    Each queue item is an (exception type, exception value, extracted
    traceback entries) tuple. The traceback of the original failure is
    printed before the exception is raised in the calling process.

    @param queue: Queue used to store exception for parent process to access.
    @raise: Any exception found in the queue.
    """
    if queue.empty():
        return
    exc_type, exc_value, tb_entries = queue.get()
    # The original traceback object is not available here, only its extracted
    # entries, so print them in the usual traceback layout.
    print('Original stack trace of the exception:\n%s' %
          ''.join(traceback.format_list(tb_entries)))
    # Raise the forwarded instance itself when possible: rebuilding it as
    # exc_type(exc_value) fails for exception classes whose constructor takes
    # more than one argument.
    if isinstance(exc_value, exc_type):
        raise exc_value
    raise exc_type(exc_value)
431
432
def _run_test_suites(arguments):
    """Run the push_to_prod and dummy suites in parallel child processes.

    The queue is polled every 5 seconds while either child is alive; any
    exception a child reported is raised in this process by check_queue.

    @param arguments: command line arguments.
    """
    # Use daemon flag will kill child processes when parent process fails.
    use_daemon = not arguments.continue_on_failure
    queue = multiprocessing.Queue()

    push_to_prod_args = (queue, PUSH_TO_PROD_SUITE,
                         test_push_common.EXPECTED_TEST_RESULTS, arguments)
    push_to_prod_suite = multiprocessing.Process(target=test_suite_wrapper,
                                                 args=push_to_prod_args)
    push_to_prod_suite.daemon = use_daemon
    push_to_prod_suite.start()

    # suite test with --create_and_return flag, scheduled for the shard board.
    asynchronous_args = (queue, DUMMY_SUITE,
                         test_push_common.EXPECTED_TEST_RESULTS_DUMMY,
                         arguments, True, True)
    asynchronous_suite = multiprocessing.Process(target=test_suite_wrapper,
                                                 args=asynchronous_args)
    asynchronous_suite.daemon = True
    asynchronous_suite.start()

    workers = (push_to_prod_suite, asynchronous_suite)
    while any(worker.is_alive() for worker in workers):
        check_queue(queue)
        time.sleep(5)
    # One more check for an exception reported after the last poll.
    check_queue(queue)
    for worker in workers:
        worker.join()
461
462
def check_service_crash(respawn_limit, start_time):
    """Check whether scheduler or host_scheduler crash during testing.

    A service is considered crashed when the number of its log files in
    AUTOTEST_DIR/logs whose name carries a timestamp between start_time and
    now is over respawn_limit. Log files whose suffix is not a
    '%Y-%m-%d-%H.%M.%S' timestamp are ignored.

    @param respawn_limit: The maximum number of times the service is allowed to
                          be respawn.
    @param start_time: The time that testing push is kicked off.
    @raise TestPushException: if any service was respawned more than
                              respawn_limit times.
    """
    def _parse(filename_prefix, filename):
        """Helper method to parse the time of the log.

        @param filename_prefix: The prefix of the filename.
        @param filename: The name of the log file.
        @return: The parsed datetime, or None if the suffix is not a
                 timestamp.
        """
        try:
            return datetime.datetime.strptime(filename[len(filename_prefix):],
                                              "%Y-%m-%d-%H.%M.%S")
        except ValueError:
            # Not a timestamped log name; it must not abort the whole check.
            return None

    services = ['scheduler', 'host_scheduler']
    logs = os.listdir('%s/logs/' % AUTOTEST_DIR)
    curr_time = datetime.datetime.now()

    failures = []
    for service in services:
        log_prefix = '%s.log.' % service
        respawn_count = 0
        for log_name in logs:
            if not log_name.startswith(log_prefix):
                continue
            log_time = _parse(log_prefix, log_name)
            if log_time is not None and start_time <= log_time <= curr_time:
                respawn_count += 1

        if respawn_count > respawn_limit:
            failures.append(
                    '%s has been respawned %s times during testing push at %s. '
                    'It is very likely crashed. Please check!\n' %
                    (service, respawn_count,
                     start_time.strftime("%Y-%m-%d-%H")))
    if failures:
        raise TestPushException(''.join(failures))
500
501
Prathmesh Prabhubac5be02018-01-09 11:38:23 -0800502_SUCCESS_MSG = """
Jacob Kopczynski0ea4e042018-08-14 11:21:48 -0700503All tests completed successfully.
Prathmesh Prabhubac5be02018-01-09 11:38:23 -0800504
505Instructions for pushing to prod are available at
506https://goto.google.com/autotest-to-prod
507"""
508
509
def _main(arguments):
    """Run the whole test push and report success.

    On failure, unfinished suite jobs are aborted unless continue_on_failure
    is set, and the exception is re-raised. All DUTs are reverified at the
    end in either case.

    @param arguments: command line arguments.
    """
    # These are globals used by other functions in this module to communicate
    # back from worker processes.
    global _run_suite_output, _all_suite_ids

    # TODO Use chromite.lib.parallel.Manager instead, to workaround the
    # too-long-tmp-path problem.
    mpmanager = multiprocessing.Manager()
    _run_suite_output = mpmanager.list()
    _all_suite_ids = mpmanager.list()

    try:
        start_time = datetime.datetime.now()
        reverify_all_push_duts()
        time.sleep(15)  # Wait for the verify test to start.
        check_dut_inventory(arguments.num_duts, arguments.pool)
        _run_test_suites(arguments)
        check_service_crash(arguments.service_respawn_limit, start_time)
        print(_SUCCESS_MSG)
    except Exception:
        # Abort running jobs unless flagged to continue when there is a
        # failure.
        if arguments.continue_on_failure:
            raise
        for suite_id in _all_suite_ids:
            if AFE.get_jobs(id=suite_id, finished=False):
                AFE.run('abort_host_queue_entries', job=suite_id)
        raise
    finally:
        # Reverify all the hosts
        reverify_all_push_duts()
Dan Shi7e04fa82013-07-25 15:08:48 -0700544
Dan Shi7e04fa82013-07-25 15:08:48 -0700545
def main():
    """Entry point: parse the command line and run the test push."""
    _main(parse_arguments(sys.argv))


if __name__ == '__main__':
    main()