blob: b5d562445ac2d602bed0e8ea0304c024ce7b9e28 [file] [log] [blame]
Dan Shi7e04fa82013-07-25 15:08:48 -07001#!/usr/bin/python
2#
3# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Tool to validate code in prod branch before pushing to lab.
8
9The script runs push_to_prod suite to verify code in prod branch is ready to be
10pushed. Link to design document:
11https://docs.google.com/a/google.com/document/d/1JMz0xS3fZRSHMpFkkKAL_rxsdbNZomhHbC3B8L71uuI/edit
12
13To verify if prod branch can be pushed to lab, run following command in
Shuqian Zhaobb030ff2017-09-21 17:36:13 -070014chromeos-staging-master2.hot server:
Michael Liang52d9f1f2014-06-17 15:01:24 -070015/usr/local/autotest/site_utils/test_push.py -e someone@company.com
Dan Shi7e04fa82013-07-25 15:08:48 -070016
Shuqian Zhaof3a114c2016-09-21 11:02:15 -070017The script uses latest gandof stable build as test build by default.
Dan Shi7e04fa82013-07-25 15:08:48 -070018
19"""
20
21import argparse
Shuqian Zhao1f311c02016-09-01 19:30:54 -070022import ast
Shuqian Zhao7b2daea2016-10-25 13:31:06 -070023from contextlib import contextmanager
Shuqian Zhao0de876d2018-01-31 11:53:34 -080024import datetime
Dan Shi7e04fa82013-07-25 15:08:48 -070025import getpass
Dan Shief1a5c02015-04-07 17:37:09 -070026import multiprocessing
Dan Shi7e04fa82013-07-25 15:08:48 -070027import os
28import re
29import subprocess
30import sys
Dan Shief1a5c02015-04-07 17:37:09 -070031import time
32import traceback
Dan Shi7e04fa82013-07-25 15:08:48 -070033import urllib2
34
35import common
Dan Shia8da7602014-05-09 15:18:15 -070036try:
37 from autotest_lib.frontend import setup_django_environment
38 from autotest_lib.frontend.afe import models
Shuqian Zhao327b6952016-09-12 10:42:03 -070039 from autotest_lib.frontend.afe import rpc_utils
Dan Shia8da7602014-05-09 15:18:15 -070040except ImportError:
41 # Unittest may not have Django database configured and will fail to import.
42 pass
Dan Shi5fa602c2015-03-26 17:54:13 -070043from autotest_lib.client.common_lib import global_config
Shuqian Zhao327b6952016-09-12 10:42:03 -070044from autotest_lib.client.common_lib import priorities
Shuqian Zhaof239b312017-12-05 16:45:02 -080045from autotest_lib.client.common_lib.cros import retry
Prathmesh Prabhucd246f52018-01-03 13:45:48 -080046from autotest_lib.frontend.afe import rpc_client_lib
Xixuan Wu93e646c2017-12-07 18:36:10 -080047from autotest_lib.server import constants
Dan Shi7e04fa82013-07-25 15:08:48 -070048from autotest_lib.server import site_utils
Shuqian Zhao327b6952016-09-12 10:42:03 -070049from autotest_lib.server import utils
Dan Shi47d32882014-12-22 16:25:05 -080050from autotest_lib.server.cros import provision
Dan Shi7e04fa82013-07-25 15:08:48 -070051from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
Dan Shi7e04fa82013-07-25 15:08:48 -070052
Shuqian Zhao56969542017-05-30 12:56:57 -070053try:
54 from chromite.lib import metrics
55 from chromite.lib import ts_mon_config
56except ImportError:
57 metrics = site_utils.metrics_mock
58 ts_mon_config = site_utils.metrics_mock
59
AUTOTEST_DIR = common.autotest_dir
CONFIG = global_config.global_config

# RPC proxies shared by every function in this module.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.5, delay_sec=2)
TKO = frontend_wrappers.RetryingTKO(timeout_min=0.1, delay_sec=10)

MAIL_FROM = 'chromeos-test@google.com'
# Regular expressions are written as raw strings so that sequences such as
# `\d` and `\.` reach the regex engine untouched instead of relying on Python
# passing unknown string escapes through.
BUILD_REGEX = r'R[\d]+-[\d]+\.[\d]+\.[\d]+'
RUN_SUITE_COMMAND = 'run_suite.py'
PUSH_TO_PROD_SUITE = 'push_to_prod'
DUMMY_SUITE = 'dummy'
DEFAULT_TIMEOUT_MIN_FOR_SUITE_JOB = 30
IMAGE_BUCKET = CONFIG.get_config_value('CROS', 'image_storage_server')
# (board, number of DUTs) pairs; converted to a dict to form the default of
# the --num_duts command line option.
DEFAULT_NUM_DUTS = (
    ('gandof', 4),
    ('quawks', 2),
)

# Matches the run_suite output line that announces the suite job; group 1
# captures the suite job id.
SUITE_JOB_START_INFO_REGEX = (r'^.*Created suite job:.*'
                              r'tab_id=view_job&object_id=(\d+)$')
Dan Shi7e04fa82013-07-25 15:08:48 -070080
# Expected status of every test of the push_to_prod suite. Each key is a
# regular expression that is searched for in the test name.
EXPECTED_TEST_RESULTS = {
    '^SERVER_JOB$': 'GOOD',
    # This is related to dummy_Fail/control.dependency.
    'dummy_Fail.dependency$': 'TEST_NA',
    'login_LoginSuccess.*': 'GOOD',
    'provision_AutoUpdate.double': 'GOOD',
    'dummy_Pass.*': 'GOOD',
    'dummy_Fail.Fail$': 'FAIL',
    'dummy_Fail.RetryFail$': 'FAIL',
    'dummy_Fail.RetrySuccess': 'GOOD',
    'dummy_Fail.Error$': 'ERROR',
    'dummy_Fail.Warn$': 'WARN',
    'dummy_Fail.NAError$': 'TEST_NA',
    'dummy_Fail.Crash$': 'GOOD',
    'autotest_SyncCount$': 'GOOD',
}

# Expected status of every test of the dummy suite.
EXPECTED_TEST_RESULTS_DUMMY = {
    '^SERVER_JOB$': 'GOOD',
    'dummy_Pass.*': 'GOOD',
    'dummy_Fail.Fail': 'FAIL',
    'dummy_Fail.Warn': 'WARN',
    'dummy_Fail.Crash': 'GOOD',
    'dummy_Fail.Error': 'ERROR',
    'dummy_Fail.NAError': 'TEST_NA',
}

# Expected results of the job created by powerwash_dut_to_test_repair.
EXPECTED_TEST_RESULTS_POWERWASH = {
    'platform_Powerwash': 'GOOD',
    'SERVER_JOB': 'GOOD',
}
108
URL_HOST = CONFIG.get_config_value('SERVER', 'hostname', type=str)
URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)

# Tests that are allowed to be absent from the test results. Every entry
# must be accompanied by a comment explaining why it may be missing.
# NOTE(review): verify_test_results drops these entries from the set of
# missing expected-result keys by exact string comparison, so an entry only
# has an effect when it is spelled exactly like a key of the expected-results
# dict. 'login_LoginSuccess' differs from the 'login_LoginSuccess.*' key of
# EXPECTED_TEST_RESULTS -- confirm whether that is intended.
IGNORE_MISSING_TESTS = [
    # For latest build, npo_test_delta does not exist.
    'autoupdate_EndToEndTest.npo_test_delta.*',
    # For trybot build, nmo_test_delta does not exist.
    'autoupdate_EndToEndTest.nmo_test_delta.*',
    # Older build does not have login_LoginSuccess test in push_to_prod suite.
    # TODO(dshi): Remove following lines after R41 is stable.
    'login_LoginSuccess',
]

# Data shared between the main process and the background suite-running
# processes. The plain lists below are placeholders: _main rebinds both names
# to multiprocessing-compatible proxy lists before any suite is started.
_run_suite_output = []
_all_suite_ids = []

# Maps the name of every updated repo to the path of its checkout.
UPDATED_REPOS = {
    'autotest': AUTOTEST_DIR,
    'chromite': '%s/site-packages/chromite/' % AUTOTEST_DIR,
}
PUSH_USER = 'chromeos-test-lab'

DEFAULT_SERVICE_RESPAWN_LIMIT = 2
135
136
class TestPushException(Exception):
    """Raised when a step of the push-to-prod test fails."""
140
@retry.retry(TestPushException, timeout_min=5, delay_sec=30)
def check_dut_inventory(required_num_duts, pool):
    """Verify the pool holds enough Ready DUTs of every required platform.

    Only hosts that are Ready, unlocked and labelled with the given pool are
    counted.

    @param required_num_duts: a dict mapping each platform to the number of
                              DUTs it needs in order to finish push tests.
    @param pool: the pool used by test_push.
    @raise TestPushException: if any platform has fewer DUTs available than
                              required.
    """
    print('Checking DUT inventory...')
    pool_label = constants.Labels.POOL_PREFIX + pool

    # Tally the usable hosts of the pool by platform.
    ready_per_platform = {}
    for host in AFE.run('get_hosts', status='Ready', locked=False):
        if pool_label not in host.get('labels', []):
            continue
        platform = host['platform']
        ready_per_platform[platform] = ready_per_platform.get(platform, 0) + 1

    # Collect one message per platform that falls short, so that a single
    # exception reports every shortfall at once.
    shortfalls = []
    for platform, needed in required_num_duts.items():
        available = ready_per_platform.get(platform, 0)
        if available < needed:
            shortfalls.append(
                    '\nRequire %d %s DUTs in pool: %s, only %d are Ready now'
                    % (needed, platform, pool, available))
    if shortfalls:
        raise TestPushException('Not enough DUTs to run push tests. %s' %
                                ''.join(shortfalls))
165
Dan Shi5ba5d2e2014-05-09 13:47:00 -0700166
def powerwash_dut_to_test_repair(hostname, timeout):
    """Run platform_Powerwash on a DUT to exercise the repair workflow.

    Creates a 'powerwash' job for the host, polls TKO every 10 seconds until
    test statuses for the job show up, checks them against
    EXPECTED_TEST_RESULTS_POWERWASH and finally schedules a reverify of the
    host.

    @param hostname: hostname of the dut.
    @param timeout: seconds to wait for the powerwash test statuses before
                    the job is aborted.
    @raise TestPushException: if the statuses do not show up within `timeout`
                              seconds or do not match the expected results.
    """
    powerwash_test = models.Test.objects.get(name='platform_Powerwash')
    control_file = utils.read_file(
            os.path.join(common.autotest_dir, powerwash_test.path))
    job_id = rpc_utils.create_job_common(
            'powerwash', priority=priorities.Priority.SUPER,
            control_type='Server', control_file=control_file,
            hosts=[hostname])

    deadline = time.time() + timeout
    while True:
        if TKO.get_job_test_statuses_from_db(job_id):
            break
        if time.time() >= deadline:
            AFE.run('abort_host_queue_entries', job=job_id)
            raise TestPushException(
                    'Powerwash test on %s timeout after %ds, abort it.' %
                    (hostname, timeout))
        time.sleep(10)

    verify_test_results(job_id, EXPECTED_TEST_RESULTS_POWERWASH)
    # Kick off verify, verify will fail and a repair should be triggered.
    AFE.reverify_hosts(hostnames=[hostname])
Kevin Cheng6e4c2642015-12-11 09:45:57 -0800191
192
def reverify_all_push_duts():
    """Schedule a reverify for every host returned by the AFE."""
    print('Reverifying all DUTs.')
    all_hostnames = []
    for host in AFE.get_hosts():
        all_hostnames.append(host.hostname)
    AFE.reverify_hosts(hostnames=all_hostnames)
198
199
def parse_arguments(argv):
    """Parse arguments for test_push tool.

    A build that is not given on the command line defaults to the image name
    that the AFE stable version map reports for the corresponding board.

    @param argv: Argument vector, as for `sys.argv`, including the
                 command name in `argv[0]`.
    @return: Parsed arguments.

    """
    parser = argparse.ArgumentParser(prog=argv[0])
    parser.add_argument('-b', '--board', dest='board', default='gandof',
                        help='Default is gandof.')
    parser.add_argument('-sb', '--shard_board', dest='shard_board',
                        default='quawks',
                        help='Default is quawks.')
    # The help text used to read "latest stale build" and showed a
    # "gandolf-release" example; both were typos ("stable", board "gandof").
    parser.add_argument('-i', '--build', dest='build', default=None,
                        help='Default is the latest stable build of given '
                             'board. Must be a stable build, otherwise AU test '
                             'will fail. (ex: gandof-release/R54-8743.25.0)')
    parser.add_argument('-si', '--shard_build', dest='shard_build',
                        default=None,
                        help='Default is the latest stable build of given '
                             'board. Must be a stable build, otherwise AU test '
                             'will fail.')
    parser.add_argument('-p', '--pool', dest='pool', default='bvt')
    parser.add_argument('-t', '--timeout_min', dest='timeout_min', type=int,
                        default=DEFAULT_TIMEOUT_MIN_FOR_SUITE_JOB,
                        help='Time in mins to wait before abort the jobs we '
                             'are waiting on. Only for the asynchronous suites '
                             'triggered by create_and_return flag.')
    # ast.literal_eval only accepts Python literals, so the dict given on the
    # command line is parsed without executing arbitrary code.
    parser.add_argument('-ud', '--num_duts', dest='num_duts',
                        default=dict(DEFAULT_NUM_DUTS),
                        type=ast.literal_eval,
                        help="Python dict literal that specifies the required"
                             " number of DUTs for each board. E.g {'gandof':4}")
    parser.add_argument('-c', '--continue_on_failure', action='store_true',
                        dest='continue_on_failure',
                        help='All tests continue to run when there is failure')
    parser.add_argument('-sl', '--service_respawn_limit', type=int,
                        default=DEFAULT_SERVICE_RESPAWN_LIMIT,
                        help='If a service crashes more than this, the test '
                             'push is considered failed.')

    arguments = parser.parse_args(argv[1:])

    # Get latest stable build as default build.
    version_map = AFE.get_stable_version_map(AFE.CROS_IMAGE_TYPE)
    if not arguments.build:
        arguments.build = version_map.get_image_name(arguments.board)
    if not arguments.shard_build:
        arguments.shard_build = version_map.get_image_name(
                arguments.shard_board)
    return arguments
251
252
def do_run_suite(suite_name, arguments, use_shard=False,
                 create_and_return=False):
    """Call run_suite to run a suite job, and return the suite job id.

    The script waits the suite job to finish before returning the suite job id.
    Also it will echo the run_suite output to stdout and record it, together
    with the suite job id, in the module-level shared lists.

    @param suite_name: Name of a suite, e.g., dummy.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.

    @return: Suite job ID.
    @raise TestPushException: if no DUT is available for the repair workflow
                              test, if the suite job id cannot be found in
                              the run_suite output, or if an asynchronous
                              suite does not finish within
                              arguments.timeout_min minutes.

    """
    if use_shard:
        board = arguments.shard_board
        build = arguments.shard_build
    else:
        board = arguments.board
        build = arguments.build

    # Remove cros-version label to force provision.
    hosts = AFE.get_hosts(label=constants.Labels.BOARD_PREFIX + board,
                          locked=False)
    # Remember the last host explicitly instead of relying on the loop
    # variable leaking out of the loop, which is undefined when no host
    # matches.
    repair_target = None
    for host in hosts:
        labels_to_remove = [
                l for l in host.labels
                if l.startswith(provision.CROS_VERSION_PREFIX)]
        if labels_to_remove:
            AFE.run('host_remove_labels', id=host.id, labels=labels_to_remove)
        repair_target = host

    # Test repair work flow on shards, powerwash test will timeout after 7m.
    if use_shard and not create_and_return:
        if repair_target is None:
            raise TestPushException(
                    'No unlocked DUT of board %s is available to test the '
                    'repair workflow.' % board)
        powerwash_dut_to_test_repair(repair_target.hostname, timeout=420)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    cmd = [os.path.join(current_dir, RUN_SUITE_COMMAND),
           '-s', suite_name,
           '-b', board,
           '-i', build,
           '-p', arguments.pool,
           '--minimum_duts', str(arguments.num_duts[board])]
    if create_and_return:
        cmd += ['-c']

    suite_job_id = None

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    while True:
        line = proc.stdout.readline()
        if not line:
            # An empty read means the pipe reached EOF. Block until run_suite
            # has exited instead of spinning and echoing empty lines.
            proc.wait()
            break
        print(line.rstrip())
        _run_suite_output.append(line.rstrip())

        if not suite_job_id:
            m = re.match(SUITE_JOB_START_INFO_REGEX, line)
            if m and m.group(1):
                suite_job_id = int(m.group(1))
                _all_suite_ids.append(suite_job_id)

    if not suite_job_id:
        raise TestPushException('Failed to retrieve suite job ID.')

    # With create_and_return, run_suite returns right after creating the
    # suite, so poll the AFE until the suite job is finished.
    if create_and_return:
        end = time.time() + arguments.timeout_min * 60
        while not AFE.get_jobs(id=suite_job_id, finished=True):
            if time.time() < end:
                time.sleep(10)
            else:
                AFE.run('abort_host_queue_entries', job=suite_job_id)
                raise TestPushException(
                        'Asynchronous suite triggered by create_and_return '
                        'flag has timed out after %d mins. Aborting it.' %
                        arguments.timeout_min)

    print('Suite job %s is completed.' % suite_job_id)
    return suite_job_id
338
339
def check_dut_image(build, suite_job_id):
    """Check that every DUT used by the suite carries the expected build.

    @param build: Expected build to be imaged.
    @param suite_job_id: job ID of the suite job.
    @raise TestPushException: If a DUT does not have expected build imaged.
    """
    print('Checking image installed in DUTs...')
    child_job_ids = [
            job.id
            for job in models.Job.objects.filter(parent_job_id=suite_job_id)]

    # Only the first host queue entry of each child job is inspected.
    hostnames = set()
    for child_job_id in child_job_ids:
        first_hqe = models.HostQueueEntry.objects.filter(
                job_id=child_job_id)[0]
        hostnames.add(first_hqe.host.hostname)

    for hostname in hostnames:
        found_build = site_utils.get_build_from_afe(hostname, AFE)
        if found_build == build:
            continue
        raise TestPushException('DUT is not imaged properly. Host %s has '
                                'build %s, while build %s is expected.' %
                                (hostname, found_build, build))
Dan Shia8da7602014-05-09 15:18:15 -0700359
360
def test_suite(suite_name, expected_results, arguments, use_shard=False,
               create_and_return=False):
    """Run a suite through run_suite and check its outcome.

    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    """
    suite_job_id = do_run_suite(suite_name, arguments, use_shard=use_shard,
                                create_and_return=create_and_return)

    # Confirm all DUTs used for the suite are imaged to expected build.
    # hqe.host_id for jobs running in shard is not synced back to master db,
    # therefore, skip verifying dut build for jobs running in shard.
    expected_build = arguments.build
    if not use_shard:
        check_dut_image(expected_build, suite_job_id)

    # Verify test results are the expected results.
    verify_test_results(suite_job_id, expected_results)
384
385
def verify_test_results(job_id, expected_results):
    """Verify the test results with the expected results.

    Three kinds of problems are collected: tests whose status differs from
    the expected one, tests that match no expected-result key, and
    expected-result keys that matched no test (minus IGNORE_MISSING_TESTS).
    The link to the job log is also fetched to make sure it can be loaded.

    @param job_id: id of the running jobs. For suite job, it is suite_job_id.
    @param expected_results: A dictionary of test name to test result. The
                             keys are regular expressions that are searched
                             for in the test names.
    @raise TestPushException: If verify fails.
    """
    print('Comparing test results...')
    test_views = site_utils.get_test_views_from_tko(job_id, TKO)

    mismatch_errors = []
    extra_test_errors = []

    found_keys = set()
    for test_name, test_status in test_views.items():
        print('%s%s' % (test_name.ljust(30), test_status))
        # platform_InstallTestImage test may exist in old builds.
        if re.search('platform_InstallTestImage_SERVER_JOB$', test_name):
            continue
        test_found = False
        for key, val in expected_results.items():
            if re.search(key, test_name):
                test_found = True
                found_keys.add(key)
                if val != test_status:
                    error = ('%s Expected: [%s], Actual: [%s]' %
                             (test_name, val, test_status))
                    mismatch_errors.append(error)
        if not test_found:
            extra_test_errors.append(test_name)

    # Expected keys that matched nothing, except those allowed to be missing.
    missing_test_errors = (set(expected_results) - found_keys
                           - set(IGNORE_MISSING_TESTS))

    summary = []
    if mismatch_errors:
        summary.append(('Results of %d test(s) do not match expected '
                        'values:') % len(mismatch_errors))
        summary.extend(mismatch_errors)
        summary.append('\n')

    if extra_test_errors:
        summary.append('%d test(s) are not expected to be run:' %
                       len(extra_test_errors))
        summary.extend(extra_test_errors)
        summary.append('\n')

    if missing_test_errors:
        summary.append('%d test(s) are missing from the results:' %
                       len(missing_test_errors))
        # Sorted so that the report does not depend on set iteration order.
        summary.extend(sorted(missing_test_errors))
        summary.append('\n')

    # Test link to log can be loaded.
    job_name = '%s-%s' % (job_id, getpass.getuser())
    log_link = URL_PATTERN % (rpc_client_lib.add_protocol(URL_HOST), job_name)
    try:
        response = urllib2.urlopen(log_link)
        try:
            response.read()
        finally:
            # Release the connection even when reading the page fails.
            response.close()
    except urllib2.URLError:
        summary.append('Failed to load page for link to log: %s.' % log_link)

    if summary:
        raise TestPushException('\n'.join(summary))
453
454
Dan Shief1a5c02015-04-07 17:37:09 -0700455def test_suite_wrapper(queue, suite_name, expected_results, arguments,
Richard Barnetteb12413a2018-04-25 01:00:27 +0000456 use_shard=False, create_and_return=False):
Dan Shief1a5c02015-04-07 17:37:09 -0700457 """Wrapper to call test_suite. Handle exception and pipe it to parent
458 process.
459
460 @param queue: Queue to save exception to be accessed by parent process.
461 @param suite_name: Name of a suite, e.g., dummy
462 @param expected_results: A dictionary of test name to test result.
463 @param arguments: Arguments for run_suite command.
464 @param use_shard: If true, suite is scheduled for shard board.
Shuqian Zhaod4864772015-08-06 09:46:22 -0700465 @param create_and_return: If True, run_suite just creates the suite, print
466 the job id, then finish immediately.
Dan Shief1a5c02015-04-07 17:37:09 -0700467 """
468 try:
Shuqian Zhaod4864772015-08-06 09:46:22 -0700469 test_suite(suite_name, expected_results, arguments, use_shard,
Richard Barnetteb12413a2018-04-25 01:00:27 +0000470 create_and_return)
Allen Li64edf062017-11-27 15:33:54 -0800471 except Exception:
Dan Shief1a5c02015-04-07 17:37:09 -0700472 # Store the whole exc_info leads to a PicklingError.
473 except_type, except_value, tb = sys.exc_info()
474 queue.put((except_type, except_value, traceback.extract_tb(tb)))
475
476
def check_queue(queue):
    """Re-raise the first exception waiting in the queue, if there is one.

    @param queue: Queue used to store exception for parent process to access.
                  Each item is indexable as (exception type, exception value,
                  extracted traceback).
    @raise: Any exception found in the queue.
    """
    if not queue.empty():
        failure = queue.get()
        exc_type = failure[0]
        exc_value = failure[1]
        # The traceback cannot be re-attached here, so it is printed before
        # a new exception of the same type is raised.
        print('Original stack trace of the exception:\n%s' % failure[2])
        raise exc_type(exc_value)
489
490
def get_head_of_repos(repos):
    """Get HEAD of updated repos, currently are autotest and chromite repos

    git is started directly in each repo directory (no shell, and without
    changing the working directory of this process).

    @param repos: a map of repo name to the path of the repo. E.g.
                  {'autotest': '/usr/local/autotest'}
    @return: a map of repo names to the current HEAD of that repo.
    """
    updated_repo_heads = {}
    for repo_name, path_to_repo in repos.items():
        head = subprocess.check_output(
                ['git', 'rev-parse', 'HEAD'],
                cwd=os.path.expanduser(path_to_repo))
        updated_repo_heads[repo_name] = head.strip()
    return updated_repo_heads
518
519
def push_prod_next_branch(updated_repo_heads):
    """Move the prod-next branch of every repo to its tested HEAD and push it.

    The push command must be ran as PUSH_USER, since only PUSH_USER has the
    right to push branches.

    @param updated_repo_heads: a map of repo names to tested HEAD of that repo.
    """
    # prod-next branch for every repo is downloaded under PUSH_USER home dir.
    repo_cmd_template = ('cd ~/{repo}; git pull; git rebase {hash} prod-next;'
                         'git push origin prod-next')
    # The {repo} and {hash} placeholders survive the %-formatting and are
    # filled in per repo below.
    sudo_cmd_template = "sudo su - %s -c '%s'" % (PUSH_USER,
                                                  repo_cmd_template)

    for repo_name, test_hash in updated_repo_heads.items():
        push_cmd = sudo_cmd_template.format(repo=repo_name, hash=test_hash)
        print('Pushing %s prod-next branch to %s' % (repo_name, test_hash))
        push_output = subprocess.check_output(
                push_cmd, stderr=subprocess.STDOUT, shell=True)
        print(push_output)
538
539
def _run_test_suites(arguments):
    """Run the push_to_prod and dummy suites in parallel child processes.

    Exceptions raised in the children arrive through a queue and are
    re-raised in this process by check_queue.

    @param arguments: command line arguments.
    """
    # Use daemon flag will kill child processes when parent process fails.
    kill_with_parent = not arguments.continue_on_failure
    failures = multiprocessing.Queue()

    sync_suite_proc = multiprocessing.Process(
            target=test_suite_wrapper,
            args=(failures, PUSH_TO_PROD_SUITE, EXPECTED_TEST_RESULTS,
                  arguments))
    sync_suite_proc.daemon = kill_with_parent
    sync_suite_proc.start()

    # suite test with --create_and_return flag; the two trailing True values
    # are use_shard and create_and_return.
    async_suite_proc = multiprocessing.Process(
            target=test_suite_wrapper,
            args=(failures, DUMMY_SUITE, EXPECTED_TEST_RESULTS_DUMMY,
                  arguments, True, True))
    async_suite_proc.daemon = True
    async_suite_proc.start()

    # Poll for failures every 5 seconds while either suite is still running.
    while sync_suite_proc.is_alive() or async_suite_proc.is_alive():
        check_queue(failures)
        time.sleep(5)
    # A failure may have been queued just before the last child exited.
    check_queue(failures)
    sync_suite_proc.join()
    async_suite_proc.join()
566 asynchronous_suite.join()
567
568
def check_service_crash(respawn_limit, start_time):
    """Check whether scheduler or host_scheduler crash during testing.

    Since the testing push is kicked off at the beginning of a given hour, the
    way to check whether a service is crashed is to check whether the times of
    the service being respawn during testing push is over the respawn_limit.
    A respawn is counted for every `<service>.log.<timestamp>` file in the
    autotest logs directory whose timestamp lies between start_time and now.

    @param respawn_limit: The maximum number of times the service is allowed to
                          be respawn.
    @param start_time: The time that testing push is kicked off.
    @raise TestPushException: if any service was respawned more than
                              respawn_limit times.
    """
    def _parse(filename_prefix, filename):
        """Helper method to parse the time of the log.

        @param filename_prefix: The prefix of the filename.
        @param filename: The name of the log file.
        @return: The datetime encoded in the filename, or None when the part
                 after the prefix is not a timestamp.
        """
        try:
            return datetime.datetime.strptime(filename[len(filename_prefix):],
                                              "%Y-%m-%d-%H.%M.%S")
        except ValueError:
            # A file that merely shares the prefix (any suffix other than a
            # bare timestamp) must not abort the whole check.
            return None

    services = ['scheduler', 'host_scheduler']
    logs = os.listdir('%s/logs/' % AUTOTEST_DIR)
    curr_time = datetime.datetime.now()

    errors = []
    for service in services:
        log_prefix = '%s.log.' % service
        respawn_count = 0
        for log_name in logs:
            if not log_name.startswith(log_prefix):
                continue
            log_time = _parse(log_prefix, log_name)
            if log_time is not None and start_time <= log_time <= curr_time:
                respawn_count += 1

        if respawn_count > respawn_limit:
            errors.append(
                    '%s has been respawned %s times during testing push at %s. '
                    'It is very likely crashed. Please check!\n' %
                    (service, respawn_count,
                     start_time.strftime("%Y-%m-%d-%H")))
    if errors:
        raise TestPushException(''.join(errors))
606
607
def _promote_prod_next_refs():
    """Push prod-next of every repo in UPDATED_REPOS to its current HEAD.

    @return: a map of repo names to the HEAD that was pushed.
    """
    tested_heads = get_head_of_repos(UPDATED_REPOS)
    push_prod_next_branch(tested_heads)
    return tested_heads
613
614
615_SUCCESS_MSG = """
616All tests completed successfully, the prod branch of the following repos is
617ready to be pushed to the hash list below.
618
619%(updated_repos_msg)s
620
621Instructions for pushing to prod are available at
622https://goto.google.com/autotest-to-prod
623"""
624
625
def _main(arguments):
    """Run the push tests and, if they all pass, promote the repo branches.

    All DUTs are reverified again at the end, whether the tests passed or not.

    @param arguments: command line arguments.
    """
    # These are globals used by other functions in this module to communicate
    # back from worker processes.
    global _run_suite_output, _all_suite_ids

    # TODO Use chromite.lib.parallel.Manager instead, to workaround the
    # too-long-tmp-path problem.
    mpmanager = multiprocessing.Manager()
    _run_suite_output = mpmanager.list()
    _all_suite_ids = mpmanager.list()

    try:
        start_time = datetime.datetime.now()
        reverify_all_push_duts()
        time.sleep(15)  # Wait for the verify test to start.
        check_dut_inventory(arguments.num_duts, arguments.pool)
        _run_test_suites(arguments)
        check_service_crash(arguments.service_respawn_limit, start_time)
        updated_repo_heads = _promote_prod_next_refs()
        updated_repos_msg = '\n'.join(
                '%s: %s' % (repo, head)
                for repo, head in updated_repo_heads.items())
        print(_SUCCESS_MSG % {'updated_repos_msg': updated_repos_msg})
    except Exception:
        if arguments.continue_on_failure:
            raise
        # Abort running jobs when choose not to continue when there is failure.
        for suite_id in _all_suite_ids:
            if AFE.get_jobs(id=suite_id, finished=False):
                AFE.run('abort_host_queue_entries', job=suite_id)
        raise
    finally:
        # Reverify all the hosts
        reverify_all_push_duts()
Dan Shi7e04fa82013-07-25 15:08:48 -0700663
Dan Shi7e04fa82013-07-25 15:08:48 -0700664
def main():
    """Entry point: parse the command line, run the push test, report it.

    The 'completed' counter is incremented whether the push test passed or
    raised; its `success` field tells the two cases apart.
    """
    arguments = parse_arguments(sys.argv)
    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
            service_name='test_push', indirect=True)
    with ts_mon_state:
        succeeded = False
        try:
            _main(arguments)
            succeeded = True
        finally:
            completed = metrics.Counter(
                    'chromeos/autotest/test_push/completed')
            completed.increment(fields={'success': succeeded})


if __name__ == '__main__':
    main()