blob: cfc35a063b2789d3a33ebae5f082d2a846bb36c3 [file] [log] [blame]
Xixuan Wu0bea9522018-05-08 17:49:19 -07001# Copyright 2018 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Module for swarming execution."""
6
7from __future__ import absolute_import
8from __future__ import division
9from __future__ import print_function
10
Xixuan Wu98e5de32018-05-29 17:23:16 -070011import collections
Xixuan Wucb469512018-06-08 15:17:23 -070012import json
Xixuan Wu53d15712018-06-12 10:52:55 -070013import logging
Xixuan Wu89300762018-07-13 14:58:46 -070014import operator
Xixuan Wu0bea9522018-05-08 17:49:19 -070015import os
Xixuan Wucb469512018-06-08 15:17:23 -070016import urllib
Allen Li8b2beda2018-09-04 17:16:14 -070017import uuid
Xixuan Wucb469512018-06-08 15:17:23 -070018
19from lucifer import autotest
Prathmesh Prabhu9a4a4cc2018-09-26 11:29:37 -070020from skylab_suite import errors
Xixuan Wu0bea9522018-05-08 17:49:19 -070021
22
# Service account json file handed to the swarming client through its
# --auth-service-account-json flag.
SERVICE_ACCOUNT = '/creds/skylab_swarming_bot/skylab_bot_service_account.json'

# Pool names; presumably the swarming pools for skylab drones and for suite
# tasks (not referenced elsewhere in this module -- confirm against callers).
SKYLAB_DRONE_POOL = 'ChromeOSSkylab'
SKYLAB_SUITE_POOL = 'ChromeOSSkylab-suite'
Xixuan Wucb469512018-06-08 15:17:23 -070026
# Values of a swarming task's 'state' field.
TASK_COMPLETED = 'COMPLETED'
# Refinements of TASK_COMPLETED, chosen from the task's 'failure' field.
TASK_COMPLETED_SUCCESS = 'COMPLETED (SUCCESS)'
TASK_COMPLETED_FAILURE = 'COMPLETED (FAILURE)'
TASK_EXPIRED = 'EXPIRED'
TASK_CANCELED = 'CANCELED'
TASK_TIMEDOUT = 'TIMED_OUT'
TASK_RUNNING = 'RUNNING'
TASK_PENDING = 'PENDING'
TASK_BOT_DIED = 'BOT_DIED'
TASK_NO_RESOURCE = 'NO_RESOURCE'
TASK_KILLED = 'KILLED'

# The states in which a task counts as finished.
TASK_FINISHED_STATUS = [
    TASK_COMPLETED,
    TASK_EXPIRED,
    TASK_CANCELED,
    TASK_TIMEDOUT,
    TASK_BOT_DIED,
    TASK_NO_RESOURCE,
    TASK_KILLED,
]

# The swarming task failure statuses to retry. TASK_CANCELED is left out on
# purpose: a canceled task was intentionally aborted.
TASK_STATUS_TO_RETRY = [
    TASK_EXPIRED,
    TASK_TIMEDOUT,
    TASK_BOT_DIED,
    TASK_NO_RESOURCE,
]
Xixuan Wu0bea9522018-05-08 17:49:19 -070049
# Default task expiration: 10 minutes, in seconds.
DEFAULT_EXPIRATION_SECS = 10 * 60
# Default for the grace period, execution and io timeouts: 1 hour, in seconds.
DEFAULT_TIMEOUT_SECS = 60 * 60
52
# Priorities for skylab hwtest tasks, keyed by priority name. In swarming, a
# lower number means a higher priority. Priorities lower than 48 are for
# special tasks, and the upper bound for priority is 255.
# This uses the same priorities mapping as chromite/lib/constants.py.
SKYLAB_HWTEST_PRIORITIES_MAP = {
    'Weekly': 230,
    'CTS': 215,
    'Daily': 200,
    'PostBuild': 170,
    'Default': 140,
    'Build': 110,
    'PFQ': 80,
    'CQ': 50,
    'Super': 49,
}

# (name, priority) pairs ordered by ascending priority number.
SORTED_SKYLAB_HWTEST_PRIORITY = sorted(
    SKYLAB_HWTEST_PRIORITIES_MAP.items(), key=operator.itemgetter(1))
71
# Suite pool label -> swarming pool label.
# TODO (xixuan): Use proto library or some future APIs instead of hardcoding.
SWARMING_DUT_POOL_MAP = {
    'arc-presubmit': 'DUT_POOL_CTS_PERBUILD',
    'bvt': 'DUT_POOL_BVT',
    'cq': 'DUT_POOL_CQ',
    'cts': 'DUT_POOL_CTS',
    'quota-metered': 'DUT_POOL_QUOTA_METERED',
    'suites': 'DUT_POOL_SUITES',
}

SWARMING_DUT_READY_STATUS = 'ready'
Xixuan Wu98e5de32018-05-29 17:23:16 -070082
# The structure of a fallback swarming task request is:
#   NewTaskRequest:
#     ...
#     task_slices -> NewTaskSlice:
#       ...
#       properties -> TaskProperties
#         ...
TaskProperties = collections.namedtuple(
    'TaskProperties',
    ['command',
     'dimensions',
     'execution_timeout_secs',
     'grace_period_secs',
     'io_timeout_secs'])

NewTaskSlice = collections.namedtuple(
    'NewTaskSlice',
    ['expiration_secs', 'properties'])

NewTaskRequest = collections.namedtuple(
    'NewTaskRequest',
    ['name',
     'parent_task_id',
     'priority',
     'tags',
     'user',
     'task_slices'])
117
Xixuan Wu0bea9522018-05-08 17:49:19 -0700118
def _get_client():
    """Return the path of the swarming client script under the user's home."""
    home_dir = os.path.expanduser('~')
    return os.path.join(
        home_dir,
        'chromiumos/chromite/third_party/swarming.client/swarming.py')
123
124
def to_swarming_pool_label(pool):
    """Translate a suite pool label into a swarming-recognized pool label.

    @param pool: The suite pool label.

    @return the mapped label from SWARMING_DUT_POOL_MAP, or |pool| itself
            when the map has no entry for it.
    """
    try:
        return SWARMING_DUT_POOL_MAP[pool]
    except KeyError:
        return pool
128
129
def get_basic_swarming_cmd(command):
    """Build the common prefix of a swarming client invocation.

    @param command: The swarming client subcommand, e.g. 'query'.

    @return a list of command args: the client path, the subcommand, the
            service account flag and the swarming server flag.
    """
    swarming_cmd = [_get_client(), command]
    swarming_cmd += ['--auth-service-account-json', SERVICE_ACCOUNT]
    swarming_cmd += ['--swarming', get_swarming_server()]
    return swarming_cmd
134
135
def make_logdog_annotation_url():
    """Return a unique LogDog annotation URL.

    The URL embeds a fresh random uuid, so every call yields a different one.

    @return the annotation URL, or an empty string if get_logdog_server()
            returns an empty value.

    Any exception raised by get_logdog_server() is not caught here and
    propagates to the caller.
    """
    logdog_server = get_logdog_server()
    if not logdog_server:
        return ''
    return ('logdog://%s/chromeos/skylab/%s/+/annotations'
            % (logdog_server, uuid.uuid4().hex))
147
148
def get_swarming_server():
    """Return the swarming server for the current environment.

    @return the value of the SWARMING_SERVER environment variable.

    @raises errors.DroneEnvironmentError if SWARMING_SERVER is not set.
    """
    try:
        server = os.environ['SWARMING_SERVER']
    except KeyError:
        raise errors.DroneEnvironmentError(
            'SWARMING_SERVER environment variable not set'
        )
    return server
Xixuan Wu9af95a22018-05-18 10:46:42 -0700157
158
def get_logdog_server():
    """Return the LogDog server for the current environment.

    @return the value of the LOGDOG_SERVER environment variable. This is an
            empty string only when the variable is set to an empty value.

    @raises errors.DroneEnvironmentError if LOGDOG_SERVER is not set.
    """
    try:
        return os.environ['LOGDOG_SERVER']
    except KeyError:
        raise errors.DroneEnvironmentError(
            'LOGDOG_SERVER environment variable not set'
        )
Prathmesh Prabhuacf41f02018-09-26 12:06:12 -0700171
172
def get_new_task_swarming_cmd():
    """Return a list of command args for creating a new task.

    The list is the basic 'post' command followed by the 'tasks/new' endpoint.
    """
    post_cmd = get_basic_swarming_cmd('post')
    return post_cmd + ['tasks/new']
176
177
def make_fallback_request_dict(cmds, slices_dimensions, slices_expiration_secs,
                               task_name, priority, tags, user,
                               parent_task_id='',
                               expiration_secs=DEFAULT_EXPIRATION_SECS,
                               grace_period_secs=DEFAULT_TIMEOUT_SECS,
                               execution_timeout_secs=DEFAULT_TIMEOUT_SECS,
                               io_timeout_secs=DEFAULT_TIMEOUT_SECS):
    """Form a json-compatible dict for fallback swarming call.

    One task slice is built for each (cmd, dimensions, expiration) triple, in
    order, so |cmds|, |slices_dimensions| and |slices_expiration_secs| must
    all have the same length.

    @param cmds: A list of cmd to run on swarming bots, one per slice.
    @param slices_dimensions: A list of dicts giving each slice's dimensions.
    @param slices_expiration_secs: A list of integers giving each slice's
                                   expiration_secs.
    @param task_name: The request's name.
    @param priority: The request's priority. An integer.
    @param tags: The request's tags, stored in the request unchanged.
    @param user: The request's user, stored in the request unchanged.
    @param parent_task_id: The id of the parent task, '' by default.
    @param expiration_secs: Unused. Every slice takes its expiration from
                            |slices_expiration_secs|; the parameter is only
                            kept so existing callers keep working.
    @param grace_period_secs: The seconds to wait after sending a task a
                              SIGTERM before sending it a SIGKILL.
    @param execution_timeout_secs: The seconds to run before a task gets
                                   terminated.
    @param io_timeout_secs: The seconds to wait before a task is considered
                            hung.

    @return a json-compatible dict, as a request for swarming call.
    """
    assert len(cmds) == len(slices_dimensions)
    assert len(cmds) == len(slices_expiration_secs)

    # The loop variable gets its own name so that it no longer silently
    # shadows the |expiration_secs| parameter.
    task_slices = [
        NewTaskSlice(
            expiration_secs=slice_expiration_secs,
            properties=TaskProperties(
                command=cmd,
                dimensions=dimensions,
                execution_timeout_secs=execution_timeout_secs,
                grace_period_secs=grace_period_secs,
                io_timeout_secs=io_timeout_secs))
        for cmd, dimensions, slice_expiration_secs in zip(
            cmds, slices_dimensions, slices_expiration_secs)
    ]

    task_request = NewTaskRequest(
        name=task_name,
        parent_task_id=parent_task_id,
        priority=priority,
        tags=tags,
        user=user,
        task_slices=task_slices)

    return _to_raw_request(task_request)
228
229
def _namedtuple_to_dict(value):
    """Recursively convert a namedtuple to a plain dict.

    A field holding a namedtuple is converted recursively. A field holding a
    list or a tuple becomes a list in which every namedtuple element is
    converted and every other element is kept as-is; lists or tuples nested
    inside such a list are not descended into.

    @param value: a namedtuple object.

    @return a dict with the same field names and the converted values.
    """
    def _convert(item):
        # Anything exposing _asdict is treated as a namedtuple.
        if hasattr(item, '_asdict'):
            return _namedtuple_to_dict(item)
        return item

    # Build a fresh dict rather than reassigning entries of the dict that is
    # being iterated.
    out = {}
    for key, field in value._asdict().items():
        # A namedtuple is itself a tuple, so it must be ruled out before the
        # generic list/tuple handling.
        if isinstance(field, (list, tuple)) and not hasattr(field, '_asdict'):
            out[key] = [_convert(elem) for elem in field]
        else:
            out[key] = _convert(field)
    return out
253
254
def _to_raw_request(request):
    """Return the json-compatible dict expected by the server.

    The request is converted to nested dicts, then every task slice's
    dimensions dict is replaced by a list of {'key': ..., 'value': ...}
    dicts sorted by key.

    @param request: a NewTaskRequest object.

    @return a json-compatible dict, which could be parsed by swarming proxy
            service.
    """
    raw_request = _namedtuple_to_dict(request)
    for task_slice in raw_request['task_slices']:
        properties = task_slice['properties']
        pairs = [{'key': name, 'value': val}
                 for name, val in properties['dimensions'].iteritems()]
        properties['dimensions'] = sorted(
            pairs, key=operator.itemgetter('key'))
    return raw_request
273
274
def get_task_link(task_id):
    """Return the swarming server's task page URL for a task.

    The server is read from the SWARMING_SERVER environment variable; when
    the variable is not set, the link starts with the literal 'None'.

    @param task_id: A string swarming task id.
    """
    server = os.environ.get('SWARMING_SERVER')
    return '%s/user/task/%s' % (server, task_id)
Xixuan Wu9af95a22018-05-18 10:46:42 -0700277
278
def get_task_final_state(task):
    """Get the final state of a swarming task.

    A TASK_COMPLETED state is refined into TASK_COMPLETED_FAILURE or
    TASK_COMPLETED_SUCCESS according to the task's 'failure' field. Every
    other state is returned unchanged.

    @param task: the json output of a swarming task fetched by API tasks.list.

    @return the final state, a string.
    """
    state = task['state']
    if state != TASK_COMPLETED:
        return state
    if task['failure']:
        return TASK_COMPLETED_FAILURE
    return TASK_COMPLETED_SUCCESS
Xixuan Wu415e8212018-06-04 17:01:12 -0700290
291
def get_task_dut_name(task_dimensions):
    """Get the name of the DUT that runs this task.

    @param task_dimensions: a list of dict, e.g. [{'key': k, 'value': v}, ...]

    @return the first value of the first 'dut_name' dimension, or '' when
            there is no such dimension.
    """
    dut_names = (dimension['value'][0]
                 for dimension in task_dimensions
                 if dimension['key'] == 'dut_name')
    return next(dut_names, '')
Xixuan Wucb469512018-06-08 15:17:23 -0700302
303
def query_bots_count(dimensions):
    """Get bots count for given requirements.

    Runs the swarming client's 'query' command against the bots/count
    endpoint, with one 'dimensions' condition per entry of |dimensions|.

    @param dimensions: A dict of dimensions for swarming bots.

    @return a dict, which contains counts for different status of bots.
    """
    query_cmd = get_basic_swarming_cmd('query')
    conditions = urllib.urlencode(
        [('dimensions', '%s:%s' % (key, value))
         for key, value in dimensions.iteritems()])
    swarming_cmd = query_cmd + ['bots/count?%s' % conditions]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    result = cros_build_lib.RunCommand(swarming_cmd, capture_output=True)
    return json.loads(result.output)
319
320
def get_idle_bots_count(outputs):
    """Get the idle bots count.

    The idle count is the total count minus the busy, dead and quarantined
    counts.

    @param outputs: The outputs of |query_bots_count|.

    @return the idle bots count, an integer.
    """
    total = int(outputs['count'])
    unavailable = sum(int(outputs[status])
                      for status in ('busy', 'dead', 'quarantined'))
    return total - unavailable
Xixuan Wu53d15712018-06-12 10:52:55 -0700328
329
def query_task_by_tags(tags):
    """Get tasks for given tags.

    Runs the swarming client's 'query' command against the tasks/list
    endpoint, with one 'tags' condition per entry of |tags|.

    @param tags: A dict of tags for swarming tasks.

    @return a list, which contains all tasks queried by the given tags. The
            list is empty when the response has no 'items'.
    """
    query_cmd = get_basic_swarming_cmd('query')
    conditions = urllib.urlencode(
        [('tags', '%s:%s' % (key, value))
         for key, value in tags.iteritems()])
    swarming_cmd = query_cmd + ['tasks/list?%s' % conditions]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    result = cros_build_lib.RunCommand(swarming_cmd, capture_output=True)
    response = json.loads(result.output)
    return response.get('items', [])
Xixuan Wu53d15712018-06-12 10:52:55 -0700345
346
def query_task_by_id(task_id):
    """Get task for given id.

    Runs the swarming client's 'query' command against the task's result
    endpoint and decodes its json output.

    @param task_id: A string to indicate a swarming task id.

    @return a dict, which contains the task with the given task_id.
    """
    query_cmd = get_basic_swarming_cmd('query')
    result_endpoint = 'task/%s/result' % task_id
    swarming_cmd = query_cmd + [result_endpoint]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    result = cros_build_lib.RunCommand(swarming_cmd, capture_output=True)
    return json.loads(result.output)
359
360
def abort_task(task_id):
    """Abort a swarming task by its id.

    Runs the swarming client's 'cancel' command with --kill-running. A
    failing command is logged as an error and otherwise ignored.

    @param task_id: A string swarming task id.
    """
    cancel_cmd = get_basic_swarming_cmd('cancel')
    swarming_cmd = cancel_cmd + ['--kill-running', task_id]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    try:
        cros_build_lib.RunCommand(swarming_cmd, log_output=True)
    except cros_build_lib.RunCommandError:
        logging.error('Task %s probably already gone, skip canceling it.',
                      task_id)
Xixuan Wu0c01b092018-06-13 14:12:55 -0700374
375
def query_bots_list(dimensions):
    """Get bots list for given requirements.

    Runs the swarming client's 'query' command against the bots/list
    endpoint, with one 'dimensions' condition per entry of |dimensions|.

    @param dimensions: A dict of dimensions for swarming bots.

    @return a list of bot dicts. The list is empty when the response has no
            'items'.
    """
    query_cmd = get_basic_swarming_cmd('query')
    conditions = urllib.urlencode(
        [('dimensions', '%s:%s' % (key, value))
         for key, value in dimensions.iteritems()])
    swarming_cmd = query_cmd + ['bots/list?%s' % conditions]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    result = cros_build_lib.RunCommand(swarming_cmd, capture_output=True)
    response = json.loads(result.output)
    return response.get('items', [])
Xixuan Wu0c01b092018-06-13 14:12:55 -0700391
392
def bot_available(bot):
    """Check whether a bot is available.

    @param bot: A dict describing a bot, i.e. an element in the return list
                of |query_bots_list|. Its 'is_dead' and 'quarantined'
                entries are read.

    @return True if the bot is neither dead nor quarantined, otherwise False.
    """
    return not bot['is_dead'] and not bot['quarantined']
Xixuan Wufba17192018-08-27 13:31:32 -0700402
403
def get_child_tasks(parent_task_id):
    """Get the child tasks based on a parent swarming task id.

    Queries the tasks/list endpoint for tasks tagged with the given
    parent_task_id.

    @param parent_task_id: The parent swarming task id.

    @return a list of dicts, each dict refers to the whole stats of a task,
            keys include 'name', 'bot_dimensions', 'tags', 'bot_id', 'state',
            etc. The list is empty when the response has no 'items'.
    """
    swarming_cmd = get_basic_swarming_cmd('query')
    swarming_cmd += ['tasks/list?tags=parent_task_id:%s' % parent_task_id]
    timeout_util = autotest.chromite_load('timeout_util')
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    # The query runs under a 60-second timeout_util.Timeout.
    with timeout_util.Timeout(60):
        child_tasks = cros_build_lib.RunCommand(
            swarming_cmd, capture_output=True)
        # Fall back to an empty list when the response has no 'items' key,
        # the same way query_task_by_tags and query_bots_list do, instead of
        # raising KeyError.
        return json.loads(child_tasks.output).get('items', [])