blob: c1c91e04967ec676ffd9b332d6e16006f97deb0c [file] [log] [blame]
# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Module for swarming execution."""
6
7from __future__ import absolute_import
8from __future__ import division
9from __future__ import print_function
10
Xixuan Wu98e5de32018-05-29 17:23:16 -070011import collections
Xixuan Wucb469512018-06-08 15:17:23 -070012import json
Xixuan Wu53d15712018-06-12 10:52:55 -070013import logging
Xixuan Wu89300762018-07-13 14:58:46 -070014import operator
Xixuan Wu0bea9522018-05-08 17:49:19 -070015import os
Xixuan Wucb469512018-06-08 15:17:23 -070016import urllib
Allen Li8b2beda2018-09-04 17:16:14 -070017import uuid
Xixuan Wucb469512018-06-08 15:17:23 -070018
19from lucifer import autotest
Xixuan Wu0bea9522018-05-08 17:49:19 -070020
21
# Service account json handed to the swarming client for authentication.
SERVICE_ACCOUNT = '/creds/skylab_swarming_bot/skylab_bot_service_account.json'

# Names of the swarming pools used by skylab.
SKYLAB_DRONE_POOL = 'ChromeOSSkylab'
SKYLAB_SUITE_POOL = 'ChromeOSSkylab-suite'
Xixuan Wucb469512018-06-08 15:17:23 -070025
# State strings used for swarming tasks. The two 'COMPLETED (...)' values
# refine TASK_COMPLETED with the task's success/failure outcome.
TASK_COMPLETED = 'COMPLETED'
TASK_COMPLETED_SUCCESS = 'COMPLETED (SUCCESS)'
TASK_COMPLETED_FAILURE = 'COMPLETED (FAILURE)'
TASK_EXPIRED = 'EXPIRED'
TASK_CANCELED = 'CANCELED'
TASK_TIMEDOUT = 'TIMED_OUT'
TASK_RUNNING = 'RUNNING'
TASK_PENDING = 'PENDING'
TASK_BOT_DIED = 'BOT_DIED'
TASK_NO_RESOURCE = 'NO_RESOURCE'
TASK_KILLED = 'KILLED'

# States that mark a task as finished.
TASK_FINISHED_STATUS = [
    TASK_COMPLETED,
    TASK_EXPIRED,
    TASK_CANCELED,
    TASK_TIMEDOUT,
    TASK_BOT_DIED,
    TASK_NO_RESOURCE,
    TASK_KILLED,
]

# The swarming task failure statuses to retry. TASK_CANCELED is not retried
# since a canceled task was aborted intentionally.
TASK_STATUS_TO_RETRY = [
    TASK_EXPIRED,
    TASK_TIMEDOUT,
    TASK_BOT_DIED,
    TASK_NO_RESOURCE,
]
Xixuan Wu0bea9522018-05-08 17:49:19 -070048
# Default durations, in seconds, used when building task requests.
DEFAULT_EXPIRATION_SECS = 10 * 60  # 10 minutes
DEFAULT_TIMEOUT_SECS = 60 * 60  # 1 hour
51
# A mapping of priorities for skylab hwtest tasks. In swarming, a lower
# number means a higher priority. Priorities lower than 48 are reserved for
# special tasks, and the upper bound for a priority is 255.
# Uses the same priorities mapping as chromite/lib/constants.py.
SKYLAB_HWTEST_PRIORITIES_MAP = {
    'Weekly': 230,
    'CTS': 215,
    'Daily': 200,
    'PostBuild': 170,
    'Default': 140,
    'Build': 110,
    'PFQ': 80,
    'CQ': 50,
    'Super': 49,
}

# (name, priority) pairs ordered by ascending priority number.
SORTED_SKYLAB_HWTEST_PRIORITY = sorted(
    SKYLAB_HWTEST_PRIORITIES_MAP.items(),
    key=lambda name_and_priority: name_and_priority[1])
70
# Maps a pool name to the corresponding swarming DUT pool label.
# TODO (xixuan): Use proto library or some future APIs instead of hardcoding.
SWARMING_DUT_POOL_MAP = {
    'cq': 'DUT_POOL_CQ',
    'bvt': 'DUT_POOL_BVT',
    'suites': 'DUT_POOL_SUITES',
    'cts': 'DUT_POOL_CTS',
    'arc-presubmit': 'DUT_POOL_CTS_PERBUILD',
}

SWARMING_DUT_READY_STATUS = 'ready'
Xixuan Wu98e5de32018-05-29 17:23:16 -070080
# The structure of a fallback swarming task request is:
#   NewTaskRequest:
#     ...
#     task_slices -> NewTaskSlice:
#       ...
#       properties -> TaskProperties
#         ...

# What a single slice runs and the timeouts applied to it.
TaskProperties = collections.namedtuple('TaskProperties', (
    'command',
    'dimensions',
    'execution_timeout_secs',
    'grace_period_secs',
    'io_timeout_secs',
))

# One slice of a request: its expiration and its TaskProperties.
NewTaskSlice = collections.namedtuple('NewTaskSlice', (
    'expiration_secs',
    'properties',
))

# The top-level request, holding a list of NewTaskSlice in task_slices.
NewTaskRequest = collections.namedtuple('NewTaskRequest', (
    'name',
    'parent_task_id',
    'priority',
    'tags',
    'user',
    'task_slices',
))
115
Xixuan Wu0bea9522018-05-08 17:49:19 -0700116
def _get_client():
    """Return the path of the swarming client script.

    The script is looked up in the chromite checkout under the current
    user's home directory.
    """
    home_dir = os.path.expanduser('~')
    rel_path = 'chromiumos/chromite/third_party/swarming.client/swarming.py'
    return os.path.join(home_dir, rel_path)
121
122
def get_basic_swarming_cmd(command):
    """Build the common prefix of a swarming client invocation.

    @param command: The swarming client subcommand to run, e.g. 'query'.

    @return a list of command args: the client script, the subcommand, the
            service account credentials and the swarming server.
    """
    cmd = [_get_client(), command]
    cmd += ['--auth-service-account-json', SERVICE_ACCOUNT]
    cmd += ['--swarming', get_swarming_server()]
    return cmd
127
128
def make_logdog_annotation_url():
    """Return a unique LogDog annotation URL.

    A fresh random uuid is embedded in the URL on every call.

    @return the annotation URL, or an empty string if the appropriate
            LogDog server cannot be determined.
    """
    server = get_logdog_server()
    if server:
        stream_id = uuid.uuid4().hex
        return ('logdog://%s/chromeos/skylab/%s/+/annotations'
                % (server, stream_id))
    return ''
140
141
def get_swarming_server():
    """Return the swarming server for the current environment.

    The value is read from the SWARMING_SERVER environment variable; None
    is returned when the variable is not set.
    """
    return os.getenv('SWARMING_SERVER')
Xixuan Wu9af95a22018-05-18 10:46:42 -0700145
146
def get_logdog_server():
    """Return the LogDog server for the current environment.

    The value is read from the LOGDOG_SERVER environment variable.

    @return the server, or an empty string if the variable is unset or
            empty.
    """
    server = os.getenv('LOGDOG_SERVER')
    return server if server else ''
154
155
def get_new_task_swarming_cmd():
    """Return a list of command args for creating a new task.

    The list is the basic 'post' swarming command followed by the
    'tasks/new' endpoint.
    """
    post_cmd = get_basic_swarming_cmd('post')
    return post_cmd + ['tasks/new']
159
160
def make_fallback_request_dict(cmds, slices_dimensions, slices_expiration_secs,
                               task_name, priority, tags, user,
                               parent_task_id='',
                               expiration_secs=DEFAULT_EXPIRATION_SECS,
                               grace_period_secs=DEFAULT_TIMEOUT_SECS,
                               execution_timeout_secs=DEFAULT_TIMEOUT_SECS,
                               io_timeout_secs=DEFAULT_TIMEOUT_SECS):
    """Form a json-compatible dict for fallback swarming call.

    One task slice is built per entry of |cmds|; the i-th slice uses the
    i-th entry of |slices_dimensions| and |slices_expiration_secs|. The three
    timeout arguments are shared by all slices.

    @param cmds: A list of cmd to run on swarming bots, one per slice.
    @param slices_dimensions: A list of dict to indicate each slice's
            dimensions. Must have the same length as |cmds|.
    @param slices_expiration_secs: A list of Integer to indicate each slice's
            expiration_secs. Must have the same length as |cmds|.
    @param task_name: The request's name.
    @param priority: The request's priority. An integer.
    @param tags: The request's tags, passed through unchanged.
    @param user: The request's user, passed through unchanged.
    @param parent_task_id: The id of the parent swarming task, passed
            through unchanged.
    @param expiration_secs: Unused. Each slice's expiration always comes from
            |slices_expiration_secs|; the argument is only kept so existing
            callers that pass it keep working.
    @param grace_period_secs: The seconds to wait after sending a task a
            SIGTERM before sending it a SIGKILL.
    @param execution_timeout_secs: The seconds to run before a task gets
            terminated.
    @param io_timeout_secs: The seconds to wait before a task is considered
            hung.

    @return a json-compatible dict, as a request for swarming call.

    @raises AssertionError: if the three per-slice lists differ in length.
    """
    # pylint: disable=unused-argument
    assert len(cmds) == len(slices_dimensions), (
            'cmds and slices_dimensions must have the same length')
    assert len(cmds) == len(slices_expiration_secs), (
            'cmds and slices_expiration_secs must have the same length')

    task_slices = []
    # The loop variable is deliberately NOT named expiration_secs, so that it
    # does not shadow the (unused) keyword argument of that name.
    for cmd, dimensions, slice_expiration_secs in zip(
            cmds, slices_dimensions, slices_expiration_secs):
        properties = TaskProperties(
                command=cmd,
                dimensions=dimensions,
                execution_timeout_secs=execution_timeout_secs,
                grace_period_secs=grace_period_secs,
                io_timeout_secs=io_timeout_secs)
        task_slices.append(
                NewTaskSlice(
                        expiration_secs=slice_expiration_secs,
                        properties=properties))

    task_request = NewTaskRequest(
            name=task_name,
            parent_task_id=parent_task_id,
            priority=priority,
            tags=tags,
            user=user,
            task_slices=task_slices)

    return _to_raw_request(task_request)
211
212
def _namedtuple_to_dict(value):
    """Recursively converts a namedtuple to a dict.

    Fields that are themselves namedtuples are converted recursively. Fields
    that are lists or tuples become lists, with any namedtuple elements
    converted recursively; other elements (including nested plain lists) are
    kept as they are.

    Args:
        value: a namedtuple object.

    Returns:
        A new dict object with the same value.
    """
    out = {}
    # .items() (rather than the Python 2 only .iteritems()) works on both
    # Python 2 and Python 3.
    for k, v in value._asdict().items():
        # A namedtuple is also a tuple, so it must be checked for first.
        if hasattr(v, '_asdict'):
            out[k] = _namedtuple_to_dict(v)
        elif isinstance(v, (list, tuple)):
            out[k] = [
                _namedtuple_to_dict(elem) if hasattr(elem, '_asdict') else elem
                for elem in v
            ]
        else:
            out[k] = v

    return out
236
237
def _to_raw_request(request):
    """Returns the json-compatible dict expected by the server.

    The request is converted to nested dicts, then each slice's
    'dimensions' dict is replaced by a list of {'key': k, 'value': v} dicts
    sorted by key.

    Args:
        request: a NewTaskRequest object.

    Returns:
        A json-compatible dict, which could be parsed by swarming proxy
        service.
    """
    out = _namedtuple_to_dict(request)
    for task_slice in out['task_slices']:
        properties = task_slice['properties']
        # .items() (rather than the Python 2 only .iteritems()) works on both
        # Python 2 and Python 3.
        properties['dimensions'] = sorted(
                ({'key': k, 'value': v}
                 for k, v in properties['dimensions'].items()),
                key=lambda dimension: dimension['key'])
    return out
256
257
def get_task_link(task_id):
    """Return the URL of a swarming task's page on the swarming server.

    @param task_id: A string swarming task id.

    @return '<swarming server>/user/task/<task_id>'.
    """
    # Use the shared accessor so the server is looked up in a single place.
    return '%s/user/task/%s' % (get_swarming_server(), task_id)
Xixuan Wu9af95a22018-05-18 10:46:42 -0700260
261
def get_task_final_state(task):
    """Get the final state of a swarming task.

    A task in state TASK_COMPLETED is reported as TASK_COMPLETED_FAILURE or
    TASK_COMPLETED_SUCCESS depending on its 'failure' field; any other state
    is returned as is.

    @param task: the json output of a swarming task fetched by API tasks.list.

    @return the final state string.
    """
    current_state = task['state']
    if current_state != TASK_COMPLETED:
        return current_state

    if task['failure']:
        return TASK_COMPLETED_FAILURE
    return TASK_COMPLETED_SUCCESS
Xixuan Wu415e8212018-06-04 17:01:12 -0700273
274
def get_task_dut_name(task_dimensions):
    """Get the name of the DUT that ran this task.

    @param task_dimensions: a list of dict, e.g. [{'key': k, 'value': v}, ...]

    @return the first value of the first 'dut_name' dimension, or an empty
            string if there is no such dimension.
    """
    dut_names = (entry['value'][0] for entry in task_dimensions
                 if entry['key'] == 'dut_name')
    return next(dut_names, '')
Xixuan Wucb469512018-06-08 15:17:23 -0700285
286
def query_bots_count(dimensions):
    """Get bots count for given requirements.

    Runs the swarming client's 'query' command against the 'bots/count'
    endpoint, with one 'dimensions=<key>:<value>' condition per entry of
    |dimensions|.

    @param dimensions: A dict of dimensions for swarming bots.

    @return a dict, which contains counts for different status of bots.
    """
    # urlencode lives in urllib.parse on Python 3 and in urllib on Python 2.
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode

    # .items() (rather than the Python 2 only .iteritems()) works on both
    # Python 2 and Python 3.
    conditions = [('dimensions', '%s:%s' % (k, v))
                  for k, v in dimensions.items()]
    swarming_cmd = get_basic_swarming_cmd('query') + [
            'bots/count?%s' % urlencode(conditions)]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    result = cros_build_lib.RunCommand(swarming_cmd, capture_output=True)
    return json.loads(result.output)
302
303
def get_idle_bots_count(outputs):
    """Get the idle bots count.

    The idle count is the total count minus the busy, dead and quarantined
    counts. Each count is converted with int() before use.

    @param outputs: The outputs of |query_bots_count|.

    @return the number of idle bots, as an integer.
    """
    total = int(outputs['count'])
    unavailable = sum(int(outputs[status])
                      for status in ('busy', 'dead', 'quarantined'))
    return total - unavailable
Xixuan Wu53d15712018-06-12 10:52:55 -0700311
312
def query_task_by_tags(tags):
    """Get tasks for given tags.

    Runs the swarming client's 'query' command against the 'tasks/list'
    endpoint, with one 'tags=<key>:<value>' condition per entry of |tags|.

    @param tags: A dict of tags for swarming tasks.

    @return a list, which contains all tasks queried by the given tags. The
            list is empty if the response has no 'items' entry.
    """
    # urlencode lives in urllib.parse on Python 3 and in urllib on Python 2.
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode

    # .items() (rather than the Python 2 only .iteritems()) works on both
    # Python 2 and Python 3.
    conditions = [('tags', '%s:%s' % (k, v)) for k, v in tags.items()]
    swarming_cmd = get_basic_swarming_cmd('query') + [
            'tasks/list?%s' % urlencode(conditions)]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    result = cros_build_lib.RunCommand(swarming_cmd, capture_output=True)
    json_output = json.loads(result.output)
    return json_output.get('items', [])
Xixuan Wu53d15712018-06-12 10:52:55 -0700328
329
def query_task_by_id(task_id):
    """Get task for given id.

    Runs the swarming client's 'query' command against the
    'task/<task_id>/result' endpoint and parses its output as json.

    @param task_id: A string to indicate a swarming task id.

    @return a dict, which contains the task with the given task_id.
    """
    query_cmd = get_basic_swarming_cmd('query')
    query_cmd = query_cmd + ['task/%s/result' % task_id]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    cmd_result = cros_build_lib.RunCommand(query_cmd, capture_output=True)
    task = json.loads(cmd_result.output)
    return task
342
343
def abort_task(task_id):
    """Abort a swarming task by its id.

    Runs the swarming client's 'cancel' command with '--kill-running'. A
    failure of that command is logged and otherwise ignored.

    @param task_id: A string swarming task id.
    """
    cancel_cmd = get_basic_swarming_cmd('cancel')
    cancel_cmd = cancel_cmd + ['--kill-running', task_id]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    try:
        cros_build_lib.RunCommand(cancel_cmd, log_output=True)
    except cros_build_lib.RunCommandError:
        # Best effort: a failing cancel is not treated as an error here.
        logging.error('Task %s probably already gone, skip canceling it.',
                      task_id)
Xixuan Wu0c01b092018-06-13 14:12:55 -0700357
358
def query_bots_list(dimensions):
    """Get bots list for given requirements.

    Runs the swarming client's 'query' command against the 'bots/list'
    endpoint, with one 'dimensions=<key>:<value>' condition per entry of
    |dimensions|.

    @param dimensions: A dict of dimensions for swarming bots.

    @return a list of bot dicts. The list is empty if the response has no
            'items' entry.
    """
    # urlencode lives in urllib.parse on Python 3 and in urllib on Python 2.
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode

    # .items() (rather than the Python 2 only .iteritems()) works on both
    # Python 2 and Python 3.
    conditions = [('dimensions', '%s:%s' % (k, v))
                  for k, v in dimensions.items()]
    swarming_cmd = get_basic_swarming_cmd('query') + [
            'bots/list?%s' % urlencode(conditions)]
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    result = cros_build_lib.RunCommand(swarming_cmd, capture_output=True)
    # Same handling as query_task_by_tags: a response without 'items' means
    # no results rather than an error.
    return json.loads(result.output).get('items', [])
374
375
def bot_available(bot):
    """Check whether a bot is available.

    A bot counts as available when neither its 'is_dead' nor its
    'quarantined' field is set.

    @param bot: A dict describes a bot's dimensions, i.e. an element in return
                list of |query_bots_list|.

    @return True if a bot is available to run task, otherwise False.
    """
    return not bot['is_dead'] and not bot['quarantined']
Xixuan Wufba17192018-08-27 13:31:32 -0700385
386
def get_child_tasks(parent_task_id):
    """Get the child tasks based on a parent swarming task id.

    Runs the swarming client's 'query' command against 'tasks/list',
    filtering on the 'parent_task_id' tag. The command runs inside
    timeout_util.Timeout(60).

    @param parent_task_id: The parent swarming task id.

    @return a list of dicts, each dict refers to the whole stats of a task,
        keys include 'name', 'bot_dimensions', 'tags', 'bot_id', 'state', etc.
        The list is empty if the response has no 'items' entry.
    """
    swarming_cmd = get_basic_swarming_cmd('query')
    swarming_cmd += ['tasks/list?tags=parent_task_id:%s' % parent_task_id]
    timeout_util = autotest.chromite_load('timeout_util')
    cros_build_lib = autotest.chromite_load('cros_build_lib')
    with timeout_util.Timeout(60):
        child_tasks = cros_build_lib.RunCommand(
                swarming_cmd, capture_output=True)
        # Same handling as query_task_by_tags: a response without 'items'
        # means no child tasks rather than an error.
        return json.loads(child_tasks.output).get('items', [])
402 return json.loads(child_tasks.output)['items']