blob: 7e523e004b74d786ff8b9a0cb5b9be3aa5e061e7 [file] [log] [blame]
Richard Barnetteffed1722016-05-18 15:57:22 -07001#pylint: disable=C0111
beeps5e2bb4a2013-10-28 11:26:45 -07002
3"""
4Prejob tasks.
5
6Prejob tasks _usually_ run before a job and verify the state of a machine.
7Cleanup and repair are exceptions, cleanup can run after a job too, while
8repair will run anytime the host needs a repair, which could be pre or post
9job. Most of the work specific to this module is achieved through the prolog
10and epilog of each task.
11
12All prejob tasks must have a host, though they may not have an HQE. If a
13prejob task has a hqe, it will activate the hqe through its on_pending
beepsec1c4b22013-11-18 08:26:39 -080014method on successful completion. A row in afe_special_tasks with values:
beeps5e2bb4a2013-10-28 11:26:45 -070015 host=C1, unlocked, is_active=0, is_complete=0, type=Verify
16will indicate to the scheduler that it needs to schedule a new special task
17of type=Verify, against the C1 host. While the special task is running
18the scheduler only monitors it through the Agent, and its is_active bit=1.
19Once a special task finishes, we set its is_active=0, is_complete=1 and
20success bits, so the scheduler ignores it.
HQE.on_pending:
    Host, HQE -> Pending, Starting
    This status is acted upon in the scheduler, to assign an AgentTask.
PreJobTask:
    epilog:
        failure:
            requeue hqe
            repair the host
Children PreJobTasks:
    prolog:
        set Host, HQE status
    epilog:
        success:
            on_pending
        failure:
            repair through PreJobTask
            set Host, HQE status
beepsec1c4b22013-11-18 08:26:39 -080038
Failing a prejob task affects both the Host and the HQE, as follows:
40
- Host: PreJob failure will result in a Repair job getting queued against
the host, if we haven't already tried repairing it more than the
max_repair_limit. When this happens, the host will remain in whatever status
the prejob task left it in, till the Repair job puts it into 'Repairing'. This
way the host_scheduler won't pick bad hosts and assign them to jobs.
46
47If we have already tried repairing the host too many times, the PreJobTask
48will flip the host to 'RepairFailed' in its epilog, and it will remain in this
49state till it is recovered and reverified.
50
51- HQE: Is either requeued or failed. Requeuing the HQE involves putting it
52in the Queued state and setting its host_id to None, so it gets a new host
53in the next scheduler tick. Failing the HQE results in either a Parsing
54or Archiving postjob task, and an eventual Failed status for the HQE.
beeps5e2bb4a2013-10-28 11:26:45 -070055"""
beepsec1c4b22013-11-18 08:26:39 -080056
beeps5e2bb4a2013-10-28 11:26:45 -070057import logging
Prathmesh Prabhu2c7471d2016-11-15 20:19:57 +000058import re
beeps5e2bb4a2013-10-28 11:26:45 -070059
60from autotest_lib.client.common_lib import host_protections
61from autotest_lib.frontend.afe import models
Prathmesh Prabhubcc5b7e2018-08-17 17:10:21 -070062from autotest_lib.scheduler import agent_task
63from autotest_lib.scheduler import drone_manager
64from autotest_lib.scheduler import scheduler_config
beeps5e2bb4a2013-10-28 11:26:45 -070065from autotest_lib.server import autoserv_utils
66from autotest_lib.server.cros import provision
67
68
class PreJobTask(agent_task.SpecialAgentTask):
    """Shared base for the Verify, Cleanup, Reset and Provision tasks.

    Supplies the common failure handling in `epilog` (requeue or fail the
    host queue entry, then queue a repair) and the `_should_pending` helper
    that subclasses use to decide whether to call the HQE's on_pending.
    """

    def epilog(self):
        """Record the task outcome and, on failure, arrange recovery.

        Hosts protected with DO_NOT_VERIFY are always treated as successful.
        On success the host's working state is recorded as True and nothing
        else happens here.

        On failure with an associated queue entry:
          * every incomplete special task for this queue entry and host is
            marked complete and unsuccessful;
          * the queue entry is failed if the number of provision tasks
            recorded for it exceeds max_provision_retries, otherwise it is
            requeued;
          * if the number of repair tasks recorded for this queue entry and
            host has reached max_repair_limit, the host is set to
            REPAIR_FAILED, the queue entry is failed, and no further repair
            is queued.

        In every other failure case a new REPAIR special task is created for
        the host (attached to the queue entry when there is one).
        """
        super(PreJobTask, self).epilog()

        if self.host.protection == host_protections.Protection.DO_NOT_VERIFY:
            # effectively ignore failure for these hosts
            self.success = True

        if self.success:
            self.host.record_working_state(True,
                                           self.task.time_finished)
            return

        if self.queue_entry:
            # If we requeue a HQE, we should cancel any remaining pre-job
            # tasks against this host, otherwise we'll be left in a state
            # where a queued HQE has special tasks to run against a host.
            models.SpecialTask.objects.filter(
                    queue_entry__id=self.queue_entry.id,
                    host__id=self.host.id,
                    is_complete=0).update(is_complete=1, success=0)

            # Count every provision task ever recorded for this HQE to decide
            # whether the retry budget has been exhausted.
            previous_provisions = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.PROVISION,
                    queue_entry_id=self.queue_entry.id).count()
            if (previous_provisions >
                scheduler_config.config.max_provision_retries):
                self._actually_fail_queue_entry()
                # This abort will mark the aborted bit on the HQE itself, to
                # signify that we're killing it.  Technically it also will do
                # the recursive aborting of all child jobs, but that shouldn't
                # matter here, as only suites have children, and those are
                # hostless and thus don't have provisioning.
                # TODO(milleral) http://crbug.com/188217
                # However, we can't actually do this yet, as if we set the
                # abort bit the FinalReparseTask will set the status of the HQE
                # to ABORTED, which then means that we don't show the status in
                # run_suite.  So in the meantime, don't mark the HQE as
                # aborted.
                # queue_entry.abort()
            else:
                # requeue() must come after handling provision retries, since
                # _actually_fail_queue_entry needs an execution subdir.
                # We also don't want to requeue if we hit the provision retry
                # limit, since then we overwrite the PARSING state of the HQE.
                self.queue_entry.requeue()

            # Limit the repair on a host when a prejob task fails, e.g., reset,
            # verify etc. The number of repair jobs is limited to the specific
            # HQE and host.
            previous_repairs = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.REPAIR,
                    queue_entry_id=self.queue_entry.id,
                    host_id=self.queue_entry.host_id).count()
            if previous_repairs >= scheduler_config.config.max_repair_limit:
                # Repair budget exhausted: give up on the host and the HQE
                # instead of queueing yet another repair.
                self.host.set_status(models.Host.Status.REPAIR_FAILED)
                self._fail_queue_entry()
                return

            # Look up the afe model row for the HQE so it can be attached to
            # the repair task created below.
            queue_entry = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)
        else:
            queue_entry = None

        # Queue a repair against the host that just failed its prejob task.
        models.SpecialTask.objects.create(
                host=models.Host.objects.get(id=self.host.id),
                task=models.SpecialTask.Task.REPAIR,
                queue_entry=queue_entry,
                requested_by=self.task.requested_by)


    def _should_pending(self):
        """
        Decide if we should call the host queue entry's on_pending method.
        We should if:
        1) There exists an associated host queue entry.
        2) The current special task completed successfully.
        3) There do not exist any more special tasks to be run before the
           host queue entry starts.

        @returns: True if we should call pending, false if not.

        """
        if not self.queue_entry or not self.success:
            return False

        # We know if this is the last one when we create it, so we could add
        # another column to the database to keep track of this information, but
        # I expect the overhead of querying here to be minimal.
        queue_entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        # Special tasks for this host and HQE that are neither running nor
        # finished, i.e. still waiting to be scheduled.
        queued = models.SpecialTask.objects.filter(
                host__id=self.host.id, is_active=False,
                is_complete=False, queue_entry=queue_entry)
        # The task being finished right now must not count against itself.
        queued = queued.exclude(id=self.task.id)
        return queued.count() == 0
164
165
class VerifyTask(PreJobTask):
    """Special task of type VERIFY, launched with the '-v' argument."""

    TASK_TYPE = models.SpecialTask.Task.VERIFY


    def __init__(self, task):
        """Build the command arguments and register host/HQE ids.

        @param task: The special task being run; label arguments are only
                     appended when it has a queue entry.
        """
        autoserv_args = ['-v']
        if task.queue_entry:
            autoserv_args += self._generate_autoserv_label_args(task)
        super(VerifyTask, self).__init__(task, autoserv_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Move the HQE (if any) and the host into the VERIFYING status."""
        super(VerifyTask, self).prolog()
        logging.info("starting verify on %s", self.host.hostname)

        hqe = self.queue_entry
        if hqe:
            hqe.set_status(models.HostQueueEntry.Status.VERIFYING)
        self.host.set_status(models.Host.Status.VERIFYING)

        # A single verify is enough: drop the other queued manual reverifies
        # for this host, there is no value in keeping their records.
        verify_type = models.SpecialTask.Task.VERIFY
        self.remove_special_tasks(verify_type, keep_last_one=True)


    def epilog(self):
        """On success, either hand off to the HQE or mark the host READY."""
        super(VerifyTask, self).epilog()
        if not self.success:
            # Failure handling already happened in PreJobTask.epilog.
            return

        if self._should_pending():
            self.queue_entry.on_pending()
            return
        self.host.set_status(models.Host.Status.READY)
199
200
class CleanupTask(PreJobTask):
    """Special task of type CLEANUP, launched with '--cleanup'.

    Cleanup may also run after a job; in that case it runs standalone against
    the host (unrelated to the job), which is why it is not a PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.CLEANUP


    def __init__(self, task, recover_run_monitor=None):
        """Build the command arguments and register host/HQE ids.

        @param task: The special task being run; label arguments are only
                     appended when it has a queue entry.
        @param recover_run_monitor: Accepted but not used by this constructor.
        """
        autoserv_args = ['--cleanup']
        if task.queue_entry:
            autoserv_args += self._generate_autoserv_label_args(task)
        super(CleanupTask, self).__init__(task, autoserv_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Move the host and the HQE (if any) into the CLEANING status."""
        super(CleanupTask, self).prolog()
        logging.info("starting cleanup task for host: %s", self.host.hostname)
        self.host.set_status(models.Host.Status.CLEANING)
        hqe = self.queue_entry
        if hqe:
            hqe.set_status(models.HostQueueEntry.Status.CLEANING)


    def _finish_epilog(self):
        """After a successful cleanup with an HQE, queue a verify or go pending.

        A VERIFY special task is created when the job asks for verification
        and the host is not protected with DO_NOT_VERIFY; otherwise the HQE's
        on_pending is called if no other special tasks are waiting.
        """
        if not (self.queue_entry and self.success):
            return

        wants_verify = self.queue_entry.job.run_verify
        no_verify = host_protections.Protection.DO_NOT_VERIFY
        if wants_verify and self.host.protection != no_verify:
            hqe_row = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
            host_row = models.Host.objects.get(id=self.host.id)
            models.SpecialTask.objects.create(
                    host=host_row,
                    queue_entry=hqe_row,
                    task=models.SpecialTask.Task.VERIFY)
        elif self._should_pending():
            self.queue_entry.on_pending()


    def epilog(self):
        """Clear the dirty flag and mark the host READY when cleanup passed."""
        super(CleanupTask, self).epilog()

        if self.success:
            host = self.host
            host.update_field('dirty', 0)
            host.set_status(models.Host.Status.READY)

        self._finish_epilog()
252
253
class ResetTask(PreJobTask):
    """Special task of type RESET: resets a DUT (cleanup plus verify).

    Reset may also run after a job; in that case it runs standalone against
    the host (unrelated to the job), which is why it is not a PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.RESET


    def __init__(self, task, recover_run_monitor=None):
        """Build the command arguments and register host/HQE ids.

        @param task: The special task being run; label arguments are only
                     appended when it has a queue entry.
        @param recover_run_monitor: Accepted but not used by this constructor.
        """
        autoserv_args = ['--reset']
        if task.queue_entry:
            autoserv_args += self._generate_autoserv_label_args(task)
        super(ResetTask, self).__init__(task, autoserv_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Enter the RESETTING status and drop special tasks made redundant."""
        super(ResetTask, self).prolog()
        logging.info('starting reset task for host: %s',
                     self.host.hostname)
        self.host.set_status(models.Host.Status.RESETTING)
        hqe = self.queue_entry
        if hqe:
            hqe.set_status(models.HostQueueEntry.Status.RESETTING)

        task_types = models.SpecialTask.Task
        # Queued cleanups and reverifies for this host are all deleted, and
        # of the queued resets only one is kept.
        redundant = (
                (task_types.CLEANUP, False),
                (task_types.VERIFY, False),
                (task_types.RESET, True),
        )
        for task_type, keep_last in redundant:
            self.remove_special_tasks(task_type, keep_last_one=keep_last)


    def epilog(self):
        """On success, clear the dirty flag and hand off or mark host READY."""
        super(ResetTask, self).epilog()

        if not self.success:
            # Failure handling already happened in PreJobTask.epilog.
            return

        self.host.update_field('dirty', 0)
        if self._should_pending():
            self.queue_entry.on_pending()
            return
        self.host.set_status(models.Host.Status.READY)
302
303
Prathmesh Prabhu2c7471d2016-11-15 20:19:57 +0000304# TODO (ayatane): Refactor using server/cros/provision
def _is_cros_version(label):
    """Tell whether `label` begins with the 'cros-version:' prefix.

    @param label: Label name string.
    @returns: True if the label starts with 'cros-version:', else False.
    """
    prefix = 'cros-version:'
    return label.startswith(prefix)
308
309
310# TODO (ayatane): Refactor using server/cros/provision
def _get_cros_version(label):
    """Strip the leading 'cros-version:' sized prefix from a label.

    The prefix itself is not checked; the first len('cros-version:')
    characters are simply dropped.

    @param label: Label name string, expected to be a cros-version label.
    @returns: The remainder of the label after the prefix.
    """
    prefix_length = len('cros-version:')
    return label[prefix_length:]
314
315
316# TODO (ayatane): Refactor into server/cros/provision
class _CrosImage(object):
    """The name of a CrOS image.

    Parses names of the form <group>/<milestone>-<version>[-<rc>], e.g.
    lumpy-release/R27-3773.0.0, and exposes the parts as the attributes
    `group`, `milestone`, `version` and `rc` (None when absent).
    """

    # The pattern is anchored with \Z rather than $: in Python, $ also
    # matches just before a trailing newline, which would let a name such as
    # 'lumpy-release/R27-3773.0.0\n' through as valid.
    _name_pattern = re.compile(
        r'^'
        r'(?P<group>[a-z0-9-]+)'
        r'/'
        r'(?P<milestone>LATEST|R[0-9]+)'
        r'-'
        r'(?P<version>[0-9.]+)'
        r'(-(?P<rc>rc[0-9]+))?'
        r'\Z'
    )

    def __init__(self, name):
        """Initialize instance.

        @param name: Image name string (lumpy-release/R27-3773.0.0)
        @raises ValueError: if `name` does not match the expected format.
        """
        self._name = name
        match = self._name_pattern.search(name)
        if match is None:
            raise ValueError('Invalid CrOS image name: %r' % name)
        self.group = match.group('group')
        self.milestone = match.group('milestone')
        self.version = match.group('version')
        self.rc = match.group('rc')

    def __repr__(self):
        return '{cls}({name!r})'.format(cls=type(self).__name__,
                                        name=self._name)

    def __str__(self):
        return self._name
351
352
class ProvisionTask(PreJobTask):
    """Special task of type PROVISION; always tied to a host queue entry.

    The task is launched with '--provision' plus the provisionable subset of
    the job's labels, and remembers the build milestone parsed from the job's
    cros-version label in `self._milestone` ('N/A' when unavailable).
    """

    TASK_TYPE = models.SpecialTask.Task.PROVISION

    def __init__(self, task):
        """Build the provision command arguments from the job's labels.

        @param task: The special task being run. It must have a queue entry.
        """
        # Provisioning requires that we be associated with a job/queue entry
        assert task.queue_entry, "No HQE associated with provision task!"
        # task.queue_entry is an afe model HostQueueEntry object.
        # self.queue_entry is a scheduler models HostQueueEntry object, but
        # it gets constructed and assigned in __init__, so it's not available
        # yet.  Therefore, we're stuck pulling labels off of the afe model
        # so that we can pass the --provision args into the __init__ call.
        labels = {x.name for x in task.queue_entry.job.labels}
        _, provisionable = provision.Provision.partition(labels)
        extra_command_args = ['--provision',
                              '--job-labels', ','.join(provisionable)]
        super(ProvisionTask, self).__init__(task, extra_command_args)
        self._set_milestone(labels)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def _set_milestone(self, labels):
        """Set build milestone from the labels.

        The milestone of the first parseable cros-version label wins. When
        no cros-version label is present, or none of them can be parsed, the
        milestone is 'N/A'.

        @param labels: iterable of labels.
        """
        # Assign the fallback up front so the attribute always exists, even
        # when the job carries no cros-version label at all.
        self._milestone = 'N/A'
        for label in labels:
            if not _is_cros_version(label):
                continue
            try:
                cros_image = _CrosImage(_get_cros_version(label))
            except ValueError as e:
                logging.warning('Could not parse cros-version. Error msg: %s', e)
            else:
                self._milestone = cros_image.milestone
                break


    def _command_line(self):
        """Return the autoserv command line used to run this provision."""
        # If we give queue_entry to autoserv_run_job_command, then it will
        # append -c for this invocation if the queue_entry is a client side
        # test. We don't want that, as it messes with provisioning, so we just
        # drop it from the arguments here.
        # Note that we also don't verify job_repo_url as provisioning tasks are
        # required to stage whatever content we need, and the job itself will
        # force autotest to be staged if it isn't already.
        return autoserv_utils.autoserv_run_job_command(
                autoserv_utils.autoserv_directory,
                self.host.hostname,
                results_directory=drone_manager.WORKING_DIRECTORY,
                extra_args=self._extra_command_args,
                in_lab=True,
        )

    def prolog(self):
        """Move the HQE and the host into the PROVISIONING status."""
        super(ProvisionTask, self).prolog()
        # add check for previous provision task and abort if exist.
        logging.info("starting provision task for host: %s", self.host.hostname)
        self.queue_entry.set_status(
                models.HostQueueEntry.Status.PROVISIONING)
        self.host.set_status(models.Host.Status.PROVISIONING)


    def epilog(self):
        """Call the HQE's on_pending when provisioning was the last step."""
        super(ProvisionTask, self).epilog()

        # If we were not successful in provisioning the machine
        # leave the DUT in whatever status was set in the PreJobTask's
        # epilog. If this task was successful the host status will get
        # set appropriately as a fallout of the hqe's on_pending. If
        # we don't call on_pending, it can only be because:
        #   1. This task was not successful:
        #       a. Another repair is queued: this repair job will set the host
        #       status, and it will remain in 'Provisioning' till then.
        #       b. We have hit the max_repair_limit: in which case the host
        #       status is set to 'RepairFailed' in the epilog of PreJobTask.
        #   2. The task was successful, but there are other special tasks:
        #      Those special tasks will set the host status appropriately.
        if self._should_pending():
            self.queue_entry.on_pending()
beeps5e2bb4a2013-10-28 11:26:45 -0700434
435
class RepairTask(agent_task.SpecialAgentTask):
    """Special task of type REPAIR, launched with '-R'."""

    TASK_TYPE = models.SpecialTask.Task.REPAIR


    def __init__(self, task):
        """Build the repair command arguments and register the host id.

        @param task: The special task being run. If it has a queue entry,
                     that entry is failed when the repair fails (see epilog).
        """
        protection_enum = host_protections.Protection
        # get_attr_name normalizes the protection name
        protection_name = protection_enum.get_attr_name(
                protection_enum.get_string(task.host.protection))

        repair_args = ['-R', '--host-protection', protection_name]
        if task.queue_entry:
            repair_args += self._generate_autoserv_label_args(task)

        super(RepairTask, self).__init__(task, repair_args)

        # The queue entry is deliberately left out of the ids: the repair
        # should keep running even if the queue entry gets aborted.
        self._set_ids(host=self.host)


    def prolog(self):
        """Move the host into the REPAIRING status."""
        super(RepairTask, self).prolog()
        logging.info("repair_task starting")
        self.host.set_status(models.Host.Status.REPAIRING)


    def epilog(self):
        """Set the final host status and record the host's working state."""
        super(RepairTask, self).epilog()

        if not self.success:
            self.host.set_status(models.Host.Status.REPAIR_FAILED)
            if self.queue_entry:
                self._fail_queue_entry()
        else:
            self.host.set_status(models.Host.Status.READY)

        finished_at = self.task.time_finished
        self.host.record_working_state(bool(self.success), finished_at)