blob: 4524fd70aae71459d1bc2f0e83d997fda230510c [file] [log] [blame]
beeps5e2bb4a2013-10-28 11:26:45 -07001#pylint: disable-msg=C0111
2
3"""
4Prejob tasks.
5
6Prejob tasks _usually_ run before a job and verify the state of a machine.
7Cleanup and repair are exceptions, cleanup can run after a job too, while
8repair will run anytime the host needs a repair, which could be pre or post
9job. Most of the work specific to this module is achieved through the prolog
10and epilog of each task.
11
12All prejob tasks must have a host, though they may not have an HQE. If a
13prejob task has a hqe, it will activate the hqe through its on_pending
beepsec1c4b22013-11-18 08:26:39 -080014method on successful completion. A row in afe_special_tasks with values:
beeps5e2bb4a2013-10-28 11:26:45 -070015 host=C1, unlocked, is_active=0, is_complete=0, type=Verify
16will indicate to the scheduler that it needs to schedule a new special task
17of type=Verify, against the C1 host. While the special task is running
18the scheduler only monitors it through the Agent, and its is_active bit=1.
19Once a special task finishes, we set its is_active=0, is_complete=1 and
20success bits, so the scheduler ignores it.
21HQE.on_pending:
22 Host, HQE -> Pending, Starting
23 This status is acted upon in the scheduler, to assign an AgentTask.
24PreJobTask:
25 epilog:
26 failure:
27 requeue hqe
28 repair the host
29Children PreJobTasks:
30 prolog:
31 set Host, HQE status
32 epilog:
33 success:
34 on_pending
35 failure:
35 repair through PreJobTask
37 set Host, HQE status
beepsec1c4b22013-11-18 08:26:39 -080038
39Failing a prejob task affects both the Host and the HQE, as follows:
40
41- Host: PreJob failure will result in a Repair job getting queued against
42the host, if we haven't already tried repairing it more than the
43max_repair_limit. When this happens, the host will remain in whatever status
44the prejob task left it in, till the Repair job puts it into 'Repairing'. This
45way the host_scheduler won't pick bad hosts and assign them to jobs.
46
47If we have already tried repairing the host too many times, the PreJobTask
48will flip the host to 'RepairFailed' in its epilog, and it will remain in this
49state till it is recovered and reverified.
50
51- HQE: Is either requeued or failed. Requeuing the HQE involves putting it
52in the Queued state and setting its host_id to None, so it gets a new host
53in the next scheduler tick. Failing the HQE results in either a Parsing
54or Archiving postjob task, and an eventual Failed status for the HQE.
beeps5e2bb4a2013-10-28 11:26:45 -070055"""
beepsec1c4b22013-11-18 08:26:39 -080056
beeps5e2bb4a2013-10-28 11:26:45 -070057import logging
58import os
59
60from autotest_lib.client.common_lib import host_protections
61from autotest_lib.frontend.afe import models
62from autotest_lib.scheduler import agent_task, scheduler_config
63from autotest_lib.server import autoserv_utils
64from autotest_lib.server.cros import provision
65
66
class PreJobTask(agent_task.SpecialAgentTask):
    """Shared failure handling for special tasks that run before a job.

    The epilog defined here deals with an unsuccessful task: it requeues or
    fails the associated host queue entry (HQE), if any, and queues a Repair
    special task against the host.
    """


    def _copy_to_results_repository(self):
        """Copy this task's autoserv debug log next to the HQE's results.

        Does nothing when there is no queue entry, or when the queue entry
        has a meta_host set.
        """
        if not self.queue_entry or self.queue_entry.meta_host:
            return

        self.queue_entry.set_execution_subdir()
        log_name = os.path.basename(self.task.execution_path())
        source = os.path.join(self.task.execution_path(), 'debug',
                              'autoserv.DEBUG')
        destination = os.path.join(
                self.queue_entry.execution_path(), log_name)

        self.monitor.try_copy_to_results_repository(
                source, destination_path=destination)


    def epilog(self):
        """Run the parent epilog, then handle a failed prejob task.

        On success nothing else happens. A failure on a host whose
        protection is DO_NOT_VERIFY is converted into a success. Otherwise:

        1) With an HQE: its incomplete special tasks on this host are marked
           complete and unsuccessful; the HQE is failed if it has exceeded
           max_provision_retries provision tasks, else requeued; and if the
           HQE already has max_repair_limit repair tasks on this host, the
           host is set to REPAIR_FAILED, the HQE is failed, and no further
           repair is queued.
        2) In all remaining cases a Repair special task is created for the
           host (linked to the HQE when there is one).
        """
        super(PreJobTask, self).epilog()

        if self.success:
            return

        if self.host.protection == host_protections.Protection.DO_NOT_VERIFY:
            # effectively ignore failure for these hosts
            self.success = True
            return

        if self.queue_entry:
            # If we requeue a HQE, we should cancel any remaining pre-job
            # tasks against this host, otherwise we'll be left in a state
            # where a queued HQE has special tasks to run against a host.
            models.SpecialTask.objects.filter(
                    queue_entry__id=self.queue_entry.id,
                    host__id=self.host.id,
                    is_complete=0).update(is_complete=1, success=0)

            previous_provisions = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.PROVISION,
                    queue_entry_id=self.queue_entry.id).count()
            if (previous_provisions >
                scheduler_config.config.max_provision_retries):
                self._actually_fail_queue_entry()
                # This abort will mark the aborted bit on the HQE itself, to
                # signify that we're killing it.  Technically it also will do
                # the recursive aborting of all child jobs, but that shouldn't
                # matter here, as only suites have children, and those are
                # hostless and thus don't have provisioning.
                # TODO(milleral) http://crbug.com/188217
                # However, we can't actually do this yet, as if we set the
                # abort bit the FinalReparseTask will set the status of the HQE
                # to ABORTED, which then means that we don't show the status in
                # run_suite.  So in the meantime, don't mark the HQE as
                # aborted.
                # queue_entry.abort()
            else:
                # requeue() must come after handling provision retries, since
                # _actually_fail_queue_entry needs an execution subdir.
                # We also don't want to requeue if we hit the provision retry
                # limit, since then we overwrite the PARSING state of the HQE.
                self.queue_entry.requeue()

            # Limit the repair on a host when a prejob task fails, e.g., reset,
            # verify etc. The number of repair jobs is limited to the specific
            # HQE and host.
            # The host is taken from self.host rather than from
            # self.queue_entry.host_id: requeuing an HQE sets its host_id to
            # None (see the module docstring), and filtering on a None
            # host_id would never count the repairs queued below, which are
            # created against self.host.
            previous_repairs = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.REPAIR,
                    queue_entry_id=self.queue_entry.id,
                    host_id=self.host.id).count()
            if previous_repairs >= scheduler_config.config.max_repair_limit:
                self.host.set_status(models.Host.Status.REPAIR_FAILED)
                self._fail_queue_entry()
                return

            queue_entry = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)
        else:
            queue_entry = None

        models.SpecialTask.objects.create(
                host=models.Host.objects.get(id=self.host.id),
                task=models.SpecialTask.Task.REPAIR,
                queue_entry=queue_entry,
                requested_by=self.task.requested_by)


    def _should_pending(self):
        """
        Decide if we should call the host queue entry's on_pending method.
        We should if:
        1) There exists an associated host queue entry.
        2) The current special task completed successfully.
        3) There do not exist any more special tasks to be run before the
           host queue entry starts.

        @returns: True if we should call pending, false if not.

        """
        if not self.queue_entry or not self.success:
            return False

        # We know if this is the last one when we create it, so we could add
        # another column to the database to keep track of this information, but
        # I expect the overhead of querying here to be minimal.
        queue_entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        queued = models.SpecialTask.objects.filter(
                host__id=self.host.id, is_active=False,
                is_complete=False, queue_entry=queue_entry)
        # This task itself may still match the filter; it does not count.
        queued = queued.exclude(id=self.task.id)
        return queued.count() == 0
176
177
class VerifyTask(PreJobTask):
    """Prejob task of type VERIFY."""

    TASK_TYPE = models.SpecialTask.Task.VERIFY


    def __init__(self, task):
        """Build the extra command arguments and register the task ids.

        @param task: The special task to run. When it has a queue entry,
                the label arguments generated for the task are appended
                to the '-v' flag.
        """
        verify_args = ['-v']
        if task.queue_entry:
            verify_args += self._generate_autoserv_label_args(task)
        super(VerifyTask, self).__init__(task, verify_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Mark the HQE (if any) and the host as VERIFYING."""
        super(VerifyTask, self).prolog()

        logging.info("starting verify on %s", self.host.hostname)
        verifying_hqe = self.queue_entry
        if verifying_hqe:
            verifying_hqe.set_status(models.HostQueueEntry.Status.VERIFYING)
        self.host.set_status(models.Host.Status.VERIFYING)

        # A single verify is enough: drop the other queued verify requests
        # for this host, there is no point in keeping a record of them.
        self.remove_special_tasks(
                models.SpecialTask.Task.VERIFY, keep_last_one=True)


    def epilog(self):
        """On success, move the HQE to pending or set the host READY."""
        super(VerifyTask, self).epilog()
        if not self.success:
            # Nothing more is done here for a failed verify.
            return

        if self._should_pending():
            self.queue_entry.on_pending()
            return

        self.host.set_status(models.Host.Status.READY)
211
212
class CleanupTask(PreJobTask):
    """Prejob task of type CLEANUP.

    A cleanup may also run after a job; in that case it runs standalone
    against the host and is unrelated to the job, which is why it is not
    modelled as a PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.CLEANUP


    def __init__(self, task, recover_run_monitor=None):
        """Build the extra command arguments and register the task ids.

        @param task: The special task to run. When it has a queue entry,
                the label arguments generated for the task are appended
                to the '--cleanup' flag.
        @param recover_run_monitor: Accepted but not used by this method.
        """
        cleanup_args = ['--cleanup']
        if task.queue_entry:
            cleanup_args += self._generate_autoserv_label_args(task)
        super(CleanupTask, self).__init__(task, cleanup_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Mark the host and the HQE (if any) as CLEANING."""
        super(CleanupTask, self).prolog()
        logging.info("starting cleanup task for host: %s", self.host.hostname)
        self.host.set_status(models.Host.Status.CLEANING)
        cleaning_hqe = self.queue_entry
        if cleaning_hqe:
            cleaning_hqe.set_status(models.HostQueueEntry.Status.CLEANING)


    def _finish_epilog(self):
        """Queue a verify for the HQE, or move the HQE to pending.

        Only acts when there is a queue entry and the cleanup succeeded.
        A VERIFY special task is created when the job asks for one and the
        host protection is not DO_NOT_VERIFY; otherwise on_pending is called
        if no other special task is waiting.
        """
        if not (self.queue_entry and self.success):
            return

        skip_verify_protection = host_protections.Protection.DO_NOT_VERIFY
        verify_wanted = (
                self.queue_entry.job.run_verify
                and self.host.protection != skip_verify_protection)
        if verify_wanted:
            hqe_model = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)
            models.SpecialTask.objects.create(
                    host=models.Host.objects.get(id=self.host.id),
                    queue_entry=hqe_model,
                    task=models.SpecialTask.Task.VERIFY)
        elif self._should_pending():
            self.queue_entry.on_pending()


    def epilog(self):
        """On success, clear the dirty flag and set the host READY."""
        super(CleanupTask, self).epilog()

        if self.success:
            self.host.update_field('dirty', 0)
            self.host.set_status(models.Host.Status.READY)

        # Runs regardless of the outcome; it checks self.success itself.
        self._finish_epilog()
264
265
class ResetTask(PreJobTask):
    """Task to reset a DUT, including cleanup and verify."""
    # A reset may also run after a job; in that case it runs standalone
    # against the host and is unrelated to the job, which is why it is not
    # modelled as a PostJobTask.

    TASK_TYPE = models.SpecialTask.Task.RESET


    def __init__(self, task, recover_run_monitor=None):
        """Build the extra command arguments and register the task ids.

        @param task: The special task to run. When it has a queue entry,
                the label arguments generated for the task are appended
                to the '--reset' flag.
        @param recover_run_monitor: Accepted but not used by this method.
        """
        reset_args = ['--reset']
        if task.queue_entry:
            reset_args += self._generate_autoserv_label_args(task)
        super(ResetTask, self).__init__(task, reset_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Mark host and HQE as RESETTING and drop redundant queued tasks."""
        super(ResetTask, self).prolog()
        logging.info('starting reset task for host: %s',
                     self.host.hostname)
        self.host.set_status(models.Host.Status.RESETTING)
        resetting_hqe = self.queue_entry
        if resetting_hqe:
            resetting_hqe.set_status(models.HostQueueEntry.Status.RESETTING)

        # (task type, keep_last_one) pairs, processed in this order:
        # queued cleanups and reverifies for this host are all deleted, and
        # of the queued resets only one is needed.
        redundant_tasks = (
                (models.SpecialTask.Task.CLEANUP, False),
                (models.SpecialTask.Task.VERIFY, False),
                (models.SpecialTask.Task.RESET, True),
        )
        for task_type, keep_last in redundant_tasks:
            self.remove_special_tasks(task_type, keep_last_one=keep_last)


    def epilog(self):
        """On success, clear the dirty flag, then pend the HQE or go READY."""
        super(ResetTask, self).epilog()

        if not self.success:
            # Nothing more is done here for a failed reset.
            return

        self.host.update_field('dirty', 0)

        if self._should_pending():
            self.queue_entry.on_pending()
            return

        self.host.set_status(models.Host.Status.READY)
314
315
class ProvisionTask(PreJobTask):
    """Prejob task of type PROVISION; always tied to a host queue entry."""

    TASK_TYPE = models.SpecialTask.Task.PROVISION

    def __init__(self, task):
        """Build the '--provision' arguments from the job's labels.

        @param task: The special task to run; it must have a queue entry.
        """
        # A provision task only makes sense for a job/queue entry.
        assert task.queue_entry, "No HQE associated with provision task!"
        # Two different HostQueueEntry objects are involved here:
        # task.queue_entry is the afe model, while self.queue_entry is the
        # scheduler model, which only gets constructed and assigned inside
        # __init__ and so does not exist yet. The labels therefore have to
        # be read from the afe model to build the --provision arguments
        # that are handed to the parent __init__.
        job_label_names = set(
                label.name for label in task.queue_entry.job.labels)
        _, provisionable_labels = provision.filter_labels(job_label_names)
        provision_args = ['--provision',
                          '--job-labels', ','.join(provisionable_labels)]
        super(ProvisionTask, self).__init__(task, provision_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def _command_line(self):
        """Return the autoserv command line for this provision task."""
        # The queue_entry is deliberately not passed along: given one,
        # _autoserv_command_line would append -c for a client side test,
        # which messes with provisioning.
        # job_repo_url is not verified either, since provisioning tasks are
        # required to stage whatever content we need, and the job itself
        # will force autotest to be staged if it isn't already.
        hostname = self.host.hostname
        return autoserv_utils._autoserv_command_line(
                hostname, self._extra_command_args)


    def prolog(self):
        """Mark the HQE and the host as PROVISIONING."""
        super(ProvisionTask, self).prolog()
        # add check for previous provision task and abort if exist.
        logging.info("starting provision task for host: %s", self.host.hostname)
        provisioning_hqe = self.queue_entry
        provisioning_hqe.set_status(models.HostQueueEntry.Status.PROVISIONING)
        self.host.set_status(models.Host.Status.PROVISIONING)


    def epilog(self):
        """Move the HQE to pending when no other special task is waiting."""
        super(ProvisionTask, self).epilog()

        # The host status is never set directly here.
        # - On success, it gets set as a fallout of the hqe's on_pending.
        # - If on_pending is not called, that can only be because:
        #   1. Provisioning failed, and the DUT keeps whatever status the
        #      PreJobTask epilog left it in:
        #      a. Another repair is queued: that repair job will set the
        #         host status, which stays 'Provisioning' till then.
        #      b. The max_repair_limit was hit: the PreJobTask epilog has
        #         set the host status to 'RepairFailed'.
        #   2. Provisioning succeeded but other special tasks remain: those
        #      special tasks will set the host status appropriately.
        ready_for_pending = self._should_pending()
        if ready_for_pending:
            self.queue_entry.on_pending()
beeps5e2bb4a2013-10-28 11:26:45 -0700373
374
class RepairTask(agent_task.SpecialAgentTask):
    """Special task of type REPAIR."""

    TASK_TYPE = models.SpecialTask.Task.REPAIR


    def __init__(self, task):
        """Build the '-R' arguments, including the host protection level.

        @param task: The special task to run. Its queue entry, if any, is
                failed by the epilog when the repair does not succeed.
        """
        # Convert the protection value to its string form, then normalize
        # that string into the attribute name.
        protection_name = host_protections.Protection.get_attr_name(
                host_protections.Protection.get_string(task.host.protection))

        repair_args = ['-R', '--host-protection', protection_name]
        if task.queue_entry:
            repair_args += self._generate_autoserv_label_args(task)

        super(RepairTask, self).__init__(task, repair_args)

        # The queue entry is intentionally left out of the ids: the repair
        # should keep running even if the queue entry gets aborted.
        self._set_ids(host=self.host)


    def prolog(self):
        """Mark the host as REPAIRING."""
        super(RepairTask, self).prolog()
        logging.info("repair_task starting")
        self.host.set_status(models.Host.Status.REPAIRING)


    def epilog(self):
        """Set the final host status; fail the HQE if the repair failed."""
        super(RepairTask, self).epilog()

        if self.success:
            self.host.set_status(models.Host.Status.READY)
            return

        self.host.set_status(models.Host.Status.REPAIR_FAILED)
        if self.queue_entry:
            self._fail_queue_entry()