blob: a394d0ca92595edcae616d481182e09a0fc726a7 [file] [log] [blame]
beeps5e2bb4a2013-10-28 11:26:45 -07001#pylint: disable-msg=C0111
2
3"""
Prejob tasks.

Prejob tasks _usually_ run before a job and verify the state of a machine.
Cleanup and repair are exceptions: cleanup can also run after a job, while
repair runs any time the host needs a repair, which could be pre or post
job. Most of the work specific to this module is achieved through the prolog
and epilog of each task.
11
All prejob tasks must have a host, though they may not have an HQE. If a
prejob task has an HQE, it will activate the HQE through its on_pending
method on successful completion. A row in afe_special_tasks with values:
    host=C1, unlocked, is_active=0, is_complete=0, type=Verify
will indicate to the scheduler that it needs to schedule a new special task
of type=Verify, against the C1 host. While the special task is running,
the scheduler only monitors it through the Agent, and its is_active bit=1.
Once a special task finishes, we set its is_active=0, is_complete=1 and
success bits, so the scheduler ignores it.
HQE.on_pending:
    Host, HQE -> Pending, Starting
    This status is acted upon in the scheduler, to assign an AgentTask.
PreJobTask:
    epilog:
        failure:
            requeue hqe
            repair the host
Children PreJobTasks:
    prolog:
        set Host, HQE status
    epilog:
        success:
            on_pending
        failure:
            repair through PreJobTask
            set Host, HQE status
beepsec1c4b22013-11-18 08:26:39 -080038
Failing a prejob task affects both the Host and the HQE, as follows:
40
- Host: PreJob failure will result in a Repair job getting queued against
the host, if we haven't already tried repairing it more than the
max_repair_limit. When this happens, the host will remain in whatever status
the prejob task left it in, until the Repair job puts it into 'Repairing'.
This way the host_scheduler won't pick bad hosts and assign them to jobs.
46
If we have already tried repairing the host too many times, the PreJobTask
will flip the host to 'RepairFailed' in its epilog, and it will remain in this
state until it is recovered and reverified.
50
- HQE: Is either requeued or failed. Requeuing the HQE involves putting it
in the Queued state and setting its host_id to None, so it gets a new host
in the next scheduler tick. Failing the HQE results in either a Parsing
or Archiving postjob task, and an eventual Failed status for the HQE.
beeps5e2bb4a2013-10-28 11:26:45 -070055"""
beepsec1c4b22013-11-18 08:26:39 -080056
beeps5e2bb4a2013-10-28 11:26:45 -070057import logging
beeps5e2bb4a2013-10-28 11:26:45 -070058
59from autotest_lib.client.common_lib import host_protections
60from autotest_lib.frontend.afe import models
61from autotest_lib.scheduler import agent_task, scheduler_config
62from autotest_lib.server import autoserv_utils
63from autotest_lib.server.cros import provision
64
65
class PreJobTask(agent_task.SpecialAgentTask):
    """Common base for special tasks that normally run ahead of a job.

    Supplies the shared failure handling (epilog) and the check subclasses
    use to decide whether the host queue entry (HQE) may go pending.
    """


    def epilog(self):
        """React to the outcome of a prejob task.

        Nothing further happens when the task succeeded.  A failure on a host
        protected with DO_NOT_VERIFY is turned into a success.  For any other
        failure:
          - if there is an HQE, its incomplete special tasks against this
            host are marked complete and unsuccessful, the HQE is failed or
            requeued depending on the provision retry limit, and the repair
            limit is enforced;
          - unless the repair limit was reached, a REPAIR special task is
            created against the host.
        """
        super(PreJobTask, self).epilog()

        if self.success:
            return

        if self.host.protection == host_protections.Protection.DO_NOT_VERIFY:
            # Failures on these hosts are effectively ignored.
            self.success = True
            return

        special_tasks = models.SpecialTask.objects
        task_types = models.SpecialTask.Task

        # Stays None for tasks without an HQE; the repair is then queued
        # against the host alone.
        repair_hqe = None

        if self.queue_entry:
            limits = scheduler_config.config

            # The HQE is about to be requeued (or failed), so cancel every
            # prejob task still outstanding for it on this host.  Otherwise
            # a queued HQE would be left with special tasks to run against
            # a host.
            outstanding = special_tasks.filter(
                    queue_entry__id=self.queue_entry.id,
                    host__id=self.host.id,
                    is_complete=0)
            outstanding.update(is_complete=1, success=0)

            provision_attempts = special_tasks.filter(
                    task=task_types.PROVISION,
                    queue_entry_id=self.queue_entry.id).count()
            retries_left = not (provision_attempts >
                                limits.max_provision_retries)
            if retries_left:
                # requeue() has to follow the provision-retry check because
                # _actually_fail_queue_entry needs an execution subdir.
                # Requeuing after hitting the retry limit would also
                # overwrite the PARSING state of the HQE.
                self.queue_entry.requeue()
            else:
                self._actually_fail_queue_entry()
                # TODO(milleral) http://crbug.com/188217
                # Ideally the HQE would also be aborted here, which marks
                # the aborted bit on the HQE to signify we're killing it.
                # (It would recursively abort child jobs too, but only
                # suites have children, and those are hostless and thus
                # never provision.)  That can't be done yet: with the abort
                # bit set the FinalReparseTask sets the HQE status to
                # ABORTED, and then the status isn't shown in run_suite.
                # Until then the HQE is not marked as aborted, i.e. there
                # is deliberately no call to queue_entry.abort().

            # Cap the repairs triggered by failing prejob tasks (reset,
            # verify, etc.).  The count is specific to this HQE and host.
            repair_attempts = special_tasks.filter(
                    task=task_types.REPAIR,
                    queue_entry_id=self.queue_entry.id,
                    host_id=self.queue_entry.host_id).count()
            if repair_attempts >= limits.max_repair_limit:
                self.host.set_status(models.Host.Status.REPAIR_FAILED)
                self._fail_queue_entry()
                return

            repair_hqe = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)

        special_tasks.create(
                host=models.Host.objects.get(id=self.host.id),
                task=task_types.REPAIR,
                queue_entry=repair_hqe,
                requested_by=self.task.requested_by)


    def _should_pending(self):
        """
        Tell whether the host queue entry's on_pending should be called.

        That is the case when all of the following hold:
            1) This task has an associated host queue entry.
            2) This task completed successfully.
            3) No other special task is still waiting to run for the host
               queue entry on this host.

        @returns: True if on_pending should be called, False otherwise.

        """
        if not (self.queue_entry and self.success):
            return False

        # Whether this is the last task is known when it is created, so an
        # extra database column could record it; the query below is expected
        # to be cheap enough instead.
        hqe = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        still_waiting = models.SpecialTask.objects.filter(
                host__id=self.host.id, is_active=False,
                is_complete=False, queue_entry=hqe).exclude(id=self.task.id)
        return still_waiting.count() == 0
160
161
class VerifyTask(PreJobTask):
    """Prejob task of type VERIFY."""

    TASK_TYPE = models.SpecialTask.Task.VERIFY


    def __init__(self, task):
        """Assemble the extra command args and register the task's ids.

        @param task: Special task to run.  Label args are appended to '-v'
                only when the task has a queue entry.
        """
        extra_args = ['-v']
        if task.queue_entry:
            extra_args += self._generate_autoserv_label_args(task)
        super(VerifyTask, self).__init__(task, extra_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Mark the HQE (if any) and the host as Verifying."""
        super(VerifyTask, self).prolog()

        logging.info("starting verify on %s", self.host.hostname)
        hqe = self.queue_entry
        if hqe:
            hqe.set_status(models.HostQueueEntry.Status.VERIFYING)
        self.host.set_status(models.Host.Status.VERIFYING)

        # A single verify is enough: drop the other queued manual reverifies
        # for this host, there is no need to keep records of them.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=True)


    def epilog(self):
        """On success, move the HQE to pending or mark the host Ready."""
        super(VerifyTask, self).epilog()
        if not self.success:
            return

        if self._should_pending():
            self.queue_entry.on_pending()
        else:
            self.host.set_status(models.Host.Status.READY)
195
196
class CleanupTask(PreJobTask):
    """Prejob task of type CLEANUP.

    A cleanup can run post-job as well, but in that case it runs standalone
    against the host (unrelated to the job), which is why it is not treated
    as a PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.CLEANUP


    def __init__(self, task, recover_run_monitor=None):
        """Assemble the extra command args and register the task's ids.

        @param task: Special task to run.  Label args are appended to
                '--cleanup' only when the task has a queue entry.
        @param recover_run_monitor: Accepted but not used by this
                constructor.
        """
        extra_args = ['--cleanup']
        if task.queue_entry:
            extra_args += self._generate_autoserv_label_args(task)
        super(CleanupTask, self).__init__(task, extra_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Mark the host and the HQE (if any) as Cleaning."""
        super(CleanupTask, self).prolog()
        logging.info("starting cleanup task for host: %s", self.host.hostname)
        self.host.set_status(models.Host.Status.CLEANING)
        hqe = self.queue_entry
        if hqe:
            hqe.set_status(models.HostQueueEntry.Status.CLEANING)


    def _finish_epilog(self):
        """Queue a follow-up verify or move the HQE to pending.

        Does nothing unless the task succeeded and has a queue entry.  A
        VERIFY special task is created when the job asks for verification
        and the host is not protected with DO_NOT_VERIFY; otherwise the HQE
        goes pending if no other special task is waiting.
        """
        if not (self.queue_entry and self.success):
            return

        job_wants_verify = self.queue_entry.job.run_verify
        if (job_wants_verify and self.host.protection !=
                host_protections.Protection.DO_NOT_VERIFY):
            hqe = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
            host = models.Host.objects.get(id=self.host.id)
            models.SpecialTask.objects.create(
                    host=host,
                    queue_entry=hqe,
                    task=models.SpecialTask.Task.VERIFY)
        elif self._should_pending():
            self.queue_entry.on_pending()


    def epilog(self):
        """On success, clear the dirty flag and mark the host Ready."""
        super(CleanupTask, self).epilog()

        if self.success:
            host = self.host
            host.update_field('dirty', 0)
            host.set_status(models.Host.Status.READY)

        # Always called; it returns early by itself on failure.
        self._finish_epilog()
248
249
class ResetTask(PreJobTask):
    """Task to reset a DUT, including cleanup and verify.

    A reset can run post-job as well, but in that case it runs standalone
    against the host (unrelated to the job), which is why it is not treated
    as a PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.RESET


    def __init__(self, task, recover_run_monitor=None):
        """Assemble the extra command args and register the task's ids.

        @param task: Special task to run.  Label args are appended to
                '--reset' only when the task has a queue entry.
        @param recover_run_monitor: Accepted but not used by this
                constructor.
        """
        extra_args = ['--reset']
        if task.queue_entry:
            extra_args += self._generate_autoserv_label_args(task)
        super(ResetTask, self).__init__(task, extra_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Mark host/HQE as Resetting and drop redundant queued tasks."""
        super(ResetTask, self).prolog()
        logging.info('starting reset task for host: %s',
                     self.host.hostname)
        self.host.set_status(models.Host.Status.RESETTING)
        hqe = self.queue_entry
        if hqe:
            hqe.set_status(models.HostQueueEntry.Status.RESETTING)

        task_types = models.SpecialTask.Task

        # Queued cleanups for this host are dropped entirely.
        self.remove_special_tasks(task_types.CLEANUP, keep_last_one=False)

        # So are queued reverifies.
        self.remove_special_tasks(task_types.VERIFY, keep_last_one=False)

        # Of the resets, only one needs to stay.
        self.remove_special_tasks(task_types.RESET, keep_last_one=True)


    def epilog(self):
        """On success, clear the dirty flag, then go pending or Ready."""
        super(ResetTask, self).epilog()

        if not self.success:
            return

        self.host.update_field('dirty', 0)

        if self._should_pending():
            self.queue_entry.on_pending()
        else:
            self.host.set_status(models.Host.Status.READY)
298
299
class ProvisionTask(PreJobTask):
    """Prejob task of type PROVISION; always tied to a host queue entry."""

    TASK_TYPE = models.SpecialTask.Task.PROVISION


    def __init__(self, task):
        """Assemble the '--provision' command args from the job's labels.

        @param task: Special task to run; it must have a queue entry.
        """
        # A provision task only makes sense for a job/queue entry.
        assert task.queue_entry, "No HQE associated with provision task!"
        # task.queue_entry is an afe model HostQueueEntry, whereas
        # self.queue_entry is a scheduler model HostQueueEntry that only
        # exists once the base __init__ has run.  The --provision args are
        # needed for that very call, so the labels have to be read off the
        # afe model here.
        job_labels = task.queue_entry.job.labels
        label_names = {label.name for label in job_labels}
        _, provisionable = provision.Provision.partition(label_names)
        joined_labels = ','.join(provisionable)
        extra_command_args = ['--provision', '--job-labels', joined_labels]
        super(ProvisionTask, self).__init__(task, extra_command_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def _command_line(self):
        """Build the command line without passing the queue entry.

        @returns: Whatever autoserv_utils._autoserv_command_line returns for
                this host's hostname and the extra command args.
        """
        # Handing the queue_entry to _autoserv_command_line would make it
        # append -c when the queue_entry is a client side test.  That messes
        # with provisioning, so the queue entry is left out here.
        # job_repo_url is not verified either: provisioning tasks are
        # required to stage whatever content is needed, and the job itself
        # forces autotest to be staged if it isn't already.
        hostname = self.host.hostname
        return autoserv_utils._autoserv_command_line(
                hostname, self._extra_command_args, in_lab=True)


    def prolog(self):
        """Mark the HQE and the host as Provisioning."""
        super(ProvisionTask, self).prolog()
        # add check for previous provision task and abort if exist.
        logging.info("starting provision task for host: %s", self.host.hostname)
        provisioning_hqe_status = models.HostQueueEntry.Status.PROVISIONING
        self.queue_entry.set_status(provisioning_hqe_status)
        self.host.set_status(models.Host.Status.PROVISIONING)


    def epilog(self):
        """Move the HQE to pending if nothing else is left to run."""
        super(ProvisionTask, self).epilog()

        # The host status is intentionally not touched here.  After a
        # successful provision it is set as a fallout of the hqe's
        # on_pending.  When on_pending is not called, it is because either:
        # 1. The task failed, and the DUT keeps the status set by the
        #    PreJobTask's epilog:
        #    a. Another repair is queued: that repair job will set the host
        #       status, which stays 'Provisioning' until then.
        #    b. The max_repair_limit was hit: the host status was set to
        #       'RepairFailed' in the epilog of PreJobTask.
        # 2. The task succeeded but other special tasks remain: those tasks
        #    will set the host status appropriately.
        if not self._should_pending():
            return
        self.queue_entry.on_pending()
beeps5e2bb4a2013-10-28 11:26:45 -0700358
359
class RepairTask(agent_task.SpecialAgentTask):
    """Special task of type REPAIR."""

    TASK_TYPE = models.SpecialTask.Task.REPAIR


    def __init__(self, task):
        """Assemble the repair command args and register the host id.

        @param task: Special task to run.  Its queue entry, if any, is the
                one marked failed when this repair fails.
        """
        protections = host_protections.Protection
        # Turn the host's protection value into its normalized attribute
        # name.
        protection_name = protections.get_attr_name(
                protections.get_string(task.host.protection))

        extra_args = ['-R', '--host-protection', protection_name]
        if task.queue_entry:
            extra_args += self._generate_autoserv_label_args(task)

        super(RepairTask, self).__init__(task, extra_args)

        # The queue entry is deliberately *not* part of the ids: should the
        # queue entry get aborted, the repair task is meant to keep running.
        self._set_ids(host=self.host)


    def prolog(self):
        """Mark the host as Repairing."""
        super(RepairTask, self).prolog()
        logging.info("repair_task starting")
        self.host.set_status(models.Host.Status.REPAIRING)


    def epilog(self):
        """Mark the host Ready on success, else RepairFailed and fail HQE."""
        super(RepairTask, self).epilog()

        if self.success:
            self.host.set_status(models.Host.Status.READY)
            return

        self.host.set_status(models.Host.Status.REPAIR_FAILED)
        if self.queue_entry:
            self._fail_queue_entry()