| #pylint: disable=C0111 |
| |
| """ |
| Prejob tasks. |
| |
| Prejob tasks _usually_ run before a job and verify the state of a machine. |
| Cleanup and repair are exceptions, cleanup can run after a job too, while |
| repair will run anytime the host needs a repair, which could be pre or post |
| job. Most of the work specific to this module is achieved through the prolog |
| and epilog of each task. |
| |
| All prejob tasks must have a host, though they may not have an HQE. If a |
| prejob task has a hqe, it will activate the hqe through its on_pending |
| method on successful completion. A row in afe_special_tasks with values: |
| host=C1, unlocked, is_active=0, is_complete=0, type=Verify |
| will indicate to the scheduler that it needs to schedule a new special task |
| of type=Verify, against the C1 host. While the special task is running |
| the scheduler only monitors it through the Agent, and its is_active bit=1. |
| Once a special task finishes, we set its is_active=0, is_complete=1 and |
| success bits, so the scheduler ignores it. |
| HQE.on_pending: |
| Host, HQE -> Pending, Starting |
| This status is acted upon in the scheduler, to assign an AgentTask. |
| PreJobTask: |
| epilog: |
| failure: |
| requeue hqe |
| repair the host |
| Children PreJobTasks: |
| prolog: |
| set Host, HQE status |
| epilog: |
| success: |
| on_pending |
| failure: |
| repair throgh PreJobTask |
| set Host, HQE status |
| |
| Failing a prejob task effects both the Host and the HQE, as follows: |
| |
| - Host: PreJob failure will result in a Repair job getting queued against |
| the host, is we haven't already tried repairing it more than the |
| max_repair_limit. When this happens, the host will remain in whatever status |
| the prejob task left it in, till the Repair job puts it into 'Repairing'. This |
| way the host_scheduler won't pick bad hosts and assign them to jobs. |
| |
| If we have already tried repairing the host too many times, the PreJobTask |
| will flip the host to 'RepairFailed' in its epilog, and it will remain in this |
| state till it is recovered and reverified. |
| |
| - HQE: Is either requeued or failed. Requeuing the HQE involves putting it |
| in the Queued state and setting its host_id to None, so it gets a new host |
| in the next scheduler tick. Failing the HQE results in either a Parsing |
| or Archiving postjob task, and an eventual Failed status for the HQE. |
| """ |
| |
| import logging |
| import re |
| |
| from autotest_lib.client.common_lib import host_protections |
| from autotest_lib.frontend.afe import models |
| from autotest_lib.scheduler import agent_task, scheduler_config |
| from autotest_lib.server import autoserv_utils |
| from autotest_lib.server.cros import provision |
| |
| |
| class PreJobTask(agent_task.SpecialAgentTask): |
| def epilog(self): |
| super(PreJobTask, self).epilog() |
| |
| if self.host.protection == host_protections.Protection.DO_NOT_VERIFY: |
| # effectively ignore failure for these hosts |
| self.success = True |
| |
| if self.success: |
| self.host.record_working_state(True, |
| self.task.time_finished) |
| return |
| |
| if self.queue_entry: |
| # If we requeue a HQE, we should cancel any remaining pre-job |
| # tasks against this host, otherwise we'll be left in a state |
| # where a queued HQE has special tasks to run against a host. |
| models.SpecialTask.objects.filter( |
| queue_entry__id=self.queue_entry.id, |
| host__id=self.host.id, |
| is_complete=0).update(is_complete=1, success=0) |
| |
| previous_provisions = models.SpecialTask.objects.filter( |
| task=models.SpecialTask.Task.PROVISION, |
| queue_entry_id=self.queue_entry.id).count() |
| if (previous_provisions > |
| scheduler_config.config.max_provision_retries): |
| self._actually_fail_queue_entry() |
| # This abort will mark the aborted bit on the HQE itself, to |
| # signify that we're killing it. Technically it also will do |
| # the recursive aborting of all child jobs, but that shouldn't |
| # matter here, as only suites have children, and those are |
| # hostless and thus don't have provisioning. |
| # TODO(milleral) http://crbug.com/188217 |
| # However, we can't actually do this yet, as if we set the |
| # abort bit the FinalReparseTask will set the status of the HQE |
| # to ABORTED, which then means that we don't show the status in |
| # run_suite. So in the meantime, don't mark the HQE as |
| # aborted. |
| # queue_entry.abort() |
| else: |
| # requeue() must come after handling provision retries, since |
| # _actually_fail_queue_entry needs an execution subdir. |
| # We also don't want to requeue if we hit the provision retry |
| # limit, since then we overwrite the PARSING state of the HQE. |
| self.queue_entry.requeue() |
| |
| # Limit the repair on a host when a prejob task fails, e.g., reset, |
| # verify etc. The number of repair jobs is limited to the specific |
| # HQE and host. |
| previous_repairs = models.SpecialTask.objects.filter( |
| task=models.SpecialTask.Task.REPAIR, |
| queue_entry_id=self.queue_entry.id, |
| host_id=self.queue_entry.host_id).count() |
| if previous_repairs >= scheduler_config.config.max_repair_limit: |
| self.host.set_status(models.Host.Status.REPAIR_FAILED) |
| self._fail_queue_entry() |
| return |
| |
| queue_entry = models.HostQueueEntry.objects.get( |
| id=self.queue_entry.id) |
| else: |
| queue_entry = None |
| |
| models.SpecialTask.objects.create( |
| host=models.Host.objects.get(id=self.host.id), |
| task=models.SpecialTask.Task.REPAIR, |
| queue_entry=queue_entry, |
| requested_by=self.task.requested_by) |
| |
| |
| def _should_pending(self): |
| """ |
| Decide if we should call the host queue entry's on_pending method. |
| We should if: |
| 1) There exists an associated host queue entry. |
| 2) The current special task completed successfully. |
| 3) There do not exist any more special tasks to be run before the |
| host queue entry starts. |
| |
| @returns: True if we should call pending, false if not. |
| |
| """ |
| if not self.queue_entry or not self.success: |
| return False |
| |
| # We know if this is the last one when we create it, so we could add |
| # another column to the database to keep track of this information, but |
| # I expect the overhead of querying here to be minimal. |
| queue_entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id) |
| queued = models.SpecialTask.objects.filter( |
| host__id=self.host.id, is_active=False, |
| is_complete=False, queue_entry=queue_entry) |
| queued = queued.exclude(id=self.task.id) |
| return queued.count() == 0 |
| |
| |
| class VerifyTask(PreJobTask): |
| TASK_TYPE = models.SpecialTask.Task.VERIFY |
| |
| |
| def __init__(self, task): |
| args = ['-v'] |
| if task.queue_entry: |
| args.extend(self._generate_autoserv_label_args(task)) |
| super(VerifyTask, self).__init__(task, args) |
| self._set_ids(host=self.host, queue_entries=[self.queue_entry]) |
| |
| |
| def prolog(self): |
| super(VerifyTask, self).prolog() |
| |
| logging.info("starting verify on %s", self.host.hostname) |
| if self.queue_entry: |
| self.queue_entry.set_status(models.HostQueueEntry.Status.VERIFYING) |
| self.host.set_status(models.Host.Status.VERIFYING) |
| |
| # Delete any queued manual reverifies for this host. One verify will do |
| # and there's no need to keep records of other requests. |
| self.remove_special_tasks(models.SpecialTask.Task.VERIFY, |
| keep_last_one=True) |
| |
| |
| def epilog(self): |
| super(VerifyTask, self).epilog() |
| if self.success: |
| if self._should_pending(): |
| self.queue_entry.on_pending() |
| else: |
| self.host.set_status(models.Host.Status.READY) |
| |
| |
| class CleanupTask(PreJobTask): |
| # note this can also run post-job, but when it does, it's running standalone |
| # against the host (not related to the job), so it's not considered a |
| # PostJobTask |
| |
| TASK_TYPE = models.SpecialTask.Task.CLEANUP |
| |
| |
| def __init__(self, task, recover_run_monitor=None): |
| args = ['--cleanup'] |
| if task.queue_entry: |
| args.extend(self._generate_autoserv_label_args(task)) |
| super(CleanupTask, self).__init__(task, args) |
| self._set_ids(host=self.host, queue_entries=[self.queue_entry]) |
| |
| |
| def prolog(self): |
| super(CleanupTask, self).prolog() |
| logging.info("starting cleanup task for host: %s", self.host.hostname) |
| self.host.set_status(models.Host.Status.CLEANING) |
| if self.queue_entry: |
| self.queue_entry.set_status(models.HostQueueEntry.Status.CLEANING) |
| |
| |
| def _finish_epilog(self): |
| if not self.queue_entry or not self.success: |
| return |
| |
| do_not_verify_protection = host_protections.Protection.DO_NOT_VERIFY |
| should_run_verify = ( |
| self.queue_entry.job.run_verify |
| and self.host.protection != do_not_verify_protection) |
| if should_run_verify: |
| entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id) |
| models.SpecialTask.objects.create( |
| host=models.Host.objects.get(id=self.host.id), |
| queue_entry=entry, |
| task=models.SpecialTask.Task.VERIFY) |
| else: |
| if self._should_pending(): |
| self.queue_entry.on_pending() |
| |
| |
| def epilog(self): |
| super(CleanupTask, self).epilog() |
| |
| if self.success: |
| self.host.update_field('dirty', 0) |
| self.host.set_status(models.Host.Status.READY) |
| |
| self._finish_epilog() |
| |
| |
| class ResetTask(PreJobTask): |
| """Task to reset a DUT, including cleanup and verify.""" |
| # note this can also run post-job, but when it does, it's running standalone |
| # against the host (not related to the job), so it's not considered a |
| # PostJobTask |
| |
| TASK_TYPE = models.SpecialTask.Task.RESET |
| |
| |
| def __init__(self, task, recover_run_monitor=None): |
| args = ['--reset'] |
| if task.queue_entry: |
| args.extend(self._generate_autoserv_label_args(task)) |
| super(ResetTask, self).__init__(task, args) |
| self._set_ids(host=self.host, queue_entries=[self.queue_entry]) |
| |
| |
| def prolog(self): |
| super(ResetTask, self).prolog() |
| logging.info('starting reset task for host: %s', |
| self.host.hostname) |
| self.host.set_status(models.Host.Status.RESETTING) |
| if self.queue_entry: |
| self.queue_entry.set_status(models.HostQueueEntry.Status.RESETTING) |
| |
| # Delete any queued cleanups for this host. |
| self.remove_special_tasks(models.SpecialTask.Task.CLEANUP, |
| keep_last_one=False) |
| |
| # Delete any queued reverifies for this host. |
| self.remove_special_tasks(models.SpecialTask.Task.VERIFY, |
| keep_last_one=False) |
| |
| # Only one reset is needed. |
| self.remove_special_tasks(models.SpecialTask.Task.RESET, |
| keep_last_one=True) |
| |
| |
| def epilog(self): |
| super(ResetTask, self).epilog() |
| |
| if self.success: |
| self.host.update_field('dirty', 0) |
| |
| if self._should_pending(): |
| self.queue_entry.on_pending() |
| else: |
| self.host.set_status(models.Host.Status.READY) |
| |
| |
| # TODO (ayatane): Refactor using server/cros/provision |
| def _is_cros_version(label): |
| """Return whether the label is a cros-version: label.""" |
| return label.startswith('cros-version:') |
| |
| |
| # TODO (ayatane): Refactor using server/cros/provision |
| def _get_cros_version(label): |
| """Return cros-version from cros-version label.""" |
| return label[len('cros-version:'):] |
| |
| |
| # TODO (ayatane): Refactor into server/cros/provision |
| class _CrosImage(object): |
| """The name of a CrOS image.""" |
| |
| _name_pattern = re.compile( |
| r'^' |
| r'(?P<group>[a-z0-9-]+)' |
| r'/' |
| r'(?P<milestone>LATEST|R[0-9]+)' |
| r'-' |
| r'(?P<version>[0-9.]+)' |
| r'(-(?P<rc>rc[0-9]+))?' |
| r'$' |
| ) |
| |
| def __init__(self, name): |
| """Initialize instance. |
| |
| @param name: Image name string (lumpy-release/R27-3773.0.0) |
| """ |
| self._name = name |
| match = self._name_pattern.search(name) |
| self.group = match.group('group') |
| self.milestone = match.group('milestone') |
| self.version = match.group('version') |
| self.rc = match.group('rc') |
| |
| def __repr__(self): |
| return '{cls}({name!r})'.format(cls=type(self).__name__, |
| name=self._name) |
| |
| def __str__(self): |
| return self._name |
| |
| |
| class ProvisionTask(PreJobTask): |
| TASK_TYPE = models.SpecialTask.Task.PROVISION |
| |
| def __init__(self, task): |
| # Provisioning requires that we be associated with a job/queue entry |
| assert task.queue_entry, "No HQE associated with provision task!" |
| # task.queue_entry is an afe model HostQueueEntry object. |
| # self.queue_entry is a scheduler models HostQueueEntry object, but |
| # it gets constructed and assigned in __init__, so it's not available |
| # yet. Therefore, we're stuck pulling labels off of the afe model |
| # so that we can pass the --provision args into the __init__ call. |
| labels = {x.name for x in task.queue_entry.job.labels} |
| _, provisionable = provision.Provision.partition(labels) |
| extra_command_args = ['--provision', |
| '--job-labels', ','.join(provisionable)] |
| super(ProvisionTask, self).__init__(task, extra_command_args) |
| self._set_milestone(labels) |
| self._set_ids(host=self.host, queue_entries=[self.queue_entry]) |
| |
| def _set_milestone(self, labels): |
| """Set build milestone from the labels. |
| |
| @param labels: iterable of labels. |
| """ |
| labels = (label |
| for label in labels |
| if _is_cros_version(label)) |
| for label in labels: |
| cros_image = _CrosImage(_get_cros_version(label)) |
| self._milestone = cros_image.milestone |
| break |
| |
| def _command_line(self): |
| # If we give queue_entry to _autoserv_command_line, then it will append |
| # -c for this invocation if the queue_entry is a client side test. We |
| # don't want that, as it messes with provisioning, so we just drop it |
| # from the arguments here. |
| # Note that we also don't verify job_repo_url as provisioining tasks are |
| # required to stage whatever content we need, and the job itself will |
| # force autotest to be staged if it isn't already. |
| return autoserv_utils._autoserv_command_line(self.host.hostname, |
| self._extra_command_args, |
| in_lab=True) |
| |
| |
| def prolog(self): |
| super(ProvisionTask, self).prolog() |
| # add check for previous provision task and abort if exist. |
| logging.info("starting provision task for host: %s", self.host.hostname) |
| self.queue_entry.set_status( |
| models.HostQueueEntry.Status.PROVISIONING) |
| self.host.set_status(models.Host.Status.PROVISIONING) |
| |
| |
| def epilog(self): |
| super(ProvisionTask, self).epilog() |
| |
| # If we were not successful in provisioning the machine |
| # leave the DUT in whatever status was set in the PreJobTask's |
| # epilog. If this task was successful the host status will get |
| # set appropriately as a fallout of the hqe's on_pending. If |
| # we don't call on_pending, it can only be because: |
| # 1. This task was not successful: |
| # a. Another repair is queued: this repair job will set the host |
| # status, and it will remain in 'Provisioning' till then. |
| # b. We have hit the max_repair_limit: in which case the host |
| # status is set to 'RepairFailed' in the epilog of PreJobTask. |
| # 2. The task was successful, but there are other special tasks: |
| # Those special tasks will set the host status appropriately. |
| if self._should_pending(): |
| self.queue_entry.on_pending() |
| |
| |
| class RepairTask(agent_task.SpecialAgentTask): |
| TASK_TYPE = models.SpecialTask.Task.REPAIR |
| |
| |
| def __init__(self, task): |
| """\ |
| queue_entry: queue entry to mark failed if this repair fails. |
| """ |
| protection = host_protections.Protection.get_string( |
| task.host.protection) |
| # normalize the protection name |
| protection = host_protections.Protection.get_attr_name(protection) |
| |
| args = ['-R', '--host-protection', protection] |
| if task.queue_entry: |
| args.extend(self._generate_autoserv_label_args(task)) |
| |
| super(RepairTask, self).__init__(task, args) |
| |
| # *don't* include the queue entry in IDs -- if the queue entry is |
| # aborted, we want to leave the repair task running |
| self._set_ids(host=self.host) |
| |
| |
| def prolog(self): |
| super(RepairTask, self).prolog() |
| logging.info("repair_task starting") |
| self.host.set_status(models.Host.Status.REPAIRING) |
| |
| |
| def epilog(self): |
| super(RepairTask, self).epilog() |
| |
| if self.success: |
| self.host.set_status(models.Host.Status.READY) |
| else: |
| self.host.set_status(models.Host.Status.REPAIR_FAILED) |
| if self.queue_entry: |
| self._fail_queue_entry() |
| self.host.record_working_state(bool(self.success), |
| self.task.time_finished) |