One-off fix for the issue where a scheduler shutdown immediately after a
special task leaves the HQE in an inconsistent state. Specifically, we
saw this when a cleanup failed and the scheduler shut down before the
associated repair started.
HQEs are now requeued after a failed cleanup/verify.
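
A minimal sketch of the new flow, for illustration only: the base class
and the results-copying step are assumed, and only the final conditional
mirrors the actual change in the diff below.

    class PreJobTask(AgentTask):  # base class assumed for illustration
        def epilog(self):
            super(PreJobTask, self).epilog()
            # ... copy results of the failed cleanup/verify (as in the diff) ...
            if not self.success and self.queue_entry:
                # Requeue immediately when the pre-job task fails, instead of
                # in RepairTask.prolog(), so a scheduler shutdown before the
                # repair starts can no longer strand the HQE.
                self.queue_entry.requeue()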
TODO: reimplement the scheduler to maintain less in-memory state by not
relying on storing an array of AgentTasks.
Risk: medium (scheduler change)
Visibility: medium (scheduler bug fix)
Signed-off-by: James Ren <jamesren@google.com>
git-svn-id: http://test.kernel.org/svn/autotest/trunk@3573 592f7852-d20e-0410-864c-8624ca9c26a4
diff --git a/scheduler/monitor_db.py b/scheduler/monitor_db.py
index d761f1a..54871bc 100755
--- a/scheduler/monitor_db.py
+++ b/scheduler/monitor_db.py
@@ -1686,14 +1686,7 @@
protection = host_protections.Protection.get_attr_name(protection)
self.host = host
- self.queue_entry = None
- # recovery code can pass a HQE that's already been requeued. for a
- # metahost, that means the host has been unassigned. in that case,
- # ignore the HQE.
- hqe_still_assigned_to_this_host = (queue_entry and queue_entry.host
- and queue_entry.host.id == host.id)
- if hqe_still_assigned_to_this_host:
- self.queue_entry = queue_entry
+ self.queue_entry = queue_entry
super(RepairTask, self).__init__(
task, ['-R', '--host-protection', protection],
@@ -1708,8 +1701,6 @@
super(RepairTask, self).prolog()
logging.info("repair_task starting")
self.host.set_status('Repairing')
- if self.queue_entry:
- self.queue_entry.requeue()
def _keyval_path(self):
@@ -1770,6 +1761,9 @@
self.monitor.get_process(), source,
destination_path=destination)
+ if not self.success and self.queue_entry:
+ self.queue_entry.requeue()
+
class VerifyTask(PreJobTask):
TASK_TYPE = models.SpecialTask.Task.VERIFY