Treat unrecoverable host queue entries as a fatal error. Their existence
means we've got a consistency problem that needs human intervention to clean
up.
This can happen when a previously running monitor_db dies within a race
condition window, such as a host_queue_entry being created without its
corresponding special_task entry existing yet. (Such races are actively
being looked at and fixed.)
Signed-off-by: Gregory Smith <gps@google.com>
git-svn-id: http://test.kernel.org/svn/autotest/trunk@3569 592f7852-d20e-0410-864c-8624ca9c26a4
diff --git a/scheduler/monitor_db.py b/scheduler/monitor_db.py
index 2cf2b10..1368295 100755
--- a/scheduler/monitor_db.py
+++ b/scheduler/monitor_db.py
@@ -912,12 +912,13 @@
where='active AND NOT complete AND '
'(aborted OR status != "Pending")')
- message = '\n'.join(str(entry) for entry in queue_entries
- if not self.get_agents_for_entry(entry))
- if message:
- email_manager.manager.enqueue_notify_email(
- 'Unrecovered active host queue entries exist',
- message)
+ unrecovered_active_hqes = [entry for entry in queue_entries
+ if not self.get_agents_for_entry(entry)]
+ if unrecovered_active_hqes:
+ message = '\n'.join(str(hqe) for hqe in unrecovered_active_hqes)
+ raise SchedulerError(
+ '%d unrecovered active host queue entries:\n%s' %
+ (len(unrecovered_active_hqes), message))
def _find_reverify(self):