Add job maximum runtime, a new per-job timeout that counts time since the job actually started.

* added started_on field to host_queue_entries, so that we could actually compute this timeout
* added max_runtime_hrs to jobs, with default in global config, and added option to create_job() RPC
* added the usual controls to AFE and the CLI for the new job option
* added new max runtime timeout method to the scheduler's periodic cleanup (scheduler/monitor_db_cleanup.py)
* added migration to add new fields and set a safe default max runtime for existing jobs

Signed-off-by: Steve Howard <showard@google.com>

git-svn-id: http://test.kernel.org/svn/autotest/trunk@3132 592f7852-d20e-0410-864c-8624ca9c26a4

commit: 12f3e3212795a539d95973f893ac570e669e3a22 [log] [tgz]
author: showard <showard@592f7852-d20e-0410-864c-8624ca9c26a4> Wed May 13 21:27:42 2009 +0000
committer: showard <showard@592f7852-d20e-0410-864c-8624ca9c26a4> Wed May 13 21:27:42 2009 +0000
tree: f361d0fd90bd6227c5b635308d7a44bee2a27dbd
parent: 838c747138ffceb8ee26f04e1d4e6a442f127e6a [diff] [blame]
diff --git a/scheduler/monitor_db_cleanup.py b/scheduler/monitor_db_cleanup.py
index 9a55da6..59313c0 100644
--- a/scheduler/monitor_db_cleanup.py
+++ b/scheduler/monitor_db_cleanup.py

@@ -52,6 +52,7 @@
             logging.info('Running periodic cleanup')
             self._abort_timed_out_jobs()
             self._abort_jobs_past_synch_start_timeout()
+            self._abort_jobs_past_max_runtime()
             self._clear_inactive_blocks()
             self._check_for_db_inconsistencies()
 
@@ -88,6 +89,24 @@
                 queue_entry.abort(None)
 
 
+    def _abort_jobs_past_max_runtime(self):
+        """
+        Abort executions that have started and are past the job's max runtime.
+        """
+        logging.info('Aborting all jobs that have passed maximum runtime')
+        rows = self._db.execute("""
+            SELECT hqe.id
+            FROM host_queue_entries AS hqe
+            INNER JOIN jobs ON (hqe.job_id = jobs.id)
+            WHERE NOT hqe.complete AND NOT hqe.aborted AND
+            hqe.started_on + INTERVAL jobs.max_runtime_hrs HOUR < NOW()""")
+        query = models.HostQueueEntry.objects.filter(
+            id__in=[row[0] for row in rows])
+        for queue_entry in query.distinct():
+            logging.warning('Aborting entry %s due to max runtime', queue_entry)
+            queue_entry.abort(None)
+
+
     def _check_for_db_inconsistencies(self):
         logging.info('Checking for db inconsistencies')
         query = models.HostQueueEntry.objects.filter(active=True, complete=True)
commit	12f3e3212795a539d95973f893ac570e669e3a22	[log] [tgz]
author	showard <showard@592f7852-d20e-0410-864c-8624ca9c26a4>	Wed May 13 21:27:42 2009 +0000
committer	showard <showard@592f7852-d20e-0410-864c-8624ca9c26a4>	Wed May 13 21:27:42 2009 +0000
tree	f361d0fd90bd6227c5b635308d7a44bee2a27dbd
parent	838c747138ffceb8ee26f04e1d4e6a442f127e6a [diff] [blame]