This patch enables the scheduler to pick up jobs that were left running when it crashed, and to see them through to completion.

-autoserv writes .autoserv_execute file with pid + exit status
-PidfileRunMonitor class in monitor_db uses this file to track running autoserv processes, double-checks with /proc filesystem
-monitor_db default recovery changed to
  -pick up running QueueEntry processes
  -kill other autoserv processes
  -requeue verifying queue entries
  -reverify hosts that were left in verify/repair
-monitor_db optional host recovery flag verifies Repair Failed hosts, and looks for running hosts with no corresponding active queue entries

The --no-recover scheduler option has been removed.  The scheduler always recovers running jobs when starting up.  There is a new --recover-hosts option, which forces the scheduler to reverify all dead hosts.  Shutdown mode has also been removed from the scheduler, since the scheduler may now be upgraded by simply killing and restarting it.

Signed-off-by: Steve Howard <showard@google.com>



git-svn-id: http://test.kernel.org/svn/autotest/trunk@1332 592f7852-d20e-0410-864c-8624ca9c26a4
diff --git a/server/autoserv b/server/autoserv
index 931f3c0..4fc5a3d 100755
--- a/server/autoserv
+++ b/server/autoserv
@@ -15,13 +15,36 @@
 
 import sys, os, re, server_job, hosts.site_host, utils, traceback, signal
 
+class PidFileManager(object):
+	pid_file = None
+
+	def open_pid_file(self, results_dir):
+		pid_file_path = os.path.join(results_dir, '.autoserv_execute')
+		assert not os.path.exists(pid_file_path)
+		self.pid_file = open(pid_file_path, 'w')
+		self.pid_file.write(str(os.getpid()) + '\n')
+		self.pid_file.flush()
+
+
+	def close_pid_file(self, exit_code, signal_code=0):
+		if not self.pid_file:
+			return
+		real_exit_code = (exit_code << 8) | (signal_code & 0xFF)
+		self.pid_file.write(str(real_exit_code) + '\n')
+		self.pid_file.close()
+
+
+pid_file_manager = PidFileManager()
+
+
 # Create separate process group
 os.setpgrp()
 
-# Implement SIGTERM handler	
+# Implement SIGTERM handler
 def handle_sigint(signum, frame):
+	pid_file_manager.close_pid_file(1, signal.SIGTERM)
 	os.killpg(os.getpgrp(), signal.SIGKILL)
-	
+
 # Set signal handler
 signal.signal(signal.SIGTERM, handle_sigint)
 
@@ -40,6 +63,7 @@
 	[-v]                       # verify the machines only
 	[-R]                       # repair the machines
 	[-n]                       # no teeing the status to stdout/stderr
+	[-p]                       # write pidfile (.autoserv_execute)
 	<control file>             # name of the control file to run
 	[args ...]                 # args to pass through to the control file
 """
@@ -65,6 +89,7 @@
 verify   = parser.parse_opts('-v')
 repair   = parser.parse_opts('-R')
 no_tee   = parser.parse_opts('-n')
+write_pidfile = parser.parse_opts('-p')
 
 
 if len(parser.args) < 1 and not verify and not repair:
@@ -104,19 +129,24 @@
 	job.stdout.tee_redirect(os.path.join(debug_dir, 'autoserv.stdout'))
 	job.stderr.tee_redirect(os.path.join(debug_dir, 'autoserv.stderr'))
 
-if repair:
-	job.repair()
-	sys.exit(0)
-elif verify:
-	job.verify()
-	sys.exit(0)
+if write_pidfile:
+	pid_file_manager.open_pid_file(results)
 
+# run the job
+exit_code = 0
 try:
-	job.run(reboot, install_before, install_after)
+	if repair:
+		job.repair()
+	elif verify:
+		job.verify()
+	else:
+		job.run(reboot, install_before, install_after)
 except:
 	job.aborted = True
 	traceback.print_exc()
 
-# if the job was aborted, return a non-zero error code
 if getattr(job, 'aborted', False):
-	sys.exit(1)
+	exit_code = 1
+pid_file_manager.close_pid_file(exit_code)
+
+sys.exit(exit_code)