* Impose prioritization on SpecialTasks (STs) based on task type: Repair, then Cleanup, then Verify.  Remove the prioritization of STs with a queue entry over those without.  This leads to a saner ordering of execution in certain unusual contexts -- the added functional test cases illustrate a few (in some cases it is not just saner, it eliminates bugs as well).
* Block STs from running on hosts with active host queue entries (HQEs), unless the ST is linked to the HQE.  This is a good check in general, but specifically it prevents a bug where a requested reverify could run on a host in Pending.  There is a functional test case for that too.
* Block jobs from running on hosts with active agents, and let special tasks get scheduled before new jobs in each tick.  This is necessary for some cases after removing the above-mentioned prioritization of STs with HQEs; otherwise, for example, a job could get scheduled before a previous post-job cleanup has run.  (New test cases cover this as well.)

Signed-off-by: Steve Howard <showard@google.com>


git-svn-id: http://test.kernel.org/svn/autotest/trunk@3890 592f7852-d20e-0410-864c-8624ca9c26a4
diff --git a/scheduler/monitor_db_functional_test.py b/scheduler/monitor_db_functional_test.py
index 1008b37..eaa9e5b 100644
--- a/scheduler/monitor_db_functional_test.py
+++ b/scheduler/monitor_db_functional_test.py
@@ -54,6 +54,11 @@
 
 
 class MockDroneManager(NullMethodObject):
+    """
+    Public attributes:
+    max_runnable_processes_value: value returned by max_runnable_processes().
+            tests can change this to activate throttling.
+    """
     _NULL_METHODS = ('reinitialize_drones', 'copy_to_results_repository',
                      'copy_results_on_drone')
 
@@ -72,6 +77,8 @@
 
     def __init__(self):
         super(MockDroneManager, self).__init__()
+        self.max_runnable_processes_value = 100
+
         # maps result_dir to set of tuples (file_path, file_contents)
         self._attached_files = {}
         # maps pidfile IDs to PidfileContents
@@ -98,6 +105,11 @@
         self._set_pidfile_exit_status(pidfile_id, exit_status)
 
 
+    def finish_specific_process(self, working_directory, pidfile_name):
+        pidfile_id = self._pidfile_index[(working_directory, pidfile_name)]
+        self._set_pidfile_exit_status(pidfile_id, 0)
+
+
     def _set_pidfile_exit_status(self, pidfile_id, exit_status):
         assert pidfile_id is not None
         contents = self._pidfiles[pidfile_id]
@@ -128,7 +140,7 @@
 
 
     def max_runnable_processes(self):
-        return 100
+        return self.max_runnable_processes_value
 
 
     def refresh(self):
@@ -652,5 +664,132 @@
                                                 _PidfileType.VERIFY)
 
 
+    def test_job_scheduled_just_after_abort(self):
+        # test a pretty obscure corner case where a job is aborted while queued,
+        # another job is ready to run, and throttling is active. the post-abort
+        # cleanup must not be pre-empted by the second job.
+        job1, queue_entry1 = self._make_job_and_queue_entry()
+        job2, queue_entry2 = self._make_job_and_queue_entry()
+
+        self.mock_drone_manager.max_runnable_processes_value = 0
+        self._run_dispatcher() # schedule job1, but won't start verify
+        job1.hostqueueentry_set.update(aborted=True)
+        self.mock_drone_manager.max_runnable_processes_value = 100
+        self._run_dispatcher() # cleanup must run here, not verify for job2
+        self._check_statuses(queue_entry1, HqeStatus.ABORTED,
+                             HostStatus.CLEANING)
+        self.mock_drone_manager.finish_process(_PidfileType.CLEANUP)
+        self._run_dispatcher() # now verify starts for job2
+        self._check_statuses(queue_entry2, HqeStatus.VERIFYING,
+                             HostStatus.VERIFYING)
+
+
+    def _test_job_scheduled_just_after_abort_2(self):
+        # disabled variant (the leading underscore hides it from unittest) of
+        # test_job_scheduled_just_after_abort: same abort/throttling setup,
+        # but job2 spans hosts 1 and 2 with synch_count = 2. no assertions yet.
+        job1, _ = self._make_job_and_queue_entry()
+        job2 = self._create_job(hosts=[1,2])
+        job2.synch_count = 2
+        job2.save()
+
+        self.mock_drone_manager.max_runnable_processes_value = 0
+        self._run_dispatcher() # schedule job1, but won't start verify
+        job1.hostqueueentry_set.update(aborted=True)
+        self.mock_drone_manager.max_runnable_processes_value = 100
+        self._run_dispatcher()
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_dispatcher()
+        self.mock_drone_manager.finish_process(_PidfileType.CLEANUP)
+        self._run_dispatcher()
+        self.mock_drone_manager.finish_specific_process(
+                'hosts/host2/2-verify', monitor_db._AUTOSERV_PID_FILE)
+        self._run_dispatcher()
+
+
+    def test_reverify_interrupting_pre_job(self):
+        # ensure things behave sanely if a reverify is scheduled in the middle
+        # of pre-job actions
+        _, queue_entry = self._make_job_and_queue_entry()
+
+        self._run_dispatcher() # pre-job verify
+        self._create_reverify_request()
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY,
+                                               exit_status=256)
+        self._run_dispatcher() # repair
+        self.mock_drone_manager.finish_process(_PidfileType.REPAIR)
+        self._run_dispatcher() # reverify runs now
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_dispatcher() # pre-job verify
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_dispatcher() # and job runs...
+        self._check_statuses(queue_entry, HqeStatus.RUNNING, HostStatus.RUNNING)
+        self._finish_job(queue_entry) # reverify has been deleted
+        self._check_statuses(queue_entry, HqeStatus.COMPLETED,
+                             HostStatus.READY)
+        self._assert_nothing_is_running()
+
+
+    def test_reverify_while_job_running(self):
+        # once a job is running, a reverify must not be allowed to preempt
+        # Gathering
+        _, queue_entry = self._make_job_and_queue_entry()
+        self._run_pre_job_verify(queue_entry)
+        self._run_dispatcher() # job runs
+        self._create_reverify_request()
+        # make job end with a signal, so gathering will run
+        self.mock_drone_manager.finish_process(_PidfileType.JOB,
+                                               exit_status=271)
+        self._run_dispatcher() # gathering must start
+        self.mock_drone_manager.finish_process(_PidfileType.GATHER)
+        self._run_dispatcher() # parsing and cleanup
+        self._finish_parsing_and_cleanup()
+        self._run_dispatcher() # now reverify runs
+        self._check_statuses(queue_entry, HqeStatus.FAILED,
+                             HostStatus.VERIFYING)
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_dispatcher()
+        self._check_host_status(queue_entry.host, HostStatus.READY)
+
+
+    def test_reverify_while_host_pending(self):
+        # ensure that if a reverify is scheduled while a host is in Pending, it
+        # won't run until the host is actually free
+        job = self._create_job(hosts=[1,2])
+        queue_entry = job.hostqueueentry_set.get(host__hostname='host1')
+        job.synch_count = 2
+        job.save()
+
+        host2 = self.hosts[1]
+        host2.locked = True
+        host2.save()
+
+        self._run_dispatcher() # verify host1
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_dispatcher() # host1 Pending
+        self._check_statuses(queue_entry, HqeStatus.PENDING, HostStatus.PENDING)
+        self._create_reverify_request()
+        self._run_dispatcher() # nothing should happen here
+        self._check_statuses(queue_entry, HqeStatus.PENDING, HostStatus.PENDING)
+
+        # now let the job run
+        host2.locked = False
+        host2.save()
+        self._run_dispatcher() # verify host2
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_dispatcher() # run job
+        self._finish_job(queue_entry)
+        # need to explicitly finish host1's post-job cleanup
+        self.mock_drone_manager.finish_specific_process(
+                'hosts/host1/4-cleanup', monitor_db._AUTOSERV_PID_FILE)
+        self._run_dispatcher()
+        # the reverify should now be running
+        self._check_statuses(queue_entry, HqeStatus.COMPLETED,
+                             HostStatus.VERIFYING)
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_dispatcher()
+        self._check_host_status(queue_entry.host, HostStatus.READY)
+
+
 if __name__ == '__main__':
     unittest.main()