Add tests to the scheduler functional test covering pre- and post-job cleanup, including failure cases.

Signed-off-by: Steve Howard <showard@google.com>


git-svn-id: http://test.kernel.org/svn/autotest/trunk@3871 592f7852-d20e-0410-864c-8624ca9c26a4
diff --git a/scheduler/monitor_db_functional_test.py b/scheduler/monitor_db_functional_test.py
index 9ec510e..2921d50 100644
--- a/scheduler/monitor_db_functional_test.py
+++ b/scheduler/monitor_db_functional_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 
-import logging, unittest
+import logging, os, unittest
 import common
 from autotest_lib.client.common_lib import enum, global_config
 from autotest_lib.database import database_connection
@@ -17,6 +17,9 @@
         _re_translator(r'GROUP_CONCAT\((.*?)\)', r'\1'),
 )
 
+HqeStatus = models.HostQueueEntry.Status
+HostStatus = models.Host.Status
+
 class NullMethodObject(object):
     _NULL_METHODS = ()
 
@@ -51,8 +54,21 @@
 
 
 class MockDroneManager(NullMethodObject):
-    _NULL_METHODS = ('refresh', 'reinitialize_drones',
-                     'copy_to_results_repository')
+    _NULL_METHODS = ('reinitialize_drones', 'copy_to_results_repository',
+                     'copy_results_on_drone')
+
+    class _DummyPidfileId(object):
+        """
+        Object representing a pidfile ID that is opaque to the scheduler code
+        but still debugging-friendly for us.
+        """
+        def __init__(self, debug_string):
+            self._debug_string = debug_string
+
+
+        def __str__(self):
+            return self._debug_string
+
 
     def __init__(self):
         super(MockDroneManager, self).__init__()
@@ -70,6 +86,9 @@
         self._process_index = {}
         # tracks pidfiles of processes that have been killed
         self._killed_pidfiles = set()
+        # pidfile IDs that have just been unregistered (so will disappear on the
+        # next cycle)
+        self._unregistered_pidfiles = set()
 
 
     # utility APIs for use by the test
@@ -91,6 +110,13 @@
         return pidfile_id in self._killed_pidfiles
 
 
+    def running_pidfile_ids(self):
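+        # debug strings for all pidfiles whose process has started but has not
+        # yet reported an exit status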
+        return [str(pidfile_id) for pidfile_id, pidfile_contents
+                in self._pidfiles.iteritems()
+                if pidfile_contents.process is not None
+                and pidfile_contents.exit_status is None]
+
+
     # DroneManager emulation APIs for use by monitor_db
 
     def get_orphaned_autoserv_processes(self):
@@ -105,6 +131,13 @@
         return 100
 
 
+    def refresh(self):
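+        # drop pidfiles that were unregistered since the last cycle; they only
+        # disappear on the next refresh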
+        for pidfile_id in self._unregistered_pidfiles:
+            # intentionally handle non-registered pidfiles silently
+            self._pidfiles.pop(pidfile_id, None)
+        self._unregistered_pidfiles = set()
+
+
     def execute_actions(self):
         # executing an "execute_command" causes a pidfile to be created
         for pidfile_id in self._future_pidfiles:
@@ -146,7 +179,8 @@
 
     def execute_command(self, command, working_directory, pidfile_name,
                         log_file=None, paired_with_pidfile=None):
-        pidfile_id = object() # PidfileIds are opaque to monitor_db
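+        # pidfile IDs are opaque to monitor_db; a dummy object carrying a debug
+        # string keeps test failures readable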
+        pidfile_id = self._DummyPidfileId(
+                self._get_pidfile_debug_string(working_directory, pidfile_name))
         self._future_pidfiles.append(pidfile_id)
         self._initialize_pidfile(pidfile_id)
         self._pidfile_index[(working_directory, pidfile_name)] = pidfile_id
@@ -154,9 +188,14 @@
         return pidfile_id
 
 
+    def _get_pidfile_debug_string(self, working_directory, pidfile_name):
+        return os.path.join(working_directory, pidfile_name)
+
+
     def get_pidfile_contents(self, pidfile_id, use_second_read=False):
-        return self._pidfiles.get(pidfile_id,
-                                           drone_manager.PidfileContents())
+        if pidfile_id not in self._pidfiles:
+            print 'Request for nonexistent pidfile %s' % pidfile_id
+        return self._pidfiles.get(pidfile_id, drone_manager.PidfileContents())
 
 
     def is_process_running(self, process):
@@ -168,8 +207,7 @@
 
 
     def unregister_pidfile(self, pidfile_id):
-        # intentionally handle non-registered pidfiles silently
-        self._pidfiles.pop(pidfile_id, None)
+        self._unregistered_pidfiles.add(pidfile_id)
 
 
     def absolute_path(self, path):
@@ -182,7 +220,11 @@
 
 
     def get_pidfile_id_from(self, execution_tag, pidfile_name):
-        return self._pidfile_index.get((execution_tag, pidfile_name), object())
+        debug_string = ('Nonexistent pidfile: '
+                        + self._get_pidfile_debug_string(execution_tag,
+                                                         pidfile_name))
+        return self._pidfile_index.get((execution_tag, pidfile_name),
+                                       self._DummyPidfileId(debug_string))
 
 
     def kill_process(self, process):
@@ -263,24 +305,136 @@
                      '%s/%s not executed' % (working_directory, pidfile_name))
 
 
+    def _check_statuses(self, queue_entry, queue_entry_status, host_status):
+        # update from DB
+        queue_entry = models.HostQueueEntry.objects.get(id=queue_entry.id)
+        self.assertEquals(queue_entry.status, queue_entry_status)
+        self.assertEquals(queue_entry.host.status, host_status)
+
+
+    def _run_pre_job_verify(self, queue_entry):
+        self._run_dispatcher() # launches verify
+        self._check_statuses(queue_entry, HqeStatus.VERIFYING,
+                             HostStatus.VERIFYING)
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+
+
     def test_simple_job(self):
         self._initialize_test()
         job, queue_entry = self._make_job_and_queue_entry()
-        self._run_dispatcher() # launches verify
-        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_pre_job_verify(queue_entry)
         self._run_dispatcher() # launches job
-        self._finish_job()
-
-        # update from DB
-        queue_entry = models.HostQueueEntry.objects.get(id=queue_entry.id)
-        self.assertEquals(queue_entry.status,
-                          models.HostQueueEntry.Status.COMPLETED)
-        self.assertEquals(queue_entry.host.status, models.Host.Status.READY)
+        self._check_statuses(queue_entry, HqeStatus.RUNNING, HostStatus.RUNNING)
+        self._finish_job(queue_entry)
+        self._check_statuses(queue_entry, HqeStatus.COMPLETED, HostStatus.READY)
+        self._assert_nothing_is_running()
 
 
-    def _finish_job(self):
+    def _setup_for_pre_job_cleanup(self):
+        self._initialize_test()
+        job, queue_entry = self._make_job_and_queue_entry()
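+        # RebootBefore.ALWAYS forces a pre-job cleanup to run before verify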
+        job.reboot_before = models.RebootBefore.ALWAYS
+        job.save()
+        return queue_entry
+
+
+    def _run_pre_job_cleanup_job(self, queue_entry):
+        self._run_dispatcher() # cleanup
+        self._check_statuses(queue_entry, HqeStatus.VERIFYING,
+                             HostStatus.CLEANING)
+        self.mock_drone_manager.finish_process(_PidfileType.CLEANUP)
+        self._run_dispatcher() # verify
+        self.mock_drone_manager.finish_process(_PidfileType.VERIFY)
+        self._run_dispatcher() # job
+        self._finish_job(queue_entry)
+
+
+    def test_pre_job_cleanup(self):
+        queue_entry = self._setup_for_pre_job_cleanup()
+        self._run_pre_job_cleanup_job(queue_entry)
+
+
+    def _run_pre_job_cleanup_one_failure(self):
+        queue_entry = self._setup_for_pre_job_cleanup()
+        self._run_dispatcher() # cleanup
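+        # a nonzero exit status simulates a failed cleanup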
+        self.mock_drone_manager.finish_process(_PidfileType.CLEANUP,
+                                               exit_status=256)
+        self._run_dispatcher() # repair
+        self._check_statuses(queue_entry, HqeStatus.QUEUED,
+                             HostStatus.REPAIRING)
+        self.mock_drone_manager.finish_process(_PidfileType.REPAIR)
+        return queue_entry
+
+
+    def test_pre_job_cleanup_failure(self):
+        queue_entry = self._run_pre_job_cleanup_one_failure()
+        # from here the job should run as normal
+        self._run_pre_job_cleanup_job(queue_entry)
+
+
+    def test_pre_job_cleanup_double_failure(self):
+        # TODO (showard): this test isn't perfect.  In reality, when the second
+        # cleanup fails, it copies its results over to the job directory using
+        # copy_results_on_drone() and then parses them.  Since we don't emulate
+        # that here, there appear to be no results in the job directory.  The
+        # scheduler handles this gracefully, parsing is effectively skipped,
+        # and this test passes as-is.  But we ought to test that behavior
+        # properly.
+        queue_entry = self._run_pre_job_cleanup_one_failure()
+        self._run_dispatcher() # second cleanup
+        self.mock_drone_manager.finish_process(_PidfileType.CLEANUP,
+                                               exit_status=256)
+        self._run_dispatcher()
+        self._check_statuses(queue_entry, HqeStatus.FAILED,
+                             HostStatus.REPAIR_FAILED)
+        # nothing else should run
+        self._assert_nothing_is_running()
+
+
+    def _assert_nothing_is_running(self):
+        self.assertEquals(self.mock_drone_manager.running_pidfile_ids(), [])
+
+
+    def _run_post_job_cleanup_failure_up_to_repair(self):
+        self._initialize_test()
+        job, queue_entry = self._make_job_and_queue_entry()
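+        # RebootAfter.ALWAYS forces a post-job cleanup to run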
+        job.reboot_after = models.RebootAfter.ALWAYS
+        job.save()
+
+        self._run_pre_job_verify(queue_entry)
+        self._run_dispatcher() # job
+        self.mock_drone_manager.finish_process(_PidfileType.JOB)
+        self._run_dispatcher() # parsing + cleanup
+        self.mock_drone_manager.finish_process(_PidfileType.PARSE)
+        self.mock_drone_manager.finish_process(_PidfileType.CLEANUP,
+                                               exit_status=256)
+        self._run_dispatcher() # repair, HQE unaffected
+        self._check_statuses(queue_entry, HqeStatus.COMPLETED,
+                             HostStatus.REPAIRING)
+        return queue_entry
+
+
+    def test_post_job_cleanup_failure(self):
+        queue_entry = self._run_post_job_cleanup_failure_up_to_repair()
+        self.mock_drone_manager.finish_process(_PidfileType.REPAIR)
+        self._run_dispatcher()
+        self._check_statuses(queue_entry, HqeStatus.COMPLETED, HostStatus.READY)
+
+
+    def test_post_job_cleanup_failure_repair_failure(self):
+        queue_entry = self._run_post_job_cleanup_failure_up_to_repair()
+        self.mock_drone_manager.finish_process(_PidfileType.REPAIR,
+                                               exit_status=256)
+        self._run_dispatcher()
+        self._check_statuses(queue_entry, HqeStatus.COMPLETED,
+                             HostStatus.REPAIR_FAILED)
+
+
+    def _finish_job(self, queue_entry):
         self.mock_drone_manager.finish_process(_PidfileType.JOB)
         self._run_dispatcher() # launches parsing + cleanup
+        self._check_statuses(queue_entry, HqeStatus.PARSING,
+                             HostStatus.CLEANING)
         self._finish_parsing_and_cleanup()
 
 
@@ -339,22 +493,22 @@
     def test_recover_running_no_process(self):
         # recovery should re-execute a Running HQE if no process is found
         _, queue_entry = self._make_job_and_queue_entry()
-        queue_entry.status = models.HostQueueEntry.Status.RUNNING
+        queue_entry.status = HqeStatus.RUNNING
         queue_entry.execution_subdir = '1-myuser/host1'
         queue_entry.save()
-        queue_entry.host.status = models.Host.Status.RUNNING
+        queue_entry.host.status = HostStatus.RUNNING
         queue_entry.host.save()
 
         self._initialize_test()
         self._run_dispatcher()
-        self._finish_job()
+        self._finish_job(queue_entry)
 
 
     def test_recover_verifying_hqe_no_special_task(self):
         # recovery should fail on a Verifying HQE with no corresponding
         # Verify or Cleanup SpecialTask
         _, queue_entry = self._make_job_and_queue_entry()
-        queue_entry.status = models.HostQueueEntry.Status.VERIFYING
+        queue_entry.status = HqeStatus.VERIFYING
         queue_entry.save()
 
         # make some dummy SpecialTasks that shouldn't count
@@ -370,7 +524,7 @@
 
     def _test_recover_verifying_hqe_helper(self, task, pidfile_type):
         _, queue_entry = self._make_job_and_queue_entry()
-        queue_entry.status = models.HostQueueEntry.Status.VERIFYING
+        queue_entry.status = HqeStatus.VERIFYING
         queue_entry.save()
 
         special_task = models.SpecialTask.objects.create(