[autotest] Queue calls in drone after drone refresh.

Drone refresh is not done in a thread-safe fashion. The scheduler starts the
refresh at the beginning of the tick, performs a couple of other operations,
and then waits for the refresh to finish. When the refresh starts, it executes
all calls queued in the drones using drone_utils. After drone_utils finishes
processing the calls, the scheduler empties the queued calls in the drones.

That means any calls added between the start of the drone refresh and its
completion will be removed without ever being called.

This CL moves the cleanup call after the drone refresh and adds a comment
about potential future issues. A better fix would address the root cause: for
example, add a tracker to each drone's call queue so that, after the drone
refresh is done, only the calls processed within that refresh are cleared.
crbug.com/484715 is filed to track this issue.

BUG=chromium:484039
TEST=local scheduler run, make sure lxc_cleanup is kicked off and finished.

Change-Id: I1bb3229a3da578299949a00af25b3d4674eeed4b
Reviewed-on: https://chromium-review.googlesource.com/269255
Trybot-Ready: Dan Shi <dshi@chromium.org>
Tested-by: Dan Shi <dshi@chromium.org>
Reviewed-by: Richard Barnette <jrbarnette@chromium.org>
Reviewed-by: Simran Basi <sbasi@chromium.org>
Commit-Queue: Dan Shi <dshi@chromium.org>
diff --git a/scheduler/drone_manager.py b/scheduler/drone_manager.py
index e74cb72..d4e95d5 100644
--- a/scheduler/drone_manager.py
+++ b/scheduler/drone_manager.py
@@ -307,19 +307,10 @@
     def cleanup_orphaned_containers(self):
         """Queue cleanup_orphaned_containers call at each drone.
         """
-        drones = list(self.get_drones())
-        for drone in drones:
-            logging.info('Queue cleanup_orphaned_containers at %s', drone)
+        for drone in self._drones.values():
+            logging.info('Queue cleanup_orphaned_containers at %s',
+                         drone.hostname)
             drone.queue_call('cleanup_orphaned_containers')
-        with self._timer.get_client('cleanup_orphaned_containers'):
-            # Each task will start a new process of lxc_cleanup in drone and
-            # exit, the wait time is about 2-3 seconds at most. If this call
-            # does not wait, the drone refresh may have a race condition when
-            # it tries to process all queued calls in a different thread. The
-            # race condition will lead to scheduler crash. Therefore, the tasks
-            # queued here will be waited for finishing. Considering it will
-            # only be called once a day, the overhead should be minimum.
-            self._refresh_task_queue.execute(drones, wait=True)
 
 
     def _get_drone_for_process(self, process):
diff --git a/scheduler/drones.py b/scheduler/drones.py
index c663e26..bd4dbab 100644
--- a/scheduler/drones.py
+++ b/scheduler/drones.py
@@ -127,6 +127,7 @@
     def queue_call(self, method, *args, **kwargs):
         self._calls.append(drone_utility.call(method, *args, **kwargs))
 
+
     def clear_call_queue(self):
         self._calls = []
 
diff --git a/scheduler/monitor_db.py b/scheduler/monitor_db.py
index 63d5e36..09d7dd1 100755
--- a/scheduler/monitor_db.py
+++ b/scheduler/monitor_db.py
@@ -302,6 +302,10 @@
     def initialize(self, recover_hosts=True):
         self._periodic_cleanup.initialize()
         self._24hr_upkeep.initialize()
+        # Execute all actions queued in the cleanup tasks. Scheduler tick will
+        # run a refresh task first. If there is any action in the queue, refresh
+        # will raise an exception.
+        _drone_manager.execute_actions()
 
         # always recover processes
         self._recover_processes()
@@ -331,8 +335,6 @@
         self._garbage_collection()
         self._log_tick_msg('Calling _drone_manager.trigger_refresh().')
         _drone_manager.trigger_refresh()
-        self._log_tick_msg('Calling _run_cleanup().')
-        self._run_cleanup()
         self._log_tick_msg('Calling _process_recurring_runs().')
         self._process_recurring_runs()
         self._log_tick_msg('Calling _schedule_delay_tasks().')
@@ -345,6 +347,14 @@
         self._schedule_new_jobs()
         self._log_tick_msg('Calling _drone_manager.sync_refresh().')
         _drone_manager.sync_refresh()
+        # _run_cleanup must be called between drone_manager.sync_refresh, and
+        # drone_manager.execute_actions, as sync_refresh will clear the calls
+        # queued in drones. Therefore, any action that calls drone.queue_call
+        # to add calls to the drone._calls, should be after drone refresh is
+        # completed and before drone_manager.execute_actions at the end of the
+        # tick.
+        self._log_tick_msg('Calling _run_cleanup().')
+        self._run_cleanup()
         self._log_tick_msg('Calling _find_aborting().')
         self._find_aborting()
         self._log_tick_msg('Calling _find_aborted_special_tasks().')
diff --git a/scheduler/monitor_db_cleanup.py b/scheduler/monitor_db_cleanup.py
index 04f6d1c..9a5d338 100644
--- a/scheduler/monitor_db_cleanup.py
+++ b/scheduler/monitor_db_cleanup.py
@@ -249,8 +249,7 @@
     def _cleanup(self):
         logging.info('Running 24 hour clean up')
         self._check_for_uncleanable_db_inconsistencies()
-        # TODO(dshi): crbug.com/484039, after the bug is fixed, re-enable this.
-        #self._cleanup_orphaned_containers()
+        self._cleanup_orphaned_containers()
 
 
     @timer.decorate