[autotest] Add retries to provisioning.

There is now a setting in global_config.ini which controls provision
retries, and this value can be reloaded on-the-fly in the scheduler.
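
For reference, a rough sketch of how the new knobs are consumed; the
[SCHEDULER] section name and the import path are assumptions here,
since the hunks below only show the surrounding lines:

    # global_config.ini (assumed to sit in the [SCHEDULER] section):
    #   max_repair_limit: 2
    #   max_provision_retries: 1
    #
    # Scheduler side (import path assumed from autotest conventions);
    # the value is re-read when the scheduler reloads its config, so a
    # changed limit takes effect without a restart.
    from autotest_lib.scheduler import scheduler_config

    retries_allowed = scheduler_config.config.max_provision_retries
    repair_limit = scheduler_config.config.max_repair_limit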

Be cautioned that provision failures are basically silently hidden.
There is currently no reporting to indicate that a retry happened.
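
Until proper reporting exists, the only way to notice a retry is to
look at the provision SpecialTasks recorded against the HQE, which is
the same query the retry logic below relies on. A hypothetical helper,
assuming the AFE Django models import the same way they do in
monitor_db:

    # Not part of this CL; a manual way to count provision attempts
    # for a given HQE id, mirroring the check in PreJobTask.epilog().
    from autotest_lib.frontend.afe import models

    def provision_attempts(hqe_id):
        return models.SpecialTask.objects.filter(
                task=models.SpecialTask.Task.PROVISION,
                queue_entry_id=hqe_id).count()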

Implementing this also pointed out a way to clean up the ProvisionTask
code, so some free cleanup is packaged into this CL as well.

BUG=chromium:279667
DEPLOY=scheduler
TEST=forced provision failures, and watched the HQE get requeued a
finite number of times.

Change-Id: I66d967fb8f3ab9f199571764821e1a39d0e81f39
Reviewed-on: https://chromium-review.googlesource.com/167990
Reviewed-by: Dan Shi <dshi@chromium.org>
Tested-by: Alex Miller <milleral@chromium.org>
Reviewed-by: Prashanth Balasubramanian <beeps@chromium.org>
Commit-Queue: Alex Miller <milleral@chromium.org>
diff --git a/global_config.ini b/global_config.ini
index 89d320d..bdc634d 100644
--- a/global_config.ini
+++ b/global_config.ini
@@ -103,6 +103,8 @@
 copy_parse_log_back: False
 tick_debug: True
 extra_debugging: False
+max_repair_limit: 2
+max_provision_retries: 1
 
 
 [HOSTS]
diff --git a/scheduler/monitor_db.py b/scheduler/monitor_db.py
index c53e263..027096f 100755
--- a/scheduler/monitor_db.py
+++ b/scheduler/monitor_db.py
@@ -1898,15 +1898,12 @@
         if self.success:
             return
 
-        self._copy_to_results_repository()
-
         if self.host.protection == host_protections.Protection.DO_NOT_VERIFY:
             # effectively ignore failure for these hosts
             self.success = True
             return
 
         if self.queue_entry:
-            self.queue_entry.requeue()
             # If we requeue a HQE, we should cancel any remaining pre-job
             # tasks against this host, otherwise we'll be left in a state
             # where a queued HQE has special tasks to run against a host.
@@ -1915,9 +1912,35 @@
                     host__id=self.host.id,
                     is_complete=0).update(is_complete=1, success=0)
 
-            if models.SpecialTask.objects.filter(
+            previous_provisions = models.SpecialTask.objects.filter(
+                    task=models.SpecialTask.Task.PROVISION,
+                    queue_entry_id=self.queue_entry.id).count()
+            if (previous_provisions >=
+                scheduler_config.config.max_provision_retries):
+                self._actually_fail_queue_entry()
+                # This abort will mark the aborted bit on the HQE itself, to
+                # signify that we're killing it.  Technically it also will do
+                # the recursive aborting of all child jobs, but that shouldn't
+                # matter here, as only suites have children, and those are
+                # hostless and thus don't have provisioning.
+                # TODO(milleral) http://crbug.com/188217
+                # However, we can't actually do this yet, as if we set the
+                # abort bit the FinalReparseTask will set the status of the HQE
+                # to ABORTED, which then means that we don't show the status in
+                # run_suite.  So in the meantime, don't mark the HQE as
+                # aborted.
+                # queue_entry.abort()
+            else:
+                # requeue() must come after handling provision retries, since
+                # _actually_fail_queue_entry needs an execution subdir.
+                # We also don't want to requeue if we hit the provision retry
+                # limit, since then we overwrite the PARSING state of the HQE.
+                self.queue_entry.requeue()
+
+            previous_repairs = models.SpecialTask.objects.filter(
                     task=models.SpecialTask.Task.REPAIR,
-                    queue_entry__id=self.queue_entry.id):
+                    queue_entry_id=self.queue_entry.id).count()
+            if previous_repairs >= scheduler_config.config.max_repair_limit:
                 self.host.set_status(models.Host.Status.REPAIR_FAILED)
                 self._fail_queue_entry()
                 return
@@ -2129,53 +2152,9 @@
 
 
     def epilog(self):
-        # TODO(milleral) Here, we override the PreJobTask's epilog, because
-        # it's written with the idea that pre-job special task failures are a
-        # problem with the host and not with something about the HQE.
-        # In our case, the HQE's DEPENDENCIES specify what the provision task
-        # does, so if the provision fails, it can be the fault of the HQE, and
-        # thus we fail the HQE.  This difference is handled only here for now,
-        # but some refactoring of PreJobTask should likely happen sometime in
-        # the future?
-        # This call is needed to log the status and call into self.cleanup(),
-        # which is PreJobTasks's cleanup, which marks is_complete=1.
-        AgentTask.epilog(self)
+        super(ProvisionTask, self).epilog()
 
-        if not self.success:
-            # TODO(milleral) http://crbug.com/231452
-            # In our own setup, we don't really use the results
-            # repository, so I *think* this call can be elided.  However, I'd
-            # like to limit what I can possibly break for now, and it would be
-            # called if I called PreJobTask's epilog, so I'm keeping the call
-            # to it for now.
-            self._copy_to_results_repository()
-            # _actually_fail_queue_entry() is a hack around the fact that we do
-            # indeed want to abort the queue entry here, but the rest of the
-            # scheduler code expects that we will reschedule onto some other
-            # host.
-            self._actually_fail_queue_entry()
-            # This abort will mark the aborted bit on the HQE itself, to
-            # signify that we're killing it.  Technically it also will do
-            # the recursive aborting of all child jobs, but that shouldn't
-            # matter here, as only suites have children, and those
-            # are hostless and thus don't have provisioning.
-            # TODO(milleral) http://crbug.com/188217
-            # However, we can't actually do this yet, as if we set the abort bit
-            # the FinalReparseTask will set the status of the HQE to ABORTED,
-            # which then means that we don't show the status in run_suite.
-            # So in the meantime, don't mark the HQE as aborted.
-            queue_entry = models.HostQueueEntry.objects.get(
-                    id=self.queue_entry.id)
-            # queue_entry.abort()
-
-            # The machine is in some totally unknown state, so let's kick off
-            # a repair task to get it back to some known sane state.
-            models.SpecialTask.objects.create(
-                    host=models.Host.objects.get(id=self.host.id),
-                    task=models.SpecialTask.Task.REPAIR,
-                    queue_entry=queue_entry,
-                    requested_by=self.task.requested_by)
-        elif self._should_pending():
+        if self._should_pending():
             self.queue_entry.on_pending()
         else:
             self.host.set_status(models.Host.Status.READY)
diff --git a/scheduler/scheduler_config.py b/scheduler/scheduler_config.py
index 96d4371..91c8fb1 100644
--- a/scheduler/scheduler_config.py
+++ b/scheduler/scheduler_config.py
@@ -18,6 +18,8 @@
                   'secs_to_wait_for_atomic_group_hosts',
               'reverify_period_minutes': 'reverify_period_minutes',
               'reverify_max_hosts_at_once': 'reverify_max_hosts_at_once',
+              'max_repair_limit': 'max_repair_limit',
+              'max_provision_retries': 'max_provision_retries',
              }