Add retries for flaky tests (enabled by default for now)
diff --git a/tools/run_tests/jobset.py b/tools/run_tests/jobset.py
index 2a86319..e696a0e 100755
--- a/tools/run_tests/jobset.py
+++ b/tools/run_tests/jobset.py
@@ -81,6 +81,7 @@
 
 _TAG_COLOR = {
     'FAILED': 'red',
+    'FLAKE': 'red',
     'WARNING': 'yellow',
     'TIMEOUT': 'red',
     'PASSED': 'green',
@@ -131,7 +132,7 @@
   """Specifies what to run for a job."""
 
   def __init__(self, cmdline, shortname=None, environ=None, hash_targets=None,
-               cwd=None, shell=False, timeout_seconds=5*60):
+               cwd=None, shell=False, timeout_seconds=5*60, flake_retries=5):
     """
     Arguments:
       cmdline: a list of arguments to pass as the command line
@@ -150,6 +151,7 @@
     self.cwd = cwd
     self.shell = shell
     self.timeout_seconds = timeout_seconds
+    self.flake_retries = flake_retries
 
   def identity(self):
     return '%r %r %r' % (self.cmdline, self.environ, self.hash_targets)
@@ -167,25 +169,28 @@
   def __init__(self, spec, bin_hash, newline_on_success, travis, add_env, xml_report):
     self._spec = spec
     self._bin_hash = bin_hash
-    self._tempfile = tempfile.TemporaryFile()
-    env = os.environ.copy()
-    for k, v in spec.environ.iteritems():
-      env[k] = v
-    for k, v in add_env.iteritems():
-      env[k] = v
-    self._start = time.time()
-    self._process = subprocess.Popen(args=spec.cmdline,
-                                     stderr=subprocess.STDOUT,
-                                     stdout=self._tempfile,
-                                     cwd=spec.cwd,
-                                     shell=spec.shell,
-                                     env=env)
-    self._state = _RUNNING
     self._newline_on_success = newline_on_success
     self._travis = travis
+    self._add_env = add_env.copy()
     self._xml_test = ET.SubElement(xml_report, 'testcase',
                                    name=self._spec.shortname) if xml_report is not None else None
+    self._retries = 0
     message('START', spec.shortname, do_newline=self._travis)
+    self.start()
+
+  def start(self):
+    """Spawn the job's process with output captured in a fresh temp file."""
+    spec = self._spec
+    self._tempfile = tempfile.TemporaryFile()
+    # Later updates win: os.environ < spec.environ < add_env.
+    child_env = os.environ.copy()
+    child_env.update(spec.environ)
+    child_env.update(self._add_env)
+    self._start = time.time()
+    popen_kwargs = dict(cwd=spec.cwd, shell=spec.shell, env=child_env,
+                        stdout=self._tempfile, stderr=subprocess.STDOUT)
+    self._process = subprocess.Popen(args=spec.cmdline, **popen_kwargs)
+    self._state = _RUNNING
 
   def state(self, update_cache):
     """Poll current state of the job. Prints messages at completion."""
@@ -202,15 +207,22 @@
         self._xml_test.set('time', str(elapsed))
         ET.SubElement(self._xml_test, 'system-out').text = filtered_stdout
       if self._process.returncode != 0:
-        self._state = _FAILURE
-        message('FAILED', '%s [ret=%d, pid=%d]' % (
+        if self._retries < self._spec.flake_retries:
+          message('FLAKE', '%s [ret=%d, pid=%d]' % (
             self._spec.shortname, self._process.returncode, self._process.pid),
             stdout, do_newline=True)
-        if self._xml_test is not None:
-          ET.SubElement(self._xml_test, 'failure', message='Failure').text
+          self._retries += 1
+          self.start()
+        else:
+          self._state = _FAILURE
+          message('FAILED', '%s [ret=%d, pid=%d]' % (
+              self._spec.shortname, self._process.returncode, self._process.pid),
+              stdout, do_newline=True)
+          if self._xml_test is not None:
+            ET.SubElement(self._xml_test, 'failure', message='Failure').text
       else:
         self._state = _SUCCESS
-        message('PASSED', '%s [time=%.1fsec]' % (self._spec.shortname, elapsed),
+        message('PASSED', '%s [time=%.1fsec; retries=%d]' % (self._spec.shortname, elapsed, self._retries),
                 do_newline=self._newline_on_success or self._travis)
         if self._bin_hash:
           update_cache.finished(self._spec.identity(), self._bin_hash)