Robustness work for port_server startup
diff --git a/tools/run_tests/port_server.py b/tools/run_tests/port_server.py
index b953df9..4e473af 100755
--- a/tools/run_tests/port_server.py
+++ b/tools/run_tests/port_server.py
@@ -42,7 +42,7 @@
 # increment this number whenever making a change to ensure that
 # the changes are picked up by running CI servers
 # note that all changes must be backwards compatible
-_MY_VERSION = 2
+_MY_VERSION = 4
 
 
 if len(sys.argv) == 2 and sys.argv[1] == 'dump_version':
@@ -52,8 +52,13 @@
 
 argp = argparse.ArgumentParser(description='Server for httpcli_test')
 argp.add_argument('-p', '--port', default=12345, type=int)
+argp.add_argument('-l', '--logfile', default=None, type=str)
 args = argp.parse_args()
 
+if args.logfile is not None:
+  sys.stderr = open(args.logfile, 'w')
+  sys.stdout = sys.stderr
+
 print 'port server running on port %d' % args.port
 
 pool = []
@@ -146,6 +151,6 @@
 httpd = BaseHTTPServer.HTTPServer(('', args.port), Handler)
 while keep_running:
   httpd.handle_request()
+  sys.stderr.flush()
 
 print 'done'
-
diff --git a/tools/run_tests/run_tests.py b/tools/run_tests/run_tests.py
index 048ab90..7137759 100755
--- a/tools/run_tests/run_tests.py
+++ b/tools/run_tests/run_tests.py
@@ -43,6 +43,8 @@
 import socket
 import subprocess
 import sys
+import tempfile
+import traceback
 import time
 import xml.etree.cElementTree as ET
 import urllib2
@@ -704,35 +706,50 @@
       urllib2.urlopen('http://localhost:%d/quitquitquit' % port_server_port).read()
       time.sleep(1)
   if not running:
-    print 'starting port_server'
-    port_log = open('portlog.txt', 'w')
+    fd, logfile = tempfile.mkstemp()
+    os.close(fd)
+    print 'starting port_server, with log file %s' % logfile
     port_server = subprocess.Popen(
-        [sys.executable, 'tools/run_tests/port_server.py', '-p', '%d' % port_server_port],
-        stderr=subprocess.STDOUT,
-        stdout=port_log)
+        [sys.executable, 'tools/run_tests/port_server.py', '-p', '%d' % port_server_port, '-l', logfile],
+        close_fds=True)
+    time.sleep(1)
     # ensure port server is up
     waits = 0
     while True:
       if waits > 10:
+        print 'killing port server due to excessive start up waits'
         port_server.kill()
       if port_server.poll() is not None:
         print 'port_server failed to start'
-        port_log = open('portlog.txt', 'r').read()
-        print port_log
-        sys.exit(1)
+        # try one final time: maybe another build managed to start one
+        time.sleep(1)
+        try:
+          urllib2.urlopen('http://localhost:%d/get' % port_server_port,
+                          timeout=1).read()
+          print 'last ditch attempt to contact port server succeeded'
+          break
+        except:
+          traceback.print_exc();
+          port_log = open(logfile, 'r').read()
+          print port_log
+          sys.exit(1)
       try:
         urllib2.urlopen('http://localhost:%d/get' % port_server_port,
                         timeout=1).read()
+        print 'port server is up and ready'
         break
       except socket.timeout:
         print 'waiting for port_server: timeout'
-        time.sleep(0.5)
+        traceback.print_exc();
+        time.sleep(1)
         waits += 1
       except urllib2.URLError:
         print 'waiting for port_server: urlerror'
-        time.sleep(0.5)
+        traceback.print_exc();
+        time.sleep(1)
         waits += 1
       except:
+        traceback.print_exc();
         port_server.kill()
         raise