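Issue #9205: concurrent.futures.ProcessPoolExecutor now detects when a child
process has been killed and raises BrokenProcessPool, both for futures that
were running or pending and for any subsequent submit() call. Previously it
would reliably freeze/deadlock in that situation.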
diff --git a/Lib/concurrent/futures/process.py b/Lib/concurrent/futures/process.py
index f0bf6d5..c2331e7 100644
--- a/Lib/concurrent/futures/process.py
+++ b/Lib/concurrent/futures/process.py
@@ -46,10 +46,11 @@
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
import atexit
+import os
from concurrent.futures import _base
import queue
import multiprocessing
-from multiprocessing.queues import SimpleQueue
+from multiprocessing.queues import SimpleQueue, SentinelReady
import threading
import weakref
@@ -122,7 +123,7 @@
call_item = call_queue.get(block=True)
if call_item is None:
# Wake up queue management thread
- result_queue.put(None)
+ result_queue.put(os.getpid())
return
try:
r = call_item.fn(*call_item.args, **call_item.kwargs)
@@ -194,29 +195,63 @@
result_queue: A multiprocessing.Queue of _ResultItems generated by the
process workers.
"""
- nb_shutdown_processes = 0
- def shutdown_one_process():
- """Tell a worker to terminate, which will in turn wake us again"""
- nonlocal nb_shutdown_processes
- call_queue.put(None)
- nb_shutdown_processes += 1
+
+ def shutdown_worker():
+ # This is an upper bound
+ nb_children_alive = sum(p.is_alive() for p in processes.values())
+ for i in range(0, nb_children_alive):
+ call_queue.put(None)
+ # If .join() is not called on the created processes then
+ # some multiprocessing.Queue methods may deadlock on Mac OS
+ # X.
+ for p in processes.values():
+ p.join()
+
while True:
_add_call_item_to_queue(pending_work_items,
work_ids_queue,
call_queue)
- result_item = result_queue.get()
- if result_item is not None:
- work_item = pending_work_items[result_item.work_id]
- del pending_work_items[result_item.work_id]
-
- if result_item.exception:
- work_item.future.set_exception(result_item.exception)
- else:
- work_item.future.set_result(result_item.result)
- continue
- # If we come here, we either got a timeout or were explicitly woken up.
- # In either case, check whether we should start shutting down.
+ sentinels = [p.sentinel for p in processes.values()]
+ assert sentinels
+ try:
+ result_item = result_queue.get(sentinels=sentinels)
+ except SentinelReady as e:
+ # Mark the process pool broken so that submits fail right now.
+ executor = executor_reference()
+ if executor is not None:
+ executor._broken = True
+ executor._shutdown_thread = True
+ del executor
+ # All futures in flight must be marked failed
+ for work_id, work_item in pending_work_items.items():
+ work_item.future.set_exception(
+ BrokenProcessPool(
+ "A process in the process pool was "
+ "terminated abruptly while the future was "
+ "running or pending."
+ ))
+ pending_work_items.clear()
+ # Terminate remaining workers forcibly: the queues or their
+ # locks may be in a dirty state and block forever.
+ for p in processes.values():
+ p.terminate()
+ for p in processes.values():
+ p.join()
+ return
+ if isinstance(result_item, int):
+ # Clean shutdown of a worker using its PID
+ # (avoids marking the executor broken)
+ del processes[result_item]
+ elif result_item is not None:
+ work_item = pending_work_items.pop(result_item.work_id, None)
+ # work_item can be None if another process terminated (see above)
+ if work_item is not None:
+ if result_item.exception:
+ work_item.future.set_exception(result_item.exception)
+ else:
+ work_item.future.set_result(result_item.result)
+ # Check whether we should start shutting down.
executor = executor_reference()
# No more work items can be added if:
# - The interpreter is shutting down OR
@@ -226,17 +261,11 @@
# Since no new work items can be added, it is safe to shutdown
# this thread if there are no pending work items.
if not pending_work_items:
- while nb_shutdown_processes < len(processes):
- shutdown_one_process()
- # If .join() is not called on the created processes then
- # some multiprocessing.Queue methods may deadlock on Mac OS
- # X.
- for p in processes:
- p.join()
+ shutdown_worker()
return
else:
# Start shutting down by telling a process it can exit.
- shutdown_one_process()
+ call_queue.put(None)
del executor
_system_limits_checked = False
@@ -264,6 +293,14 @@
_system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
raise NotImplementedError(_system_limited)
+
+class BrokenProcessPool(RuntimeError):
+ """
+ Raised when a process in a ProcessPoolExecutor terminated abruptly
+ while a future was running or pending.
+ """
+
+
class ProcessPoolExecutor(_base.Executor):
def __init__(self, max_workers=None):
"""Initializes a new ProcessPoolExecutor instance.
@@ -288,11 +325,13 @@
self._result_queue = SimpleQueue()
self._work_ids = queue.Queue()
self._queue_management_thread = None
- self._processes = set()
+ # Map of pids to processes
+ self._processes = {}
# Shutdown is a two-step process.
self._shutdown_thread = False
self._shutdown_lock = threading.Lock()
+ self._broken = False
self._queue_count = 0
self._pending_work_items = {}
@@ -302,6 +341,8 @@
def weakref_cb(_, q=self._result_queue):
q.put(None)
if self._queue_management_thread is None:
+ # Start the processes so that their sentinels are known.
+ self._adjust_process_count()
self._queue_management_thread = threading.Thread(
target=_queue_management_worker,
args=(weakref.ref(self, weakref_cb),
@@ -321,10 +362,13 @@
args=(self._call_queue,
self._result_queue))
p.start()
- self._processes.add(p)
+ self._processes[p.pid] = p
def submit(self, fn, *args, **kwargs):
with self._shutdown_lock:
+ if self._broken:
+ raise BrokenProcessPool('A child process terminated '
+ 'abruptly, the process pool is not usable anymore')
if self._shutdown_thread:
raise RuntimeError('cannot schedule new futures after shutdown')
@@ -338,7 +382,6 @@
self._result_queue.put(None)
self._start_queue_management_thread()
- self._adjust_process_count()
return f
submit.__doc__ = _base.Executor.submit.__doc__
diff --git a/Lib/multiprocessing/connection.py b/Lib/multiprocessing/connection.py
index 415e210..ede2908 100644
--- a/Lib/multiprocessing/connection.py
+++ b/Lib/multiprocessing/connection.py
@@ -48,14 +48,18 @@
import _multiprocessing
from multiprocessing import current_process, AuthenticationError, BufferTooShort
-from multiprocessing.util import get_temp_dir, Finalize, sub_debug, debug
+from multiprocessing.util import (
+ get_temp_dir, Finalize, sub_debug, debug, _eintr_retry)
try:
from _multiprocessing import win32
+ from _subprocess import WAIT_OBJECT_0, WAIT_TIMEOUT, INFINITE
except ImportError:
if sys.platform == 'win32':
raise
win32 = None
+_select = _eintr_retry(select.select)
+
#
#
#
@@ -118,6 +122,15 @@
else:
raise ValueError('address type of %r unrecognized' % address)
+
+class SentinelReady(Exception):
+ """
+ Raised when a sentinel becomes ready while polling.
+ """
+ def __init__(self, *args):
+ Exception.__init__(self, *args)
+ self.sentinels = args[0]
+
#
# Connection classes
#
@@ -253,19 +266,17 @@
(offset + size) // itemsize])
return size
- def recv(self):
+ def recv(self, sentinels=None):
"""Receive a (picklable) object"""
self._check_closed()
self._check_readable()
- buf = self._recv_bytes()
+ buf = self._recv_bytes(sentinels=sentinels)
return pickle.loads(buf.getbuffer())
def poll(self, timeout=0.0):
"""Whether there is any input available to be read"""
self._check_closed()
self._check_readable()
- if timeout < 0.0:
- timeout = None
return self._poll(timeout)
@@ -274,61 +285,88 @@
class PipeConnection(_ConnectionBase):
"""
Connection class based on a Windows named pipe.
+ Overlapped I/O is used, so the handles must have been created
+ with FILE_FLAG_OVERLAPPED.
"""
+ _buffered = b''
def _close(self):
win32.CloseHandle(self._handle)
def _send_bytes(self, buf):
- nwritten = win32.WriteFile(self._handle, buf)
+ overlapped = win32.WriteFile(self._handle, buf, overlapped=True)
+ nwritten, complete = overlapped.GetOverlappedResult(True)
+ assert complete
assert nwritten == len(buf)
- def _recv_bytes(self, maxsize=None):
+ def _recv_bytes(self, maxsize=None, sentinels=()):
+ if sentinels:
+ self._poll(-1.0, sentinels)
buf = io.BytesIO()
- bufsize = 512
- if maxsize is not None:
- bufsize = min(bufsize, maxsize)
- try:
- firstchunk, complete = win32.ReadFile(self._handle, bufsize)
- except IOError as e:
- if e.errno == win32.ERROR_BROKEN_PIPE:
- raise EOFError
- raise
- lenfirstchunk = len(firstchunk)
- buf.write(firstchunk)
- if complete:
- return buf
+ firstchunk = self._buffered
+ if firstchunk:
+ lenfirstchunk = len(firstchunk)
+ buf.write(firstchunk)
+ self._buffered = b''
+ else:
+ # A reasonable size for the first chunk transfer
+ bufsize = 128
+ if maxsize is not None and maxsize < bufsize:
+ bufsize = maxsize
+ try:
+ overlapped = win32.ReadFile(self._handle, bufsize, overlapped=True)
+ lenfirstchunk, complete = overlapped.GetOverlappedResult(True)
+ firstchunk = overlapped.getbuffer()
+ assert lenfirstchunk == len(firstchunk)
+ except IOError as e:
+ if e.errno == win32.ERROR_BROKEN_PIPE:
+ raise EOFError
+ raise
+ buf.write(firstchunk)
+ if complete:
+ return buf
navail, nleft = win32.PeekNamedPipe(self._handle)
if maxsize is not None and lenfirstchunk + nleft > maxsize:
return None
- lastchunk, complete = win32.ReadFile(self._handle, nleft)
- assert complete
- buf.write(lastchunk)
+ if nleft > 0:
+ overlapped = win32.ReadFile(self._handle, nleft, overlapped=True)
+ res, complete = overlapped.GetOverlappedResult(True)
+ assert res == nleft
+ assert complete
+ buf.write(overlapped.getbuffer())
return buf
- def _poll(self, timeout):
+ def _poll(self, timeout, sentinels=()):
+ # Fast non-blocking path
navail, nleft = win32.PeekNamedPipe(self._handle)
if navail > 0:
return True
elif timeout == 0.0:
return False
- # Setup a polling loop (translated straight from old
- # pipe_connection.c)
+ # Blocking: use overlapped I/O
if timeout < 0.0:
- deadline = None
+ timeout = INFINITE
else:
- deadline = time.time() + timeout
- delay = 0.001
- max_delay = 0.02
- while True:
- time.sleep(delay)
- navail, nleft = win32.PeekNamedPipe(self._handle)
- if navail > 0:
- return True
- if deadline and time.time() > deadline:
- return False
- if delay < max_delay:
- delay += 0.001
+ timeout = int(timeout * 1000 + 0.5)
+ overlapped = win32.ReadFile(self._handle, 1, overlapped=True)
+ try:
+ handles = [overlapped.event]
+ handles += sentinels
+ res = win32.WaitForMultipleObjects(handles, False, timeout)
+ finally:
+ # Always cancel overlapped I/O in the same thread
+ # (because CancelIoEx() appears only in Vista)
+ overlapped.cancel()
+ if res == WAIT_TIMEOUT:
+ return False
+ idx = res - WAIT_OBJECT_0
+ if idx == 0:
+ # I/O was successful, store received data
+ overlapped.GetOverlappedResult(True)
+ self._buffered += overlapped.getbuffer()
+ return True
+ assert 0 < idx < len(handles)
+ raise SentinelReady([handles[idx]])
class Connection(_ConnectionBase):
@@ -357,11 +395,18 @@
break
buf = buf[n:]
- def _recv(self, size, read=_read):
+ def _recv(self, size, sentinels=(), read=_read):
buf = io.BytesIO()
+ handle = self._handle
+ if sentinels:
+ handles = [handle] + sentinels
remaining = size
while remaining > 0:
- chunk = read(self._handle, remaining)
+ if sentinels:
+ r = _select(handles, [], [])[0]
+ if handle not in r:
+ raise SentinelReady(r)
+ chunk = read(handle, remaining)
n = len(chunk)
if n == 0:
if remaining == size:
@@ -381,15 +426,17 @@
if n > 0:
self._send(buf)
- def _recv_bytes(self, maxsize=None):
- buf = self._recv(4)
+ def _recv_bytes(self, maxsize=None, sentinels=()):
+ buf = self._recv(4, sentinels)
size, = struct.unpack("=i", buf.getvalue())
if maxsize is not None and size > maxsize:
return None
- return self._recv(size)
+ return self._recv(size, sentinels)
def _poll(self, timeout):
- r = select.select([self._handle], [], [], timeout)[0]
+ if timeout < 0.0:
+ timeout = None
+ r = _select([self._handle], [], [], timeout)[0]
return bool(r)
@@ -495,23 +542,21 @@
obsize, ibsize = 0, BUFSIZE
h1 = win32.CreateNamedPipe(
- address, openmode,
+ address, openmode | win32.FILE_FLAG_OVERLAPPED,
win32.PIPE_TYPE_MESSAGE | win32.PIPE_READMODE_MESSAGE |
win32.PIPE_WAIT,
1, obsize, ibsize, win32.NMPWAIT_WAIT_FOREVER, win32.NULL
)
h2 = win32.CreateFile(
- address, access, 0, win32.NULL, win32.OPEN_EXISTING, 0, win32.NULL
+ address, access, 0, win32.NULL, win32.OPEN_EXISTING,
+ win32.FILE_FLAG_OVERLAPPED, win32.NULL
)
win32.SetNamedPipeHandleState(
h2, win32.PIPE_READMODE_MESSAGE, None, None
)
- try:
- win32.ConnectNamedPipe(h1, win32.NULL)
- except WindowsError as e:
- if e.args[0] != win32.ERROR_PIPE_CONNECTED:
- raise
+ overlapped = win32.ConnectNamedPipe(h1, overlapped=True)
+ overlapped.GetOverlappedResult(True)
c1 = PipeConnection(h1, writable=duplex)
c2 = PipeConnection(h2, readable=duplex)
diff --git a/Lib/multiprocessing/forking.py b/Lib/multiprocessing/forking.py
index 3c359cb..a2c61ef 100644
--- a/Lib/multiprocessing/forking.py
+++ b/Lib/multiprocessing/forking.py
@@ -35,6 +35,7 @@
import os
import sys
import signal
+import select
from multiprocessing import util, process
diff --git a/Lib/multiprocessing/queues.py b/Lib/multiprocessing/queues.py
index 3280a25..3324363 100644
--- a/Lib/multiprocessing/queues.py
+++ b/Lib/multiprocessing/queues.py
@@ -44,7 +44,7 @@
from queue import Empty, Full
import _multiprocessing
-from multiprocessing import Pipe
+from multiprocessing.connection import Pipe, SentinelReady
from multiprocessing.synchronize import Lock, BoundedSemaphore, Semaphore, Condition
from multiprocessing.util import debug, info, Finalize, register_after_fork
from multiprocessing.forking import assert_spawning
@@ -372,10 +372,10 @@
def _make_methods(self):
recv = self._reader.recv
racquire, rrelease = self._rlock.acquire, self._rlock.release
- def get():
+ def get(*, sentinels=None):
racquire()
try:
- return recv()
+ return recv(sentinels)
finally:
rrelease()
self.get = get
diff --git a/Lib/test/test_concurrent_futures.py b/Lib/test/test_concurrent_futures.py
index 7457f39..5968980 100644
--- a/Lib/test/test_concurrent_futures.py
+++ b/Lib/test/test_concurrent_futures.py
@@ -19,7 +19,7 @@
from concurrent import futures
from concurrent.futures._base import (
PENDING, RUNNING, CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED, Future)
-import concurrent.futures.process
+from concurrent.futures.process import BrokenProcessPool
def create_future(state=PENDING, exception=None, result=None):
@@ -154,7 +154,7 @@
processes = self.executor._processes
self.executor.shutdown()
- for p in processes:
+ for p in processes.values():
p.join()
def test_context_manager_shutdown(self):
@@ -163,7 +163,7 @@
self.assertEqual(list(e.map(abs, range(-5, 5))),
[5, 4, 3, 2, 1, 0, 1, 2, 3, 4])
- for p in processes:
+ for p in processes.values():
p.join()
def test_del_shutdown(self):
@@ -174,7 +174,7 @@
del executor
queue_management_thread.join()
- for p in processes:
+ for p in processes.values():
p.join()
class WaitTests(unittest.TestCase):
@@ -381,7 +381,17 @@
class ProcessPoolExecutorTest(ProcessPoolMixin, ExecutorTest):
- pass
+ def test_killed_child(self):
+ # When a child process is abruptly terminated, the whole pool gets
+ # "broken".
+ futures = [self.executor.submit(time.sleep, 3)]
+ # Get one of the processes, and terminate (kill) it
+ p = next(iter(self.executor._processes.values()))
+ p.terminate()
+ for fut in futures:
+ self.assertRaises(BrokenProcessPool, fut.result)
+ # Submitting other jobs fails as well.
+ self.assertRaises(BrokenProcessPool, self.executor.submit, pow, 2, 8)
class FutureTests(unittest.TestCase):