bpo-33608: Factor out a private, per-interpreter _Py_AddPendingCall(). (gh-13714)

diff --git a/Python/pystate.c b/Python/pystate.c
index 2b7db0e..a9f3389 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -218,6 +218,13 @@
         return NULL;
     }
 
+    interp->ceval.pending.lock = PyThread_allocate_lock();
+    if (interp->ceval.pending.lock == NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "failed to create interpreter ceval pending mutex");
+        return NULL;
+    }
+
     interp->eval_frame = _PyEval_EvalFrameDefault;
 #ifdef HAVE_DLOPEN
 #if HAVE_DECL_RTLD_NOW
@@ -345,6 +352,10 @@
     if (interp->id_mutex != NULL) {
         PyThread_free_lock(interp->id_mutex);
     }
+    if (interp->ceval.pending.lock != NULL) {
+        PyThread_free_lock(interp->ceval.pending.lock);
+        interp->ceval.pending.lock = NULL;
+    }
     PyMem_RawFree(interp);
 }
 
@@ -1014,7 +1025,7 @@
             p->async_exc = exc;
             HEAD_UNLOCK(runtime);
             Py_XDECREF(old_exc);
-            _PyEval_SignalAsyncExc(&runtime->ceval);
+            _PyEval_SignalAsyncExc(&runtime->ceval, &interp->ceval);
             return 1;
         }
     }
@@ -1444,7 +1455,7 @@
     return 0;
 }
 
-static void
+static int
 _release_xidata(void *arg)
 {
     _PyCrossInterpreterData *data = (_PyCrossInterpreterData *)arg;
@@ -1452,42 +1463,21 @@
         data->free(data->data);
     }
     Py_XDECREF(data->obj);
-}
-
-static void
-_call_in_interpreter(struct _gilstate_runtime_state *gilstate,
-                     PyInterpreterState *interp,
-                     void (*func)(void *), void *arg)
-{
-    /* We would use Py_AddPendingCall() if it weren't specific to the
-     * main interpreter (see bpo-33608).  In the meantime we take a
-     * naive approach.
-     */
-    PyThreadState *save_tstate = NULL;
-    if (interp != _PyRuntimeGILState_GetThreadState(gilstate)->interp) {
-        // XXX Using the "head" thread isn't strictly correct.
-        PyThreadState *tstate = PyInterpreterState_ThreadHead(interp);
-        // XXX Possible GILState issues?
-        save_tstate = _PyThreadState_Swap(gilstate, tstate);
-    }
-
-    func(arg);
-
-    // Switch back.
-    if (save_tstate != NULL) {
-        _PyThreadState_Swap(gilstate, save_tstate);
-    }
+    PyMem_Free(data);
+    return 0;
 }
 
 void
 _PyCrossInterpreterData_Release(_PyCrossInterpreterData *data)
 {
+    _PyRuntimeState *runtime = &_PyRuntime;
+
     if (data->data == NULL && data->obj == NULL) {
         // Nothing to release!
         return;
     }
 
-    // Switch to the original interpreter.
+    // Get the original interpreter.
     PyInterpreterState *interp = _PyInterpreterState_LookUpID(data->interp);
     if (interp == NULL) {
         // The intepreter was already destroyed.
@@ -1496,10 +1486,28 @@
         }
         return;
     }
+    // XXX There's an ever-so-slight race here...
+    if (interp->finalizing) {
+        // XXX Bailing out here leaks data->data and data->obj.
+        return;
+    }
 
     // "Release" the data and/or the object.
-    struct _gilstate_runtime_state *gilstate = &_PyRuntime.gilstate;
-    _call_in_interpreter(gilstate, interp, _release_xidata, data);
+    _PyCrossInterpreterData *copied = PyMem_Malloc(sizeof(_PyCrossInterpreterData));
+    if (copied == NULL) {
+        PyErr_SetString(PyExc_MemoryError,
+                        "Not enough memory to preserve cross-interpreter data");
+        PyErr_Print();
+        return;
+    }
+    memcpy(copied, data, sizeof(_PyCrossInterpreterData));
+    PyThreadState *tstate = _PyRuntimeState_GetThreadState(runtime);
+    int res = _PyEval_AddPendingCall(tstate,
+                                     &runtime->ceval, &interp->ceval,
+                                     0, _release_xidata, copied);
+    if (res != 0) {
+        // XXX Queue full or couldn't get lock; "copied" leaks.  Retry somehow?
+    }
 }
 
 PyObject *