[3.6] bpo-30604: Move co_extra_freefuncs to interpreter state to avoid crashes in threads (#2015)

* Move co_extra_freefuncs to interpreter state to avoid crashes in
multi-threaded scenarios involving deletion of code objects

* Don't require that extra be zero initialized

* Build test list instead of defining empty test class

* Ensure extra is always assigned on success

* Keep the old fields in the thread state object, just don't use them
Add new linked list of code extra objects on a per-interpreter basis
  so that interpreter state size isn't changed

* Rename __PyCodeExtraState_Get and add comment about it going away in 3.7
Fix sort order of import's in test_code.py

* Remove an extraneous space

* Remove docstrings for comments

* Touch up formatting

* Fix casing of coextra local

* Fix casing of another variable

* Prefix PyCodeExtraState with __ to match C API for getting it

* Update NEWS file for bpo-30604
diff --git a/Include/pystate.h b/Include/pystate.h
index afc3c0c..6ca463f 100644
--- a/Include/pystate.h
+++ b/Include/pystate.h
@@ -51,6 +51,16 @@
 } PyInterpreterState;
 #endif
 
+typedef struct _co_extra_state {
+    struct _co_extra_state *next;
+    PyInterpreterState* interp;
+
+    Py_ssize_t co_extra_user_count;
+    freefunc co_extra_freefuncs[MAX_CO_EXTRA_USERS];
+} __PyCodeExtraState;
+
+/* This is temporary for backwards compat in 3.6 and will be removed in 3.7 */
+__PyCodeExtraState* __PyCodeExtraState_Get();
 
 /* State unique per thread */
 
@@ -142,8 +152,10 @@
     PyObject *coroutine_wrapper;
     int in_coroutine_wrapper;
 
-    Py_ssize_t co_extra_user_count;
-    freefunc co_extra_freefuncs[MAX_CO_EXTRA_USERS];
+    /* Now used from PyInterpreterState, kept here for ABI
+       compatibility with PyThreadState */
+    Py_ssize_t _preserve_36_ABI_1;
+    freefunc _preserve_36_ABI_2[MAX_CO_EXTRA_USERS];
 
     PyObject *async_gen_firstiter;
     PyObject *async_gen_finalizer;
diff --git a/Lib/test/test_code.py b/Lib/test/test_code.py
index 7975ea0..9f9df9d 100644
--- a/Lib/test/test_code.py
+++ b/Lib/test/test_code.py
@@ -103,9 +103,11 @@
 """
 
 import sys
+import threading
 import unittest
 import weakref
-from test.support import run_doctest, run_unittest, cpython_only
+from test.support import (run_doctest, run_unittest, cpython_only,
+                          check_impl_detail)
 
 
 def consts(t):
@@ -212,11 +214,106 @@
         self.assertTrue(self.called)
 
 
+if check_impl_detail(cpython=True):
+    import ctypes
+    py = ctypes.pythonapi
+    freefunc = ctypes.CFUNCTYPE(None,ctypes.c_voidp)
+
+    RequestCodeExtraIndex = py._PyEval_RequestCodeExtraIndex
+    RequestCodeExtraIndex.argtypes = (freefunc,)
+    RequestCodeExtraIndex.restype = ctypes.c_ssize_t
+
+    SetExtra = py._PyCode_SetExtra
+    SetExtra.argtypes = (ctypes.py_object, ctypes.c_ssize_t, ctypes.c_voidp)
+    SetExtra.restype = ctypes.c_int
+
+    GetExtra = py._PyCode_GetExtra
+    GetExtra.argtypes = (ctypes.py_object, ctypes.c_ssize_t, 
+                         ctypes.POINTER(ctypes.c_voidp))
+    GetExtra.restype = ctypes.c_int
+
+    LAST_FREED = None
+    def myfree(ptr):
+        global LAST_FREED
+        LAST_FREED = ptr
+
+    FREE_FUNC = freefunc(myfree)
+    FREE_INDEX = RequestCodeExtraIndex(FREE_FUNC)
+
+    class CoExtra(unittest.TestCase):
+        def get_func(self):
+            # Defining a function causes the containing function to have a
+            # reference to the code object.  We need the code objects to go
+            # away, so we eval a lambda.
+            return eval('lambda:42')
+
+        def test_get_non_code(self):
+            f = self.get_func()
+
+            self.assertRaises(SystemError, SetExtra, 42, FREE_INDEX,
+                              ctypes.c_voidp(100))
+            self.assertRaises(SystemError, GetExtra, 42, FREE_INDEX,
+                              ctypes.c_voidp(100))
+
+        def test_bad_index(self):
+            f = self.get_func()
+            self.assertRaises(SystemError, SetExtra, f.__code__,
+                              FREE_INDEX+100, ctypes.c_voidp(100))
+            self.assertEqual(GetExtra(f.__code__, FREE_INDEX+100,
+                              ctypes.c_voidp(100)), 0)
+
+        def test_free_called(self):
+            # Verify that the provided free function gets invoked
+            # when the code object is cleaned up.
+            f = self.get_func()
+
+            SetExtra(f.__code__, FREE_INDEX, ctypes.c_voidp(100))
+            del f
+            self.assertEqual(LAST_FREED, 100)
+
+        def test_get_set(self):
+            # Test basic get/set round tripping.
+            f = self.get_func()
+
+            extra = ctypes.c_voidp()
+
+            SetExtra(f.__code__, FREE_INDEX, ctypes.c_voidp(200))
+            # reset should free...
+            SetExtra(f.__code__, FREE_INDEX, ctypes.c_voidp(300))
+            self.assertEqual(LAST_FREED, 200)
+
+            extra = ctypes.c_voidp()
+            GetExtra(f.__code__, FREE_INDEX, extra)
+            self.assertEqual(extra.value, 300)
+            del f
+
+        def test_free_different_thread(self):
+            # Freeing a code object on a different thread then
+            # where the co_extra was set should be safe.
+            f = self.get_func()
+            class ThreadTest(threading.Thread):
+                def __init__(self, f, test):
+                    super().__init__()
+                    self.f = f
+                    self.test = test
+                def run(self):
+                    del self.f
+                    self.test.assertEqual(LAST_FREED, 500)
+
+            SetExtra(f.__code__, FREE_INDEX, ctypes.c_voidp(500))
+            tt = ThreadTest(f, self)
+            del f
+            tt.start()
+            tt.join()
+            self.assertEqual(LAST_FREED, 500)
+
 def test_main(verbose=None):
     from test import test_code
     run_doctest(test_code, verbose)
-    run_unittest(CodeTest, CodeConstsTest, CodeWeakRefTest)
-
+    tests = [CodeTest, CodeConstsTest, CodeWeakRefTest]
+    if check_impl_detail(cpython=True):
+        tests.append(CoExtra)
+    run_unittest(*tests)
 
 if __name__ == "__main__":
     test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index a950731..f744160 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -1,4 +1,4 @@
-+++++++++++
++++++++++++
 Python News
 +++++++++++
 
@@ -10,6 +10,8 @@
 Core and Builtins
 -----------------
 
+- bpo-30604: Move co_extra_freefuncs to not be per-thread to avoid crashes
+
 - bpo-29104: Fixed parsing backslashes in f-strings.
 
 - bpo-27945: Fixed various segfaults with dict when input collections are
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 22c4f85..d38f185 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -411,11 +411,11 @@
 code_dealloc(PyCodeObject *co)
 {
     if (co->co_extra != NULL) {
-        PyThreadState *tstate = PyThreadState_Get();
+        __PyCodeExtraState *state = __PyCodeExtraState_Get();
         _PyCodeObjectExtra *co_extra = co->co_extra;
 
         for (Py_ssize_t i = 0; i < co_extra->ce_size; i++) {
-            freefunc free_extra = tstate->co_extra_freefuncs[i];
+            freefunc free_extra = state->co_extra_freefuncs[i];
 
             if (free_extra != NULL) {
                 free_extra(co_extra->ce_extras[i]);
@@ -825,8 +825,6 @@
 int
 _PyCode_GetExtra(PyObject *code, Py_ssize_t index, void **extra)
 {
-    assert(*extra == NULL);
-
     if (!PyCode_Check(code)) {
         PyErr_BadInternalCall();
         return -1;
@@ -837,6 +835,7 @@
 
 
     if (co_extra == NULL || co_extra->ce_size <= index) {
+        *extra = NULL;
         return 0;
     }
 
@@ -848,10 +847,10 @@
 int
 _PyCode_SetExtra(PyObject *code, Py_ssize_t index, void *extra)
 {
-    PyThreadState *tstate = PyThreadState_Get();
+    __PyCodeExtraState *state = __PyCodeExtraState_Get();
 
     if (!PyCode_Check(code) || index < 0 ||
-            index >= tstate->co_extra_user_count) {
+            index >= state->co_extra_user_count) {
         PyErr_BadInternalCall();
         return -1;
     }
@@ -866,13 +865,13 @@
         }
 
         co_extra->ce_extras = PyMem_Malloc(
-            tstate->co_extra_user_count * sizeof(void*));
+            state->co_extra_user_count * sizeof(void*));
         if (co_extra->ce_extras == NULL) {
             PyMem_Free(co_extra);
             return -1;
         }
 
-        co_extra->ce_size = tstate->co_extra_user_count;
+        co_extra->ce_size = state->co_extra_user_count;
 
         for (Py_ssize_t i = 0; i < co_extra->ce_size; i++) {
             co_extra->ce_extras[i] = NULL;
@@ -882,20 +881,27 @@
     }
     else if (co_extra->ce_size <= index) {
         void** ce_extras = PyMem_Realloc(
-            co_extra->ce_extras, tstate->co_extra_user_count * sizeof(void*));
+            co_extra->ce_extras, state->co_extra_user_count * sizeof(void*));
 
         if (ce_extras == NULL) {
             return -1;
         }
 
         for (Py_ssize_t i = co_extra->ce_size;
-             i < tstate->co_extra_user_count;
+             i < state->co_extra_user_count;
              i++) {
             ce_extras[i] = NULL;
         }
 
         co_extra->ce_extras = ce_extras;
-        co_extra->ce_size = tstate->co_extra_user_count;
+        co_extra->ce_size = state->co_extra_user_count;
+    }
+
+    if (co_extra->ce_extras[index] != NULL) {
+        freefunc free = state->co_extra_freefuncs[index];
+        if (free != NULL) {
+            free(co_extra->ce_extras[index]);
+        }
     }
 
     co_extra->ce_extras[index] = extra;
diff --git a/Python/ceval.c b/Python/ceval.c
index eba892c..ea79f5f 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -5453,14 +5453,14 @@
 Py_ssize_t
 _PyEval_RequestCodeExtraIndex(freefunc free)
 {
-    PyThreadState *tstate = PyThreadState_Get();
+    __PyCodeExtraState *state = __PyCodeExtraState_Get();
     Py_ssize_t new_index;
 
-    if (tstate->co_extra_user_count == MAX_CO_EXTRA_USERS - 1) {
+    if (state->co_extra_user_count == MAX_CO_EXTRA_USERS - 1) {
         return -1;
     }
-    new_index = tstate->co_extra_user_count++;
-    tstate->co_extra_freefuncs[new_index] = free;
+    new_index = state->co_extra_user_count++;
+    state->co_extra_freefuncs[new_index] = free;
     return new_index;
 }
 
diff --git a/Python/pystate.c b/Python/pystate.c
index ccb0092..92d08c4 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -55,6 +55,7 @@
 #endif
 
 static PyInterpreterState *interp_head = NULL;
+static __PyCodeExtraState *coextra_head = NULL;
 
 /* Assuming the current thread holds the GIL, this is the
    PyThreadState for the current thread. */
@@ -73,6 +74,12 @@
                                  PyMem_RawMalloc(sizeof(PyInterpreterState));
 
     if (interp != NULL) {
+        __PyCodeExtraState* coextra = PyMem_RawMalloc(sizeof(__PyCodeExtraState));
+        if (coextra == NULL) {
+            PyMem_RawFree(interp);
+            return NULL;
+        }
+
         HEAD_INIT();
 #ifdef WITH_THREAD
         if (head_mutex == NULL)
@@ -92,6 +99,8 @@
         interp->importlib = NULL;
         interp->import_func = NULL;
         interp->eval_frame = _PyEval_EvalFrameDefault;
+        coextra->co_extra_user_count = 0;
+        coextra->interp = interp;
 #ifdef HAVE_DLOPEN
 #if HAVE_DECL_RTLD_NOW
         interp->dlopenflags = RTLD_NOW;
@@ -103,6 +112,8 @@
         HEAD_LOCK();
         interp->next = interp_head;
         interp_head = interp;
+        coextra->next = coextra_head;
+        coextra_head = coextra;
         HEAD_UNLOCK();
     }
 
@@ -147,9 +158,10 @@
 PyInterpreterState_Delete(PyInterpreterState *interp)
 {
     PyInterpreterState **p;
+    __PyCodeExtraState **pextra;
     zapthreads(interp);
     HEAD_LOCK();
-    for (p = &interp_head; ; p = &(*p)->next) {
+    for (p = &interp_head; /* N/A */; p = &(*p)->next) {
         if (*p == NULL)
             Py_FatalError(
                 "PyInterpreterState_Delete: invalid interp");
@@ -159,6 +171,18 @@
     if (interp->tstate_head != NULL)
         Py_FatalError("PyInterpreterState_Delete: remaining threads");
     *p = interp->next;
+
+    for (pextra = &coextra_head; ; pextra = &(*pextra)->next) {
+        if (*pextra == NULL)
+            Py_FatalError(
+                "PyInterpreterState_Delete: invalid extra");
+        __PyCodeExtraState* extra = *pextra;
+        if (extra->interp == interp) {
+            *pextra = extra->next;
+            PyMem_RawFree(extra);
+            break;
+        }
+    }
     HEAD_UNLOCK();
     PyMem_RawFree(interp);
 #ifdef WITH_THREAD
@@ -224,7 +248,6 @@
 
         tstate->coroutine_wrapper = NULL;
         tstate->in_coroutine_wrapper = 0;
-        tstate->co_extra_user_count = 0;
 
         tstate->async_gen_firstiter = NULL;
         tstate->async_gen_finalizer = NULL;
@@ -548,6 +571,23 @@
     return oldts;
 }
 
+__PyCodeExtraState* 
+__PyCodeExtraState_Get() {
+    PyInterpreterState* interp = PyThreadState_Get()->interp;
+
+    HEAD_LOCK();
+    for (__PyCodeExtraState* cur = coextra_head; cur != NULL; cur = cur->next) {
+        if (cur->interp == interp) {
+            HEAD_UNLOCK();
+            return cur;
+        }
+    }
+    HEAD_UNLOCK();
+
+    Py_FatalError("__PyCodeExtraState_Get: no code state for interpreter");
+    return NULL;
+}
+
 /* An extension mechanism to store arbitrary additional per-thread state.
    PyThreadState_GetDict() returns a dictionary that can be used to hold such
    state; the caller should pick a unique key and store its state there.  If