bpo-40521: Make dict free lists per-interpreter (GH-20645)

Each interpreter now has its own dict free list:

* Move dict free lists into PyInterpreterState.
* Move PyDict_MAXFREELIST define to pycore_interp.h
* Add _Py_dict_state structure.
* Add tstate parameter to _PyDict_ClearFreeList() and _PyDict_Fini().
* In debug mode, ensure that the dict free lists are not used after
  _PyDict_Fini() is called.
* Remove "#ifdef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS".
diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h
index fd3fb7f..da202a1 100644
--- a/Include/internal/pycore_gc.h
+++ b/Include/internal/pycore_gc.h
@@ -169,7 +169,7 @@
 extern void _PyTuple_ClearFreeList(PyThreadState *tstate);
 extern void _PyFloat_ClearFreeList(PyThreadState *tstate);
 extern void _PyList_ClearFreeList(PyThreadState *tstate);
-extern void _PyDict_ClearFreeList(void);
+extern void _PyDict_ClearFreeList(PyThreadState *tstate);
 extern void _PyAsyncGen_ClearFreeLists(PyThreadState *tstate);
 extern void _PyContext_ClearFreeList(PyThreadState *tstate);
 
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 981b733..3f64edc 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -69,6 +69,14 @@
     struct _Py_unicode_fs_codec fs_codec;
 };
 
+struct _Py_float_state {
+    /* Special free list
+       free_list is a singly-linked list of available PyFloatObjects,
+       linked via abuse of their ob_type members. */
+    int numfree;
+    PyFloatObject *free_list;
+};
+
 /* Speed optimization to avoid frequent malloc/free of small tuples */
 #ifndef PyTuple_MAXSAVESIZE
    // Largest tuple to save on free list
@@ -99,12 +107,16 @@
     int numfree;
 };
 
-struct _Py_float_state {
-    /* Special free list
-       free_list is a singly-linked list of available PyFloatObjects,
-       linked via abuse of their ob_type members. */
+#ifndef PyDict_MAXFREELIST
+#  define PyDict_MAXFREELIST 80
+#endif
+
+struct _Py_dict_state {
+    /* Dictionary reuse scheme to save calls to malloc and free */
+    PyDictObject *free_list[PyDict_MAXFREELIST];
     int numfree;
-    PyFloatObject *free_list;
+    PyDictKeysObject *keys_free_list[PyDict_MAXFREELIST];
+    int keys_numfree;
 };
 
 struct _Py_frame_state {
@@ -136,7 +148,6 @@
 };
 
 
-
 /* interpreter state */
 
 #define _PY_NSMALLPOSINTS           257
@@ -182,8 +193,6 @@
     PyObject *codec_error_registry;
     int codecs_initialized;
 
-    struct _Py_unicode_state unicode;
-
     PyConfig config;
 #ifdef HAVE_DLOPEN
     int dlopenflags;
@@ -224,16 +233,18 @@
     */
     PyLongObject* small_ints[_PY_NSMALLNEGINTS + _PY_NSMALLPOSINTS];
 #endif
-    struct _Py_tuple_state tuple;
-    struct _Py_list_state list;
+    struct _Py_unicode_state unicode;
     struct _Py_float_state float_state;
-    struct _Py_frame_state frame;
-    struct _Py_async_gen_state async_gen;
-    struct _Py_context_state context;
-
     /* Using a cache is very effective since typically only a single slice is
        created and then deleted again. */
     PySliceObject *slice_cache;
+
+    struct _Py_tuple_state tuple;
+    struct _Py_list_state list;
+    struct _Py_dict_state dict_state;
+    struct _Py_frame_state frame;
+    struct _Py_async_gen_state async_gen;
+    struct _Py_context_state context;
 };
 
 /* Used by _PyImport_Cleanup() */
diff --git a/Include/internal/pycore_pylifecycle.h b/Include/internal/pycore_pylifecycle.h
index 3e36573..dc99737 100644
--- a/Include/internal/pycore_pylifecycle.h
+++ b/Include/internal/pycore_pylifecycle.h
@@ -59,7 +59,7 @@
 /* Various internal finalizers */
 
 extern void _PyFrame_Fini(PyThreadState *tstate);
-extern void _PyDict_Fini(void);
+extern void _PyDict_Fini(PyThreadState *tstate);
 extern void _PyTuple_Fini(PyThreadState *tstate);
 extern void _PyList_Fini(PyThreadState *tstate);
 extern void _PySet_Fini(void);
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
index 39cb804..3406ca8 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
@@ -1,4 +1,5 @@
 The tuple free lists, the empty tuple singleton, the list free list, the float
-free list, the slice cache, the frame free list, the asynchronous generator
-free lists, and the context free list are no longer shared by all interpreters:
-each interpreter now its has own free lists and caches.
+free list, the slice cache, the dict free lists, the frame free list, the
+asynchronous generator free lists, and the context free list are no longer
+shared by all interpreters: each interpreter now its has own free lists and
+caches.
diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c
index 110a48d..8833400 100644
--- a/Modules/gcmodule.c
+++ b/Modules/gcmodule.c
@@ -1038,7 +1038,7 @@
     _PyTuple_ClearFreeList(tstate);
     _PyFloat_ClearFreeList(tstate);
     _PyList_ClearFreeList(tstate);
-    _PyDict_ClearFreeList();
+    _PyDict_ClearFreeList(tstate);
     _PyAsyncGen_ClearFreeLists(tstate);
     _PyContext_ClearFreeList(tstate);
 }
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 55bf4ae..f3b1157 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -247,58 +247,47 @@
 
 #define DICT_NEXT_VERSION() (++pydict_global_version)
 
-/* Dictionary reuse scheme to save calls to malloc and free */
-#ifndef PyDict_MAXFREELIST
-#define PyDict_MAXFREELIST 80
-#endif
-
-/* bpo-40521: dict free lists are shared by all interpreters. */
-#ifdef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
-#  undef PyDict_MAXFREELIST
-#  define PyDict_MAXFREELIST 0
-#endif
-
-#if PyDict_MAXFREELIST > 0
-static PyDictObject *free_list[PyDict_MAXFREELIST];
-static int numfree = 0;
-static PyDictKeysObject *keys_free_list[PyDict_MAXFREELIST];
-static int numfreekeys = 0;
-#endif
-
 #include "clinic/dictobject.c.h"
 
 void
-_PyDict_ClearFreeList(void)
+_PyDict_ClearFreeList(PyThreadState *tstate)
 {
-#if PyDict_MAXFREELIST > 0
-    while (numfree) {
-        PyDictObject *op = free_list[--numfree];
+    struct _Py_dict_state *state = &tstate->interp->dict_state;
+    while (state->numfree) {
+        PyDictObject *op = state->free_list[--state->numfree];
         assert(PyDict_CheckExact(op));
         PyObject_GC_Del(op);
     }
-    while (numfreekeys) {
-        PyObject_FREE(keys_free_list[--numfreekeys]);
+    while (state->keys_numfree) {
+        PyObject_FREE(state->keys_free_list[--state->keys_numfree]);
     }
+}
+
+
+void
+_PyDict_Fini(PyThreadState *tstate)
+{
+    _PyDict_ClearFreeList(tstate);
+#ifdef Py_DEBUG
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    struct _Py_dict_state *state = &interp->dict_state;
+    state->numfree = -1;
+    state->keys_numfree = -1;
 #endif
 }
 
+
 /* Print summary info about the state of the optimized allocator */
 void
 _PyDict_DebugMallocStats(FILE *out)
 {
-#if PyDict_MAXFREELIST > 0
-    _PyDebugAllocatorStats(out,
-                           "free PyDictObject", numfree, sizeof(PyDictObject));
-#endif
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    struct _Py_dict_state *state = &interp->dict_state;
+    _PyDebugAllocatorStats(out, "free PyDictObject",
+                           state->numfree, sizeof(PyDictObject));
 }
 
 
-void
-_PyDict_Fini(void)
-{
-    _PyDict_ClearFreeList();
-}
-
 #define DK_SIZE(dk) ((dk)->dk_size)
 #if SIZEOF_VOID_P > 4
 #define DK_IXSIZE(dk)                          \
@@ -543,7 +532,8 @@
 }
 
 
-static PyDictKeysObject *new_keys_object(Py_ssize_t size)
+static PyDictKeysObject*
+new_keys_object(Py_ssize_t size)
 {
     PyDictKeysObject *dk;
     Py_ssize_t es, usable;
@@ -567,12 +557,16 @@
         es = sizeof(Py_ssize_t);
     }
 
-#if PyDict_MAXFREELIST > 0
-    if (size == PyDict_MINSIZE && numfreekeys > 0) {
-        dk = keys_free_list[--numfreekeys];
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    struct _Py_dict_state *state = &interp->dict_state;
+#ifdef Py_DEBUG
+    // new_keys_object() must not be called after _PyDict_Fini()
+    assert(state->keys_numfree != -1);
+#endif
+    if (size == PyDict_MINSIZE && state->keys_numfree > 0) {
+        dk = state->keys_free_list[--state->keys_numfree];
     }
     else
-#endif
     {
         dk = PyObject_MALLOC(sizeof(PyDictKeysObject)
                              + es * size
@@ -604,12 +598,16 @@
         Py_XDECREF(entries[i].me_key);
         Py_XDECREF(entries[i].me_value);
     }
-#if PyDict_MAXFREELIST > 0
-    if (keys->dk_size == PyDict_MINSIZE && numfreekeys < PyDict_MAXFREELIST) {
-        keys_free_list[numfreekeys++] = keys;
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    struct _Py_dict_state *state = &interp->dict_state;
+#ifdef Py_DEBUG
+    // free_keys_object() must not be called after _PyDict_Fini()
+    assert(state->keys_numfree != -1);
+#endif
+    if (keys->dk_size == PyDict_MINSIZE && state->keys_numfree < PyDict_MAXFREELIST) {
+        state->keys_free_list[state->keys_numfree++] = keys;
         return;
     }
-#endif
     PyObject_FREE(keys);
 }
 
@@ -622,16 +620,19 @@
 {
     PyDictObject *mp;
     assert(keys != NULL);
-#if PyDict_MAXFREELIST > 0
-    if (numfree) {
-        mp = free_list[--numfree];
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    struct _Py_dict_state *state = &interp->dict_state;
+#ifdef Py_DEBUG
+    // new_dict() must not be called after _PyDict_Fini()
+    assert(state->numfree != -1);
+#endif
+    if (state->numfree) {
+        mp = state->free_list[--state->numfree];
         assert (mp != NULL);
         assert (Py_IS_TYPE(mp, &PyDict_Type));
         _Py_NewReference((PyObject *)mp);
     }
-    else
-#endif
-    {
+    else {
         mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
         if (mp == NULL) {
             dictkeys_decref(keys);
@@ -1280,15 +1281,18 @@
 #ifdef Py_REF_DEBUG
         _Py_RefTotal--;
 #endif
-#if PyDict_MAXFREELIST > 0
-        if (oldkeys->dk_size == PyDict_MINSIZE &&
-            numfreekeys < PyDict_MAXFREELIST)
-        {
-            keys_free_list[numfreekeys++] = oldkeys;
-        }
-        else
+        PyInterpreterState *interp = _PyInterpreterState_GET();
+        struct _Py_dict_state *state = &interp->dict_state;
+#ifdef Py_DEBUG
+        // dictresize() must not be called after _PyDict_Fini()
+        assert(state->keys_numfree != -1);
 #endif
+        if (oldkeys->dk_size == PyDict_MINSIZE &&
+            state->keys_numfree < PyDict_MAXFREELIST)
         {
+            state->keys_free_list[state->keys_numfree++] = oldkeys;
+        }
+        else {
             PyObject_FREE(oldkeys);
         }
     }
@@ -2028,13 +2032,16 @@
         assert(keys->dk_refcnt == 1);
         dictkeys_decref(keys);
     }
-#if PyDict_MAXFREELIST > 0
-    if (numfree < PyDict_MAXFREELIST && Py_IS_TYPE(mp, &PyDict_Type)) {
-        free_list[numfree++] = mp;
-    }
-    else
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    struct _Py_dict_state *state = &interp->dict_state;
+#ifdef Py_DEBUG
+    // new_dict() must not be called after _PyDict_Fini()
+    assert(state->numfree != -1);
 #endif
-    {
+    if (state->numfree < PyDict_MAXFREELIST && Py_IS_TYPE(mp, &PyDict_Type)) {
+        state->free_list[state->numfree++] = mp;
+    }
+    else {
         Py_TYPE(mp)->tp_free((PyObject *)mp);
     }
     Py_TRASHCAN_END
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 87f25e6..1b4a3db 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1258,9 +1258,7 @@
     if (is_main_interp) {
         _PySet_Fini();
     }
-    if (is_main_interp) {
-        _PyDict_Fini();
-    }
+    _PyDict_Fini(tstate);
     _PyList_Fini(tstate);
     _PyTuple_Fini(tstate);