bpo-39465: Fix _PyUnicode_FromId() for subinterpreters (GH-20058)

Make _PyUnicode_FromId() function compatible with subinterpreters.
Each interpreter now has an array of identifier objects (interned
strings decoded from UTF-8).

* Add PyInterpreterState.unicode.identifiers: array of identifiers
  objects.
* Add _PyRuntimeState.unicode_ids used to allocate unique indexes
  to _Py_Identifier.
* Rewrite the _Py_Identifier structure.

Microbenchmark on _PyUnicode_FromId(&PyId_a) with _Py_IDENTIFIER(a):

[ref] 2.42 ns +- 0.00 ns -> [atomic] 3.39 ns +- 0.00 ns: 1.40x slower

This change adds 1 ns per _PyUnicode_FromId() call in average.
diff --git a/Python/pystate.c b/Python/pystate.c
index ead25b0..231144b 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -73,18 +73,24 @@
 
     runtime->interpreters.mutex = PyThread_allocate_lock();
     if (runtime->interpreters.mutex == NULL) {
-        return _PyStatus_ERR("Can't initialize threads for interpreter");
+        return _PyStatus_NO_MEMORY();
     }
     runtime->interpreters.next_id = -1;
 
     runtime->xidregistry.mutex = PyThread_allocate_lock();
     if (runtime->xidregistry.mutex == NULL) {
-        return _PyStatus_ERR("Can't initialize threads for cross-interpreter data registry");
+        return _PyStatus_NO_MEMORY();
     }
 
     // Set it to the ID of the main thread of the main interpreter.
     runtime->main_thread = PyThread_get_thread_ident();
 
+    runtime->unicode_ids.lock = PyThread_allocate_lock();
+    if (runtime->unicode_ids.lock == NULL) {
+        return _PyStatus_NO_MEMORY();
+    }
+    runtime->unicode_ids.next_index = 0;
+
     return _PyStatus_OK();
 }
 
@@ -108,17 +114,17 @@
     /* Force the allocator used by _PyRuntimeState_Init(). */
     PyMemAllocatorEx old_alloc;
     _PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
-
-    if (runtime->interpreters.mutex != NULL) {
-        PyThread_free_lock(runtime->interpreters.mutex);
-        runtime->interpreters.mutex = NULL;
+#define FREE_LOCK(LOCK) \
+    if (LOCK != NULL) { \
+        PyThread_free_lock(LOCK); \
+        LOCK = NULL; \
     }
 
-    if (runtime->xidregistry.mutex != NULL) {
-        PyThread_free_lock(runtime->xidregistry.mutex);
-        runtime->xidregistry.mutex = NULL;
-    }
+    FREE_LOCK(runtime->interpreters.mutex);
+    FREE_LOCK(runtime->xidregistry.mutex);
+    FREE_LOCK(runtime->unicode_ids.lock);
 
+#undef FREE_LOCK
     PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
 }
 
@@ -139,12 +145,14 @@
     int reinit_interp = _PyThread_at_fork_reinit(&runtime->interpreters.mutex);
     int reinit_main_id = _PyThread_at_fork_reinit(&runtime->interpreters.main->id_mutex);
     int reinit_xidregistry = _PyThread_at_fork_reinit(&runtime->xidregistry.mutex);
+    int reinit_unicode_ids = _PyThread_at_fork_reinit(&runtime->unicode_ids.lock);
 
     PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
 
     if (reinit_interp < 0
         || reinit_main_id < 0
-        || reinit_xidregistry < 0)
+        || reinit_xidregistry < 0
+        || reinit_unicode_ids < 0)
     {
         return _PyStatus_ERR("Failed to reinitialize runtime locks");