bpo-40521: Make Unicode latin1 singletons per interpreter (GH-21101)

Each interpreter now has its own Unicode latin1 singletons.

Remove "ifdef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS" and "ifdef LATIN1_SINGLETONS": always enable latin1 singletons.

Optimize unicode_result_ready(): only attempt to get a latin1 singleton for PyUnicode_1BYTE_KIND.

commit: 2f9ada96e0d420fed0d09a032b37197f08ef167a [log] [tgz]
author: Victor Stinner <vstinner@python.org> Wed Jun 24 02:22:21 2020 +0200
committer: GitHub <noreply@github.com> Wed Jun 24 02:22:21 2020 +0200
tree: 7b358df9980ee0e83c91d9e40626a5ccfd6d6a02
parent: bbf36e8903f8e86dcad8131c818e122537c30f9e [diff]
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index d8947e7..bf1769e 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h

@@ -73,6 +73,9 @@
 struct _Py_unicode_state {
     // The empty Unicode object is a singleton to improve performance.
     PyObject *empty;
+    /* Single character Unicode strings in the Latin-1 range are being
+       shared as well. */
+    PyObject *latin1[256];
     struct _Py_unicode_fs_codec fs_codec;
 };
 

diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
index e970551..4322693 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst

@@ -3,7 +3,7 @@
 * Free lists: float, tuple, list, dict, frame, context,
   asynchronous generator, MemoryError.
 * Singletons: empty tuple, empty bytes string, empty Unicode string,
-  single byte character.
+  single byte character, single Unicode (latin1) character.
 * Slice cache.
 
 They are no longer shared by all interpreters.

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e4235b1..5ba9951 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c

@@ -303,17 +303,6 @@
 /* List of static strings. */
 static _Py_Identifier *static_strings = NULL;
 
-/* bpo-40521: Latin1 singletons are shared by all interpreters. */
-#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
-#  define LATIN1_SINGLETONS
-#endif
-
-#ifdef LATIN1_SINGLETONS
-/* Single character Unicode strings in the Latin-1 range are being
-   shared as well. */
-static PyObject *unicode_latin1[256] = {NULL};
-#endif
-
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
@@ -657,9 +646,8 @@
     if (len == 1) {
         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
         if ((Py_UCS4)ch < 256) {
-            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
             Py_DECREF(unicode);
-            return latin1_char;
+            return get_latin1_char((unsigned char)ch);
         }
     }
 
@@ -692,13 +680,13 @@
         return empty;
     }
 
-#ifdef LATIN1_SINGLETONS
     if (length == 1) {
-        const void *data = PyUnicode_DATA(unicode);
         int kind = PyUnicode_KIND(unicode);
-        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
-        if (ch < 256) {
-            PyObject *latin1_char = unicode_latin1[ch];
+        if (kind == PyUnicode_1BYTE_KIND) {
+            Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
+            Py_UCS1 ch = data[0];
+            struct _Py_unicode_state *state = get_unicode_state();
+            PyObject *latin1_char = state->latin1[ch];
             if (latin1_char != NULL) {
                 if (unicode != latin1_char) {
                     Py_INCREF(latin1_char);
@@ -709,12 +697,14 @@
             else {
                 assert(_PyUnicode_CheckConsistency(unicode, 1));
                 Py_INCREF(unicode);
-                unicode_latin1[ch] = unicode;
+                state->latin1[ch] = unicode;
                 return unicode;
             }
         }
+        else {
+            assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
+        }
     }
-#endif
 
     assert(_PyUnicode_CheckConsistency(unicode, 1));
     return unicode;
@@ -1981,18 +1971,18 @@
 static int
 unicode_is_singleton(PyObject *unicode)
 {
-    if (unicode == unicode_get_empty()) {
+    struct _Py_unicode_state *state = get_unicode_state();
+    if (unicode == state->empty) {
         return 1;
     }
-#ifdef LATIN1_SINGLETONS
     PyASCIIObject *ascii = (PyASCIIObject *)unicode;
     if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
     {
         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
-        if (ch < 256 && unicode_latin1[ch] == unicode)
+        if (ch < 256 && state->latin1[ch] == unicode) {
             return 1;
+        }
     }
-#endif
     return 0;
 }
 #endif
@@ -2130,17 +2120,15 @@
 }
 
 static PyObject*
-get_latin1_char(unsigned char ch)
+get_latin1_char(Py_UCS1 ch)
 {
-    PyObject *unicode;
+    struct _Py_unicode_state *state = get_unicode_state();
 
-#ifdef LATIN1_SINGLETONS
-    unicode = unicode_latin1[ch];
+    PyObject *unicode = state->latin1[ch];
     if (unicode) {
         Py_INCREF(unicode);
         return unicode;
     }
-#endif
 
     unicode = PyUnicode_New(1, ch);
     if (!unicode) {
@@ -2150,10 +2138,8 @@
     PyUnicode_1BYTE_DATA(unicode)[0] = ch;
     assert(_PyUnicode_CheckConsistency(unicode, 1));
 
-#ifdef LATIN1_SINGLETONS
     Py_INCREF(unicode);
-    unicode_latin1[ch] = unicode;
-#endif
+    state->latin1[ch] = unicode;
     return unicode;
 }
 
@@ -2164,8 +2150,9 @@
 
     assert(ch <= MAX_UNICODE);
 
-    if (ch < 256)
+    if (ch < 256) {
         return get_latin1_char(ch);
+    }
 
     unicode = PyUnicode_New(1, ch);
     if (unicode == NULL)
@@ -2367,11 +2354,13 @@
     PyObject *res;
     unsigned char max_char;
 
-    if (size == 0)
+    if (size == 0) {
         _Py_RETURN_UNICODE_EMPTY();
+    }
     assert(size > 0);
-    if (size == 1)
+    if (size == 1) {
         return get_latin1_char(u[0]);
+    }
 
     max_char = ucs1lib_find_max_char(u, u + size);
     res = PyUnicode_New(size, max_char);
@@ -5008,8 +4997,9 @@
 
     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
     if (size == 1 && (unsigned char)s[0] < 128) {
-        if (consumed)
+        if (consumed) {
             *consumed = 1;
+        }
         return get_latin1_char((unsigned char)s[0]);
     }
 
@@ -7176,8 +7166,9 @@
         _Py_RETURN_UNICODE_EMPTY();
 
     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
-    if (size == 1 && (unsigned char)s[0] < 128)
+    if (size == 1 && (unsigned char)s[0] < 128) {
         return get_latin1_char((unsigned char)s[0]);
+    }
 
     // Shortcut for simple case
     PyObject *u = PyUnicode_New(size, 127);
@@ -16234,12 +16225,11 @@
 
     Py_CLEAR(state->empty);
 
+    for (Py_ssize_t i = 0; i < 256; i++) {
+        Py_CLEAR(state->latin1[i]);
+    }
+
     if (is_main_interp) {
-#ifdef LATIN1_SINGLETONS
-        for (Py_ssize_t i = 0; i < 256; i++) {
-            Py_CLEAR(unicode_latin1[i]);
-        }
-#endif
         unicode_clear_static_strings();
     }
commit	2f9ada96e0d420fed0d09a032b37197f08ef167a	[log] [tgz]
author	Victor Stinner <vstinner@python.org>	Wed Jun 24 02:22:21 2020 +0200
committer	GitHub <noreply@github.com>	Wed Jun 24 02:22:21 2020 +0200
tree	7b358df9980ee0e83c91d9e40626a5ccfd6d6a02
parent	bbf36e8903f8e86dcad8131c818e122537c30f9e [diff]