bpo-40521: Make Unicode latin1 singletons per interpreter (GH-21101)
Each interpreter now has its own Unicode latin1 singletons.
Remove the "#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS" and
"#ifdef LATIN1_SINGLETONS" guards: latin1 singletons are now always enabled.
Optimize unicode_result_ready(): only look up a latin1 singleton
when the string's kind is PyUnicode_1BYTE_KIND.
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index d8947e7..bf1769e 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -73,6 +73,9 @@
struct _Py_unicode_state {
// The empty Unicode object is a singleton to improve performance.
PyObject *empty;
+ /* Single character Unicode strings in the Latin-1 range are being
+ shared as well. */
+ PyObject *latin1[256];
struct _Py_unicode_fs_codec fs_codec;
};
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
index e970551..4322693 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
@@ -3,7 +3,7 @@
* Free lists: float, tuple, list, dict, frame, context,
asynchronous generator, MemoryError.
* Singletons: empty tuple, empty bytes string, empty Unicode string,
- single byte character.
+ single byte character, single Unicode (latin1) character.
* Slice cache.
They are no longer shared by all interpreters.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e4235b1..5ba9951 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -303,17 +303,6 @@
/* List of static strings. */
static _Py_Identifier *static_strings = NULL;
-/* bpo-40521: Latin1 singletons are shared by all interpreters. */
-#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
-# define LATIN1_SINGLETONS
-#endif
-
-#ifdef LATIN1_SINGLETONS
-/* Single character Unicode strings in the Latin-1 range are being
- shared as well. */
-static PyObject *unicode_latin1[256] = {NULL};
-#endif
-
/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
0, 0, 0, 0, 0, 0, 0, 0,
@@ -657,9 +646,8 @@
if (len == 1) {
wchar_t ch = _PyUnicode_WSTR(unicode)[0];
if ((Py_UCS4)ch < 256) {
- PyObject *latin1_char = get_latin1_char((unsigned char)ch);
Py_DECREF(unicode);
- return latin1_char;
+ return get_latin1_char((unsigned char)ch);
}
}
@@ -692,13 +680,13 @@
return empty;
}
-#ifdef LATIN1_SINGLETONS
if (length == 1) {
- const void *data = PyUnicode_DATA(unicode);
int kind = PyUnicode_KIND(unicode);
- Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
- if (ch < 256) {
- PyObject *latin1_char = unicode_latin1[ch];
+ if (kind == PyUnicode_1BYTE_KIND) {
+ Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
+ Py_UCS1 ch = data[0];
+ struct _Py_unicode_state *state = get_unicode_state();
+ PyObject *latin1_char = state->latin1[ch];
if (latin1_char != NULL) {
if (unicode != latin1_char) {
Py_INCREF(latin1_char);
@@ -709,12 +697,14 @@
else {
assert(_PyUnicode_CheckConsistency(unicode, 1));
Py_INCREF(unicode);
- unicode_latin1[ch] = unicode;
+ state->latin1[ch] = unicode;
return unicode;
}
}
+ else {
+ assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
+ }
}
-#endif
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
@@ -1981,18 +1971,18 @@
static int
unicode_is_singleton(PyObject *unicode)
{
- if (unicode == unicode_get_empty()) {
+ struct _Py_unicode_state *state = get_unicode_state();
+ if (unicode == state->empty) {
return 1;
}
-#ifdef LATIN1_SINGLETONS
PyASCIIObject *ascii = (PyASCIIObject *)unicode;
if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
{
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
- if (ch < 256 && unicode_latin1[ch] == unicode)
+ if (ch < 256 && state->latin1[ch] == unicode) {
return 1;
+ }
}
-#endif
return 0;
}
#endif
@@ -2130,17 +2120,15 @@
}
static PyObject*
-get_latin1_char(unsigned char ch)
+get_latin1_char(Py_UCS1 ch)
{
- PyObject *unicode;
+ struct _Py_unicode_state *state = get_unicode_state();
-#ifdef LATIN1_SINGLETONS
- unicode = unicode_latin1[ch];
+ PyObject *unicode = state->latin1[ch];
if (unicode) {
Py_INCREF(unicode);
return unicode;
}
-#endif
unicode = PyUnicode_New(1, ch);
if (!unicode) {
@@ -2150,10 +2138,8 @@
PyUnicode_1BYTE_DATA(unicode)[0] = ch;
assert(_PyUnicode_CheckConsistency(unicode, 1));
-#ifdef LATIN1_SINGLETONS
Py_INCREF(unicode);
- unicode_latin1[ch] = unicode;
-#endif
+ state->latin1[ch] = unicode;
return unicode;
}
@@ -2164,8 +2150,9 @@
assert(ch <= MAX_UNICODE);
- if (ch < 256)
+ if (ch < 256) {
return get_latin1_char(ch);
+ }
unicode = PyUnicode_New(1, ch);
if (unicode == NULL)
@@ -2367,11 +2354,13 @@
PyObject *res;
unsigned char max_char;
- if (size == 0)
+ if (size == 0) {
_Py_RETURN_UNICODE_EMPTY();
+ }
assert(size > 0);
- if (size == 1)
+ if (size == 1) {
return get_latin1_char(u[0]);
+ }
max_char = ucs1lib_find_max_char(u, u + size);
res = PyUnicode_New(size, max_char);
@@ -5008,8 +4997,9 @@
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
- if (consumed)
+ if (consumed) {
*consumed = 1;
+ }
return get_latin1_char((unsigned char)s[0]);
}
@@ -7176,8 +7166,9 @@
_Py_RETURN_UNICODE_EMPTY();
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
- if (size == 1 && (unsigned char)s[0] < 128)
+ if (size == 1 && (unsigned char)s[0] < 128) {
return get_latin1_char((unsigned char)s[0]);
+ }
// Shortcut for simple case
PyObject *u = PyUnicode_New(size, 127);
@@ -16234,12 +16225,11 @@
Py_CLEAR(state->empty);
+ for (Py_ssize_t i = 0; i < 256; i++) {
+ Py_CLEAR(state->latin1[i]);
+ }
+
if (is_main_interp) {
-#ifdef LATIN1_SINGLETONS
- for (Py_ssize_t i = 0; i < 256; i++) {
- Py_CLEAR(unicode_latin1[i]);
- }
-#endif
unicode_clear_static_strings();
}