Issue #16330: Use surrogate-related macros
Patch written by Serhiy Storchaka.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index fa21c1c..363776b 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -180,9 +180,9 @@
} while (0)
/* macros to work with surrogates */
-#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF)
-#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
-#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
+#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) /* any surrogate, U+D800..U+DFFF; evaluates ch twice */
+#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) /* high surrogate, U+D800..U+DBFF; evaluates ch twice */
+#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) /* low surrogate, U+DC00..U+DFFF; evaluates ch twice */
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
diff --git a/Modules/_json.c b/Modules/_json.c
index fb8bd59..2538b05 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -174,14 +174,13 @@
default:
if (c >= 0x10000) {
/* UTF-16 surrogate pair */
- Py_UCS4 v = c - 0x10000;
- c = 0xd800 | ((v >> 10) & 0x3ff);
+ Py_UCS4 v = Py_UNICODE_HIGH_SURROGATE(c);
output[chars++] = 'u';
- output[chars++] = Py_hexdigits[(c >> 12) & 0xf];
- output[chars++] = Py_hexdigits[(c >> 8) & 0xf];
- output[chars++] = Py_hexdigits[(c >> 4) & 0xf];
- output[chars++] = Py_hexdigits[(c ) & 0xf];
- c = 0xdc00 | (v & 0x3ff);
+ output[chars++] = Py_hexdigits[(v >> 12) & 0xf];
+ output[chars++] = Py_hexdigits[(v >> 8) & 0xf];
+ output[chars++] = Py_hexdigits[(v >> 4) & 0xf];
+ output[chars++] = Py_hexdigits[(v ) & 0xf];
+ c = Py_UNICODE_LOW_SURROGATE(c);
output[chars++] = '\\';
}
output[chars++] = 'u';
@@ -431,7 +430,7 @@
}
}
/* Surrogate pair */
- if ((c & 0xfc00) == 0xd800) {
+ if (Py_UNICODE_IS_HIGH_SURROGATE(c)) {
Py_UCS4 c2 = 0;
if (end + 6 >= len) {
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
@@ -462,13 +461,13 @@
goto bail;
}
}
- if ((c2 & 0xfc00) != 0xdc00) {
+ if (!Py_UNICODE_IS_LOW_SURROGATE(c2)) {
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
goto bail;
}
- c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+ c = Py_UNICODE_JOIN_SURROGATES(c, c2);
}
- else if ((c & 0xfc00) == 0xdc00) {
+ else if (Py_UNICODE_IS_LOW_SURROGATE(c)) {
raise_errmsg("Unpaired low surrogate", pystr, end - 5);
goto bail;
}
diff --git a/Modules/cjkcodecs/cjkcodecs.h b/Modules/cjkcodecs/cjkcodecs.h
index ab0682a..fb41bdd 100644
--- a/Modules/cjkcodecs/cjkcodecs.h
+++ b/Modules/cjkcodecs/cjkcodecs.h
@@ -148,8 +148,8 @@
#if Py_UNICODE_SIZE == 2
# define WRITEUCS4(c) \
REQUIRE_OUTBUF(2) \
- (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \
- (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \
+ (*outbuf)[0] = Py_UNICODE_HIGH_SURROGATE(c); /* first unit: high surrogate of c */ \
+ (*outbuf)[1] = Py_UNICODE_LOW_SURROGATE(c); /* second unit: low surrogate of c */ \
NEXT_OUT(2)
#else
# define WRITEUCS4(c) \
@@ -188,11 +188,10 @@
#if Py_UNICODE_SIZE == 2
#define DECODE_SURROGATE(c) \
- if (c >> 10 == 0xd800 >> 10) { /* high surrogate */ \
+ if (Py_UNICODE_IS_HIGH_SURROGATE(c)) { /* c is a high surrogate */ \
REQUIRE_INBUF(2) \
- if (IN2 >> 10 == 0xdc00 >> 10) { /* low surrogate */ \
- c = 0x10000 + ((ucs4_t)(c - 0xd800) << 10) + \
- ((ucs4_t)(IN2) - 0xdc00); \
+ if (Py_UNICODE_IS_LOW_SURROGATE(IN2)) { /* followed by a low surrogate */ \
+ c = Py_UNICODE_JOIN_SURROGATES(c, IN2); /* statement needs its own ';' - the join macro is a bare expression */ \
} \
}
#define GET_INSIZE(c) ((c) > 0xffff ? 2 : 1)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0a3712e..3e2e8e3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4412,7 +4412,7 @@
/* code first surrogate */
base64bits += 16;
- base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
+ base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
while (base64bits >= 6) {
*out++ = TO_BASE64(base64buffer >> (base64bits-6));
base64bits -= 6;
@@ -7052,9 +7052,8 @@
charsize = 1;
}
else {
- ch -= 0x10000;
- chars[0] = 0xd800 + (ch >> 10);
- chars[1] = 0xdc00 + (ch & 0x3ff);
+ chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
+ chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
charsize = 2;
}
diff --git a/Python/codecs.c b/Python/codecs.c
index 5470500..5cfb1c9 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -761,7 +761,7 @@
for (i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
- if (ch < 0xd800 || ch > 0xdfff) {
+ if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* Not a surrogate, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(res);
@@ -797,7 +797,7 @@
(p[2] & 0xc0) == 0x80)) {
/* it's a three-byte code */
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
- if (ch < 0xd800 || ch > 0xdfff)
+ if (!Py_UNICODE_IS_SURROGATE(ch))
/* it's not a surrogate - fail */
ch = 0;
}
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 501cb8c..526751d 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -85,7 +85,7 @@
/* Only use the result if it contains no
surrogate characters. */
for (tmp = res; *tmp != 0 &&
- (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
+ !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
;
if (*tmp == 0) {
if (size != NULL)
@@ -131,7 +131,7 @@
memset(&mbs, 0, sizeof mbs);
continue;
}
- if (*out >= 0xd800 && *out <= 0xdfff) {
+ if (Py_UNICODE_IS_SURROGATE(*out)) {
/* Surrogate character. Escape the original
byte sequence with surrogateescape. */
argsize -= converted;