Issue #11489: JSON decoder now accepts lone surrogates.
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index dfcc628..1b43238 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -62,6 +62,16 @@
DEFAULT_ENCODING = "utf-8"
+def _decode_uXXXX(s, pos):
+ esc = s[pos + 1:pos + 5]
+ if len(esc) == 4 and esc[1] not in 'xX':
+ try:
+ return int(esc, 16)
+ except ValueError:
+ pass
+ msg = "Invalid \\uXXXX escape"
+ raise ValueError(errmsg(msg, s, pos))
+
def py_scanstring(s, end, encoding=None, strict=True,
_b=BACKSLASH, _m=STRINGCHUNK.match):
"""Scan the string s for a JSON string. End is the index of the
@@ -116,25 +126,16 @@
end += 1
else:
# Unicode escape sequence
- esc = s[end + 1:end + 5]
- next_end = end + 5
- if len(esc) != 4:
- msg = "Invalid \\uXXXX escape"
- raise ValueError(errmsg(msg, s, end))
- uni = int(esc, 16)
+ uni = _decode_uXXXX(s, end)
+ end += 5
# Check for surrogate pair on UCS-4 systems
- if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
- msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
- if not s[end + 5:end + 7] == '\\u':
- raise ValueError(errmsg(msg, s, end))
- esc2 = s[end + 7:end + 11]
- if len(esc2) != 4:
- raise ValueError(errmsg(msg, s, end))
- uni2 = int(esc2, 16)
- uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
- next_end += 6
+ if sys.maxunicode > 65535 and \
+ 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
+ uni2 = _decode_uXXXX(s, end + 1)
+ if 0xdc00 <= uni2 <= 0xdfff:
+ uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
+ end += 6
char = unichr(uni)
- end = next_end
# Append the unescaped character
_append(char)
return u''.join(chunks), end
diff --git a/Lib/json/tests/test_scanstring.py b/Lib/json/tests/test_scanstring.py
index 4fef8cb..d17ebdd 100644
--- a/Lib/json/tests/test_scanstring.py
+++ b/Lib/json/tests/test_scanstring.py
@@ -5,10 +5,6 @@
class TestScanstring(object):
def test_scanstring(self):
scanstring = self.json.decoder.scanstring
- self.assertEqual(
- scanstring('"z\\ud834\\udd20x"', 1, None, True),
- (u'z\U0001d120x', 16))
-
if sys.maxunicode == 65535:
self.assertEqual(
scanstring(u'"z\U0001d120x"', 1, None, True),
@@ -94,6 +90,57 @@
scanstring('["Bad value", truth]', 2, None, True),
(u'Bad value', 12))
+ def test_surrogates(self):
+ scanstring = self.json.decoder.scanstring
+ def assertScan(given, expect):
+ self.assertEqual(scanstring(given, 1, None, True),
+ (expect, len(given)))
+ if not isinstance(given, unicode):
+ given = unicode(given)
+ self.assertEqual(scanstring(given, 1, None, True),
+ (expect, len(given)))
+
+ assertScan('"z\\ud834\\u0079x"', u'z\ud834yx')
+ assertScan('"z\\ud834\\udd20x"', u'z\U0001d120x')
+ assertScan('"z\\ud834\\ud834\\udd20x"', u'z\ud834\U0001d120x')
+ assertScan('"z\\ud834x"', u'z\ud834x')
+ assertScan(u'"z\\ud834\udd20x12345"', u'z\ud834\udd20x12345')
+ assertScan('"z\\udd20x"', u'z\udd20x')
+ assertScan(u'"z\ud834\udd20x"', u'z\ud834\udd20x')
+ assertScan(u'"z\ud834\\udd20x"', u'z\ud834\udd20x')
+ assertScan(u'"z\ud834x"', u'z\ud834x')
+
+ def test_bad_escapes(self):
+ scanstring = self.json.decoder.scanstring
+ bad_escapes = [
+ '"\\"',
+ '"\\x"',
+ '"\\u"',
+ '"\\u0"',
+ '"\\u01"',
+ '"\\u012"',
+ '"\\uz012"',
+ '"\\u0z12"',
+ '"\\u01z2"',
+ '"\\u012z"',
+ '"\\u0x12"',
+ '"\\u0X12"',
+ '"\\ud834\\"',
+ '"\\ud834\\u"',
+ '"\\ud834\\ud"',
+ '"\\ud834\\udd"',
+ '"\\ud834\\udd2"',
+ '"\\ud834\\uzdd2"',
+ '"\\ud834\\udzd2"',
+ '"\\ud834\\uddz2"',
+ '"\\ud834\\udd2z"',
+ '"\\ud834\\u0x20"',
+ '"\\ud834\\u0X20"',
+ ]
+ for s in bad_escapes:
+ with self.assertRaises(ValueError):
+ scanstring(s, 1, None, True)
+
def test_issue3623(self):
self.assertRaises(ValueError, self.json.decoder.scanstring, b"xxx", 1,
"xxx")