bpo-32583: Fix a possible crash in builtin Unicode decoders (#5325)

When a custom decode error handler is used, it is possible for builtin decoders
to write out of bounds and crash.
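
For reference, the kind of error handler that triggers the problem looks like the
sketch below (illustration only, mirroring the new tests; the handler name is made
up). It returns a one-character replacement but resumes only one byte past the
error start, so more input remains than the output buffer was originally sized for:

    import codecs

    def forward_shorter_than_end(exc):
        # Resume one byte after the error start, even though the reported
        # error spans more bytes (0 < new position < exc.end).
        if isinstance(exc, UnicodeDecodeError):
            return ('\ufffd', exc.start + 1)
        raise TypeError("don't know how to handle %r" % exc)

    codecs.register_error("example.forward_shorter_than_end",
                          forward_shorter_than_end)

    # With this fix the call below returns '\ufffd\ufffd\ufffd\ufffd\xd8\x00';
    # previously the decoder could write past the end of its output buffer.
    print(b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
        'utf-16-le', 'example.forward_shorter_than_end'))
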
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 0c066e6..e2e7463 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -1044,6 +1044,58 @@
             for (encoding, data) in baddata:
                 self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
 
+    # issue32583
+    def test_crashing_decode_handler(self):
+        # Generate one extra character to fill the spare slot in the output
+        # buffer so that the failure shows up reliably in debug builds.
+        def forward_shorter_than_end(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                # one replacement character; resume position satisfies 0 < pos < exc.end
+                return ('\ufffd', exc.start+1)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error(
+            "test.forward_shorter_than_end", forward_shorter_than_end)
+
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
+                'utf-16-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
+                'utf-16-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
+                'utf-32-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
+                'utf-32-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+
+        def replace_with_long(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                exc.object = b"\x00" * 8
+                return ('\ufffd', exc.start)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replace_with_long", replace_with_long)
+
+        self.assertEqual(
+            b'\x00'.decode('utf-16', 'test.replace_with_long'),
+            '\ufffd\x00\x00\x00\x00'
+        )
+        self.assertEqual(
+            b'\x00'.decode('utf-32', 'test.replace_with_long'),
+            '\ufffd\x00\x00'
+        )
+
+
     def test_fake_error_class(self):
         handlers = [
             codecs.strict_errors,
diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst b/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst
new file mode 100644
index 0000000..45f1d04
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst
@@ -0,0 +1,2 @@
+Fix a possible crash in builtin Unicode decoders caused by out-of-bounds
+writes when a custom decode error handler is used.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 775bd15..3d9e09d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4190,7 +4190,10 @@
     Py_ssize_t insize;
     Py_ssize_t newpos;
     Py_ssize_t replen;
+    Py_ssize_t remain;
     PyObject *inputobj = NULL;
+    int need_to_grow = 0;
+    const char *new_inptr;
 
     if (*errorHandler == NULL) {
         *errorHandler = PyCodec_LookupError(errors);
@@ -4221,6 +4224,7 @@
     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
     if (!inputobj)
         goto onError;
+    remain = *inend - *input - *endinpos;
     *input = PyBytes_AS_STRING(inputobj);
     insize = PyBytes_GET_SIZE(inputobj);
     *inend = *input + insize;
@@ -4238,6 +4242,19 @@
     replen = PyUnicode_GET_LENGTH(repunicode);
     if (replen > 1) {
         writer->min_length += replen - 1;
+        need_to_grow = 1;
+    }
+    new_inptr = *input + newpos;
+    if (*inend - new_inptr > remain) {
+    /* We don't know the decoding algorithm here, so we make the
+       worst-case assumption that each input byte decodes to at most
+       one Unicode character.  If one byte could decode to more than
+       one character, the decoder could still write out of bounds;
+       it is unclear whether any algorithm using this function can. */
+        writer->min_length += *inend - new_inptr - remain;
+        need_to_grow = 1;
+    }
+    if (need_to_grow) {
         writer->overallocate = 1;
         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4247,7 +4264,7 @@
         goto onError;
 
     *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;
 
     /* we made it! */
     Py_DECREF(restuple);
@@ -5572,7 +5589,8 @@
 #endif
 
     /* Note: size will always be longer than the resulting Unicode
-       character count */
+       character count in normal cases.  The error handler takes care
+       of resizing the buffer when needed. */
     _PyUnicodeWriter_Init(&writer);
     writer.min_length = (e - q + 1) / 2;
     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)