Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler.

commit: db12d454e6176e9c933babe3ce40b225307c6305 [log] [tgz]
author: Martin v. Löwis <martin@v.loewis.de> Sat May 02 18:52:14 2009 +0000
committer: Martin v. Löwis <martin@v.loewis.de> Sat May 02 18:52:14 2009 +0000
tree: 28b09c64e9dfd797da58a98725bfb93b4dae7077
parent: 02953d244fdb2fe99853d2fe5db905df53c6596f [diff] [blame]
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 68d4fc4..cc70bad 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c

@@ -154,6 +154,11 @@
     0, 0, 0, 0, 0, 0, 0, 0
 };
 
+static PyObject *unicode_encode_call_errorhandler(const char *errors,
+       PyObject **errorHandler,const char *encoding, const char *reason,
+       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
+       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
+
 /* Same for linebreaks */
 static unsigned char ascii_linebreak[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
@@ -2214,14 +2219,7 @@
                 goto utf8Error;
             }
             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
-            if (ch < 0x0800) {
-                /* Note: UTF-8 encodings of surrogates are considered
-                   legal UTF-8 sequences;
-
-                   XXX For wide builds (UCS-4) we should probably try
-                   to recombine the surrogates into a single code
-                   unit.
-                */
+            if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
                 errmsg = "illegal encoding";
                 startinpos = s-starts;
                 endinpos = startinpos+3;
@@ -2328,6 +2326,8 @@
     Py_ssize_t nallocated;      /* number of result bytes allocated */
     Py_ssize_t nneeded;            /* number of result bytes needed */
     char stackbuf[MAX_SHORT_UNICHARS * 4];
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
 
     assert(s != NULL);
     assert(size >= 0);
@@ -2367,6 +2367,7 @@
         else {
             /* Encode UCS2 Unicode ordinals */
             if (ch < 0x10000) {
+#ifndef Py_UNICODE_WIDE
                 /* Special case: check for high surrogate */
                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
                     Py_UCS4 ch2 = s[i];
@@ -2379,6 +2380,36 @@
                     }
                     /* Fall through: handles isolated high surrogates */
                 }
+#endif
+                if (ch >= 0xd800 && ch <= 0xdfff) {
+                    Py_ssize_t newpos;
+                    PyObject *rep;
+                    char *prep;
+                    int k;
+                    rep = unicode_encode_call_errorhandler
+                        (errors, &errorHandler, "utf-8", "surrogates not allowed", 
+                         s, size, &exc, i-1, i, &newpos);
+                    if (!rep)
+                        goto error;
+                    /* Implementation limitations: only error handlers that return
+                       bytes are supported, and at most four replacement bytes. */
+                    if (!PyBytes_Check(rep)) {
+                        PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
+                        Py_DECREF(rep);
+                        goto error;
+                    }
+                    if (PyBytes_Size(rep) > 4) {
+                        PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
+                        Py_DECREF(rep);
+                        goto error;
+                    }
+                    prep = PyBytes_AsString(rep);
+                    for(k = PyBytes_Size(rep); k > 0; k--)
+                        *p++ = *prep++;
+                    Py_DECREF(rep);
+                    continue;
+                    
+                }
                 *p++ = (char)(0xe0 | (ch >> 12));
                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                 *p++ = (char)(0x80 | (ch & 0x3f));
@@ -2405,7 +2436,14 @@
         assert(nneeded <= nallocated);
         _PyBytes_Resize(&result, nneeded);
     }
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return result;
+ error:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    Py_XDECREF(result);
+    return NULL;
 
 #undef MAX_SHORT_UNICHARS
 }
@@ -3897,7 +3935,7 @@
                                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                                   Py_ssize_t *newpos)
 {
-    static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
+    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
 
     PyObject *restuple;
     PyObject *resunicode;
@@ -3918,15 +3956,20 @@
     if (restuple == NULL)
         return NULL;
     if (!PyTuple_Check(restuple)) {
-        PyErr_SetString(PyExc_TypeError, &argparse[4]);
+        PyErr_SetString(PyExc_TypeError, &argparse[3]);
         Py_DECREF(restuple);
         return NULL;
     }
-    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+    if (!PyArg_ParseTuple(restuple, argparse,
                           &resunicode, newpos)) {
         Py_DECREF(restuple);
         return NULL;
     }
+    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
+        PyErr_SetString(PyExc_TypeError, &argparse[3]);
+        Py_DECREF(restuple);
+        return NULL;
+    }
     if (*newpos<0)
         *newpos = size+*newpos;
     if (*newpos<0 || *newpos>size) {
@@ -4064,6 +4107,12 @@
                                                               collstart-startp, collend-startp, &newpos);
                 if (repunicode == NULL)
                     goto onError;
+                if (!PyUnicode_Check(repunicode)) {
+                    /* Implementation limitation: byte results not supported yet. */
+                    PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+                    Py_DECREF(repunicode);
+                    goto onError;
+                }
                 /* need more space? (at least enough for what we
                    have+the replacement+the rest of the string, so
                    we won't have to check space for encodable characters) */
@@ -5027,6 +5076,12 @@
                                                       collstartpos, collendpos, &newpos);
         if (repunicode == NULL)
             return -1;
+        if (!PyUnicode_Check(repunicode)) {
+            /* Implementation limitation: byte results not supported yet. */
+            PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+            Py_DECREF(repunicode);
+            return -1;
+        }
         /* generate replacement  */
         repsize = PyUnicode_GET_SIZE(repunicode);
         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
@@ -5588,6 +5643,12 @@
                                                           collstart-s, collend-s, &newpos);
             if (repunicode == NULL)
                 goto onError;
+            if (!PyUnicode_Check(repunicode)) {
+                /* Implementation limitation: byte results not supported yet. */
+                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+                Py_DECREF(repunicode);
+                goto onError;
+            }
             /* generate replacement  */
             repsize = PyUnicode_GET_SIZE(repunicode);
             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
commit	db12d454e6176e9c933babe3ce40b225307c6305	[log] [tgz]
author	Martin v. Löwis <martin@v.loewis.de>	Sat May 02 18:52:14 2009 +0000
committer	Martin v. Löwis <martin@v.loewis.de>	Sat May 02 18:52:14 2009 +0000
tree	28b09c64e9dfd797da58a98725bfb93b4dae7077
parent	02953d244fdb2fe99853d2fe5db905df53c6596f [diff] [blame]