Change PyUnicode_FromString[AndSize] to expect UTF-8.

commit: 9c121069d3a61868f4586ad2ba2e5435a82af061 [log] [tgz]
author: Martin v. Löwis <martin@v.loewis.de> Sun Aug 05 20:26:11 2007 +0000
committer: Martin v. Löwis <martin@v.loewis.de> Sun Aug 05 20:26:11 2007 +0000
tree: 2b855fe92ed298ec849c14a4f01a9c0402a6fff7
parent: 64ce5052e1c2495bcbc78f732e8ece2f4c8375ac [diff]
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 25f7763..47ee8a4 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c

@@ -2724,11 +2724,13 @@
 static PyObject *
 bytes_reduce(PyBytesObject *self)
 {
-    return Py_BuildValue("(O(s#s))",
-                         Py_Type(self),
-                         self->ob_bytes == NULL ? "" : self->ob_bytes,
-                         Py_Size(self),
-                         "latin-1");
+    PyObject *latin1;
+    if (self->ob_bytes)
+	latin1 = PyUnicode_DecodeLatin1(self->ob_bytes, 
+					Py_Size(self), NULL);
+    else
+	latin1 = PyUnicode_FromString("");
+    return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1");
 }
 
 static PySequenceMethods bytes_as_sequence = {

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d1b5747..27fedca 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c

@@ -427,7 +427,9 @@
 {
     PyUnicodeObject *unicode;
     /* If the Unicode data is known at construction time, we can apply
-       some optimizations which share commonly used objects. */
+       some optimizations which share commonly used objects.
+       Non-NULL input must be UTF-8; anything the shortcuts below do
+       not handle is passed to the UTF-8 decoder. */
     if (u != NULL) {
 
 	/* Optimization for empty strings */
@@ -436,8 +438,9 @@
 	    return (PyObject *)unicode_empty;
 	}
 
-	/* Single characters are shared when using this constructor */
-	if (size == 1) {
+	/* Single characters are shared when using this constructor.
+           Restrict to ASCII: a lone byte >= 128 is not valid UTF-8. */
+	if (size == 1 && Py_CHARMASK(*u) < 128) {
 	    unicode = unicode_latin1[Py_CHARMASK(*u)];
 	    if (!unicode) {
 		unicode = _PyUnicode_New(1);
@@ -449,21 +452,14 @@
 	    Py_INCREF(unicode);
 	    return (PyObject *)unicode;
 	}
+
+        return PyUnicode_DecodeUTF8(u, size, NULL);
     }
 
     unicode = _PyUnicode_New(size);
     if (!unicode)
         return NULL;
 
-    /* Copy the Unicode data into the new object */
-    if (u != NULL) {
-        Py_UNICODE *p = unicode->str;
-        while (size--)
-            *p++ = Py_CHARMASK(*u++);
-        /* Don't need to write trailing 0 because
-           that's already done by _PyUnicode_New */
-    }
-
     return (PyObject *)unicode;
 }
commit	9c121069d3a61868f4586ad2ba2e5435a82af061	[log] [tgz]
author	Martin v. Löwis <martin@v.loewis.de>	Sun Aug 05 20:26:11 2007 +0000
committer	Martin v. Löwis <martin@v.loewis.de>	Sun Aug 05 20:26:11 2007 +0000
tree	2b855fe92ed298ec849c14a4f01a9c0402a6fff7
parent	64ce5052e1c2495bcbc78f732e8ece2f4c8375ac [diff]