Encode surrogates in UTF-8 even for a wide Py_UNICODE. Implement sys.maxunicode. Explicitly wrap around upper/lower computations for wide Py_UNICODE. When decoding large characters with UTF-8, represent expected test results using the \U notation.

commit: ce9b5a55e164f1128756478b6a2bb548abec1980 [log] [tgz]
author: Martin v. Löwis <martin@v.loewis.de> Wed Jun 27 06:28:56 2001 +0000
committer: Martin v. Löwis <martin@v.loewis.de> Wed Jun 27 06:28:56 2001 +0000
tree: 0b616e0fae5ec7204f723235d196ae2b7c124d78
parent: 236d8b79748fec890d57ad0dd99ea3f1c3ba57df [diff]
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 87e01af..d89537f 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h

@@ -274,6 +274,9 @@
     PyObject *unicode	 	/* Unicode object */
     );
 
+/* Get the maximum ordinal for a Unicode character. */
+extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
+
 /* Resize an already allocated Unicode object to the new size length.
 
    *unicode is modified to point to the new (resized) object and 0

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index c82ac69..c9732d6 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py

@@ -386,9 +386,9 @@
        ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
 # UTF-8 specific decoding tests
 verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
-               'utf-8') == u'\ud84d\udc56' )
+               'utf-8') == u'\U00023456' )
 verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
-               'utf-8') == u'\ud800\udc02' )
+               'utf-8') == u'\U00010002' )
 verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
                'utf-8') == u'\u20ac' )
 

diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
index 3bc19b2..13fc612 100644
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c

@@ -59,14 +59,21 @@
 /* Returns the titlecase Unicode characters corresponding to ch or just
    ch if no titlecase mapping is known. */
 
-Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
     if (ctype->title)
-        return ch + ctype->title;
+        ch += ctype->title;
+    else
+	ch += ctype->upper;
 
-    return ch + ctype->upper;
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000. */
+    if (ch > 0x10000)
+	ch -= 0x10000;
+#endif
+    return ch;
 }
 
 /* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -348,21 +355,33 @@
 /* Returns the uppercase Unicode characters corresponding to ch or just
    ch if no uppercase mapping is known. */
 
-Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
-    return ch + ctype->upper;
+    ch += ctype->upper;
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000. */
+    if (ch > 0x10000)
+	ch -= 0x10000;
+#endif
+    return ch;
 }
 
 /* Returns the lowercase Unicode characters corresponding to ch or just
    ch if no lowercase mapping is known. */
 
-Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
-    return ch + ctype->lower;
+    ch += ctype->lower;
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000. */
+    if (ch > 0x10000)
+	ch -= 0x10000;
+#endif
+    return ch;
 }
 
 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ffac371..2f66c3c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c

@@ -103,6 +103,18 @@
 */
 static char unicode_default_encoding[100];
 
+Py_UNICODE
+PyUnicode_GetMax()
+{
+#ifdef USE_UCS4_STORAGE
+	return 0x10FFFF;
+#else
+	/* This is actually an illegal character, so it should
+	   not be passed to unichr. */
+	return 0xFFFF;
+#endif
+}
+
 /* --- Unicode Object ----------------------------------------------------- */
 
 static
@@ -884,12 +896,6 @@
             cbWritten += 2;
         }
         else if (ch < 0x10000) {
-#if Py_UNICODE_SIZE == 4
-	    *p++ = 0xe0 | (ch>>12);
-            *p++ = 0x80 | ((ch>>6) & 0x3f);
-            *p++ = 0x80 | (ch & 0x3f);
-            cbWritten += 3;
-#else
             /* Check for high surrogate */
             if (0xD800 <= ch && ch <= 0xDBFF) {
                 if (i != size) {
@@ -920,7 +926,6 @@
             }
             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
             *p++ = (char)(0x80 | (ch & 0x3f));
-#endif
         } else {
             *p++ = 0xf0 | (ch>>18);
             *p++ = 0x80 | ((ch>>12) & 0x3f);

diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 62e0841..fe880d5 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c

@@ -533,6 +533,7 @@
 Static objects:\n\
 \n\
 maxint -- the largest supported integer (the smallest is -maxint-1)\n\
+maxunicode -- the largest supported character\n\
 builtin_module_names -- tuple of module names built into this intepreter\n\
 version -- the version of this interpreter as a string\n\
 version_info -- version information as a tuple\n\
@@ -643,6 +644,9 @@
 	PyDict_SetItemString(sysdict, "maxint",
 			     v = PyInt_FromLong(PyInt_GetMax()));
 	Py_XDECREF(v);
+	PyDict_SetItemString(sysdict, "maxunicode",
+			     v = PyInt_FromLong(PyUnicode_GetMax()));
+	Py_XDECREF(v);
 	PyDict_SetItemString(sysdict, "builtin_module_names",
 		   v = list_builtin_module_names());
 	Py_XDECREF(v);
commit	ce9b5a55e164f1128756478b6a2bb548abec1980	[log] [tgz]
author	Martin v. Löwis <martin@v.loewis.de>	Wed Jun 27 06:28:56 2001 +0000
committer	Martin v. Löwis <martin@v.loewis.de>	Wed Jun 27 06:28:56 2001 +0000
tree	0b616e0fae5ec7204f723235d196ae2b7c124d78
parent	236d8b79748fec890d57ad0dd99ea3f1c3ba57df [diff]