blob: 37e292d06c5ae1d242e84d39c0a8d27d494bf33b [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Maximum number of Unicode objects that unicode_dealloc() parks on
   the free list instead of releasing them. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   An object that is put on the free list keeps its character buffer
   as long as its length is below this limit; buffers of longer
   objects are released right away.  This saves malloc() calls for
   small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   A limit of 0 effectively switches the optimization off.

   Note: This is an experimental feature !  If you get core dumps
   when using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !
   Calls PyUnicode_Resize() on a PyUnicodeObject ** variable, adding
   the cast to PyObject ** that the public function expects. */
#define _PyUnicode_Resize(objvar, newlen) \
        PyUnicode_Resize(((PyObject **)(objvar)), (newlen))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000371 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000387
388 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000390 size = PyUnicode_GET_SIZE(unicode) + 1;
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392#ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w, unicode->str, size * sizeof(wchar_t));
394#else
395 {
396 register Py_UNICODE *u;
397 register int i;
398 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000399 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 *w++ = *u++;
401 }
402#endif
403
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000404 if (size > PyUnicode_GET_SIZE(unicode))
405 return PyUnicode_GET_SIZE(unicode);
406 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000407 return size;
408}
409
410#endif
411
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000412PyObject *PyUnicode_FromOrdinal(int ordinal)
413{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000414 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000415
416#ifdef Py_UNICODE_WIDE
417 if (ordinal < 0 || ordinal > 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
421 return NULL;
422 }
423#else
424 if (ordinal < 0 || ordinal > 0xffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
428 return NULL;
429 }
430#endif
431
Hye-Shik Chang40574832004-04-06 07:24:51 +0000432 s[0] = (Py_UNICODE)ordinal;
433 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000434}
435
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436PyObject *PyUnicode_FromObject(register PyObject *obj)
437{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj)) {
441 Py_INCREF(obj);
442 return obj;
443 }
444 if (PyUnicode_Check(obj)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
448 PyUnicode_GET_SIZE(obj));
449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
451}
452
453PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
454 const char *encoding,
455 const char *errors)
456{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000457 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 if (obj == NULL) {
462 PyErr_BadInternalCall();
463 return NULL;
464 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466#if 0
467 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000468 that no encodings is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
470 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000471
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000472 NOTE: This API should really only be used for object which
473 represent *encoded* Unicode !
474
475 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000476 if (PyUnicode_Check(obj)) {
477 if (encoding) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#else
485 if (PyUnicode_Check(obj)) {
486 PyErr_SetString(PyExc_TypeError,
487 "decoding Unicode is not supported");
488 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000489 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000490#endif
491
492 /* Coerce object */
493 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 s = PyString_AS_STRING(obj);
495 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000497 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000502 "coercing to Unicode: need string or buffer, "
503 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000504 obj->ob_type->tp_name);
505 goto onError;
506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 if (len == 0) {
510 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000511 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000515
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520}
521
522PyObject *PyUnicode_Decode(const char *s,
523 int size,
524 const char *encoding,
525 const char *errors)
526{
527 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528
529 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000530 encoding = PyUnicode_GetDefaultEncoding();
531
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000537#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s, size, errors);
540#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000541 else if (strcmp(encoding, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
544 /* Decode via the codec registry */
545 buffer = PyBuffer_FromMemory((void *)s, size);
546 if (buffer == NULL)
547 goto onError;
548 unicode = PyCodec_Decode(buffer, encoding, errors);
549 if (unicode == NULL)
550 goto onError;
551 if (!PyUnicode_Check(unicode)) {
552 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000553 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 unicode->ob_type->tp_name);
555 Py_DECREF(unicode);
556 goto onError;
557 }
558 Py_DECREF(buffer);
559 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000560
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 onError:
562 Py_XDECREF(buffer);
563 return NULL;
564}
565
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000566PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v;
571
572 if (!PyUnicode_Check(unicode)) {
573 PyErr_BadArgument();
574 goto onError;
575 }
576
577 if (encoding == NULL)
578 encoding = PyUnicode_GetDefaultEncoding();
579
580 /* Decode via the codec registry */
581 v = PyCodec_Decode(unicode, encoding, errors);
582 if (v == NULL)
583 goto onError;
584 return v;
585
586 onError:
587 return NULL;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590PyObject *PyUnicode_Encode(const Py_UNICODE *s,
591 int size,
592 const char *encoding,
593 const char *errors)
594{
595 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 unicode = PyUnicode_FromUnicode(s, size);
598 if (unicode == NULL)
599 return NULL;
600 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
601 Py_DECREF(unicode);
602 return v;
603}
604
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000605PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
606 const char *encoding,
607 const char *errors)
608{
609 PyObject *v;
610
611 if (!PyUnicode_Check(unicode)) {
612 PyErr_BadArgument();
613 goto onError;
614 }
615
616 if (encoding == NULL)
617 encoding = PyUnicode_GetDefaultEncoding();
618
619 /* Encode via the codec registry */
620 v = PyCodec_Encode(unicode, encoding, errors);
621 if (v == NULL)
622 goto onError;
623 return v;
624
625 onError:
626 return NULL;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
630 const char *encoding,
631 const char *errors)
632{
633 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 if (!PyUnicode_Check(unicode)) {
636 PyErr_BadArgument();
637 goto onError;
638 }
Fred Drakee4315f52000-05-09 19:53:39 +0000639
Tim Petersced69f82003-09-16 20:30:58 +0000640 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000641 encoding = PyUnicode_GetDefaultEncoding();
642
643 /* Shortcuts for common default encodings */
644 if (errors == NULL) {
645 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000646 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000649#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode);
652#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000653 else if (strcmp(encoding, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode);
655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656
657 /* Encode via the codec registry */
658 v = PyCodec_Encode(unicode, encoding, errors);
659 if (v == NULL)
660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 if (!PyString_Check(v)) {
662 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 v->ob_type->tp_name);
665 Py_DECREF(v);
666 goto onError;
667 }
668 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000669
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 onError:
671 return NULL;
672}
673
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000674PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
675 const char *errors)
676{
677 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
678
679 if (v)
680 return v;
681 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
682 if (v && errors == NULL)
683 ((PyUnicodeObject *)unicode)->defenc = v;
684 return v;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
688{
689 if (!PyUnicode_Check(unicode)) {
690 PyErr_BadArgument();
691 goto onError;
692 }
693 return PyUnicode_AS_UNICODE(unicode);
694
695 onError:
696 return NULL;
697}
698
699int PyUnicode_GetSize(PyObject *unicode)
700{
701 if (!PyUnicode_Check(unicode)) {
702 PyErr_BadArgument();
703 goto onError;
704 }
705 return PyUnicode_GET_SIZE(unicode);
706
707 onError:
708 return -1;
709}
710
Thomas Wouters78890102000-07-22 19:25:51 +0000711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000712{
713 return unicode_default_encoding;
714}
715
716int PyUnicode_SetDefaultEncoding(const char *encoding)
717{
718 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000719
Fred Drakee4315f52000-05-09 19:53:39 +0000720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v = _PyCodec_Lookup(encoding);
723 if (v == NULL)
724 goto onError;
725 Py_DECREF(v);
726 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000727 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000728 sizeof(unicode_default_encoding));
729 return 0;
730
731 onError:
732 return -1;
733}
734
/* Error handling callback helper:
   build the arguments, call the callback and check its result;
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   Returns 0 on success, -1 on error.
*/
741
742static
743int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
744 const char *encoding, const char *reason,
745 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
746 PyObject **output, int *outpos, Py_UNICODE **outptr)
747{
748 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
749
750 PyObject *restuple = NULL;
751 PyObject *repunicode = NULL;
752 int outsize = PyUnicode_GET_SIZE(*output);
753 int requiredsize;
754 int newpos;
755 Py_UNICODE *repptr;
756 int repsize;
757 int res = -1;
758
759 if (*errorHandler == NULL) {
760 *errorHandler = PyCodec_LookupError(errors);
761 if (*errorHandler == NULL)
762 goto onError;
763 }
764
765 if (*exceptionObject == NULL) {
766 *exceptionObject = PyUnicodeDecodeError_Create(
767 encoding, input, insize, *startinpos, *endinpos, reason);
768 if (*exceptionObject == NULL)
769 goto onError;
770 }
771 else {
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
773 goto onError;
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
777 goto onError;
778 }
779
780 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
781 if (restuple == NULL)
782 goto onError;
783 if (!PyTuple_Check(restuple)) {
784 PyErr_Format(PyExc_TypeError, &argparse[4]);
785 goto onError;
786 }
787 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
788 goto onError;
789 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000790 newpos = insize+newpos;
791 if (newpos<0 || newpos>insize) {
792 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
793 goto onError;
794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr = PyUnicode_AS_UNICODE(repunicode);
801 repsize = PyUnicode_GET_SIZE(repunicode);
802 requiredsize = *outpos + repsize + insize-newpos;
803 if (requiredsize > outsize) {
804 if (requiredsize<2*outsize)
805 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000806 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807 goto onError;
808 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
809 }
810 *endinpos = newpos;
811 *inptr = input + newpos;
812 Py_UNICODE_COPY(*outptr, repptr, repsize);
813 *outptr += repsize;
814 *outpos += repsize;
815 /* we made it! */
816 res = 0;
817
818 onError:
819 Py_XDECREF(restuple);
820 return res;
821}
822
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000823/* --- UTF-7 Codec -------------------------------------------------------- */
824
825/* see RFC2152 for details */
826
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character code.  The table is only ever read, so it is
   declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
845
/* SPECIAL(c, encodeO, encodeWS) is true when character c may not be
   written directly in UTF-7: anything outside 1..127, anything the
   utf7_special table classifies as 1, whitespace (class 2) when
   encodeWS is set, and RFC2152 Set O (class 3) when encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.  Explicit
   ASCII range tests are used instead of isalnum(): isalnum() depends on
   the current locale and is undefined for arguments that do not fit in
   an unsigned char, whereas c may be any Py_UNICODE value here. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; c must satisfy B64CHAR. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are buffered in ch,
   emit the topmost six as a base64 digit through out and drop them
   from the bit count.  Updates out and bits. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits) - 6)); \
        (bits) -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   buffered in ch, take the topmost sixteen as one code unit and store
   it through out.  A code unit in 0xDC00..0xDFFF is reported through
   errmsg and a jump to the utf7Error label of the enclosing function,
   and `surrogate` is set so that the following code unit is dropped
   silently: an error has already been generated for the pair, so there
   is no point checking whether its other half is correct.
   Updates out, bits and surrogate. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            (surrogate) = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *(out)++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000888
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889PyObject *PyUnicode_DecodeUTF7(const char *s,
890 int size,
891 const char *errors)
892{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000893 const char *starts = s;
894 int startinpos;
895 int endinpos;
896 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000897 const char *e;
898 PyUnicodeObject *unicode;
899 Py_UNICODE *p;
900 const char *errmsg = "";
901 int inShift = 0;
902 unsigned int bitsleft = 0;
903 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 int surrogate = 0;
905 PyObject *errorHandler = NULL;
906 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 unicode = _PyUnicode_New(size);
909 if (!unicode)
910 return NULL;
911 if (size == 0)
912 return (PyObject *)unicode;
913
914 p = unicode->str;
915 e = s + size;
916
917 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000918 Py_UNICODE ch;
919 restart:
920 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921
922 if (inShift) {
923 if ((ch == '-') || !B64CHAR(ch)) {
924 inShift = 0;
925 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000926
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000927 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
928 if (bitsleft >= 6) {
929 /* The shift sequence has a partial character in it. If
930 bitsleft < 6 then we could just classify it as padding
931 but that is not the case here */
932
933 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000934 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 }
936 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000937 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000938 here so indicate the potential of a misencoded character. */
939
940 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
941 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
942 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000943 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 }
945
946 if (ch == '-') {
947 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000948 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 inShift = 1;
950 }
951 } else if (SPECIAL(ch,0,0)) {
952 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000953 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 } else {
955 *p++ = ch;
956 }
957 } else {
958 charsleft = (charsleft << 6) | UB64(ch);
959 bitsleft += 6;
960 s++;
961 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
962 }
963 }
964 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000965 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 s++;
967 if (s < e && *s == '-') {
968 s++;
969 *p++ = '+';
970 } else
971 {
972 inShift = 1;
973 bitsleft = 0;
974 }
975 }
976 else if (SPECIAL(ch,0,0)) {
977 errmsg = "unexpected special character";
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 }
981 else {
982 *p++ = ch;
983 s++;
984 }
985 continue;
986 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000987 outpos = p-PyUnicode_AS_UNICODE(unicode);
988 endinpos = s-starts;
989 if (unicode_decode_call_errorhandler(
990 errors, &errorHandler,
991 "utf7", errmsg,
992 starts, size, &startinpos, &endinpos, &exc, &s,
993 (PyObject **)&unicode, &outpos, &p))
994 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000995 }
996
997 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000998 outpos = p-PyUnicode_AS_UNICODE(unicode);
999 endinpos = size;
1000 if (unicode_decode_call_errorhandler(
1001 errors, &errorHandler,
1002 "utf7", "unterminated shift sequence",
1003 starts, size, &startinpos, &endinpos, &exc, &s,
1004 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001006 if (s < e)
1007 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 }
1009
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001010 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 goto onError;
1012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001013 Py_XDECREF(errorHandler);
1014 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 return (PyObject *)unicode;
1016
1017onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 Py_XDECREF(errorHandler);
1019 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024
1025PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1026 int size,
1027 int encodeSetO,
1028 int encodeWhiteSpace,
1029 const char *errors)
1030{
1031 PyObject *v;
1032 /* It might be possible to tighten this worst case */
1033 unsigned int cbAllocated = 5 * size;
1034 int inShift = 0;
1035 int i = 0;
1036 unsigned int bitsleft = 0;
1037 unsigned long charsleft = 0;
1038 char * out;
1039 char * start;
1040
1041 if (size == 0)
1042 return PyString_FromStringAndSize(NULL, 0);
1043
1044 v = PyString_FromStringAndSize(NULL, cbAllocated);
1045 if (v == NULL)
1046 return NULL;
1047
1048 start = out = PyString_AS_STRING(v);
1049 for (;i < size; ++i) {
1050 Py_UNICODE ch = s[i];
1051
1052 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001053 if (ch == '+') {
1054 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001055 *out++ = '-';
1056 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1057 charsleft = ch;
1058 bitsleft = 16;
1059 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001060 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001062 } else {
1063 *out++ = (char) ch;
1064 }
1065 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1067 *out++ = B64(charsleft << (6-bitsleft));
1068 charsleft = 0;
1069 bitsleft = 0;
1070 /* Characters not in the BASE64 set implicitly unshift the sequence
1071 so no '-' is required, except if the character is itself a '-' */
1072 if (B64CHAR(ch) || ch == '-') {
1073 *out++ = '-';
1074 }
1075 inShift = 0;
1076 *out++ = (char) ch;
1077 } else {
1078 bitsleft += 16;
1079 charsleft = (charsleft << 16) | ch;
1080 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1081
1082 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001083 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001084 or '-' then the shift sequence will be terminated implicitly and we
1085 don't have to insert a '-'. */
1086
1087 if (bitsleft == 0) {
1088 if (i + 1 < size) {
1089 Py_UNICODE ch2 = s[i+1];
1090
1091 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001092
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 } else if (B64CHAR(ch2) || ch2 == '-') {
1094 *out++ = '-';
1095 inShift = 0;
1096 } else {
1097 inShift = 0;
1098 }
1099
1100 }
1101 else {
1102 *out++ = '-';
1103 inShift = 0;
1104 }
1105 }
Tim Petersced69f82003-09-16 20:30:58 +00001106 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001107 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 if (bitsleft) {
1110 *out++= B64(charsleft << (6-bitsleft) );
1111 *out++ = '-';
1112 }
1113
Tim Peters5de98422002-04-27 18:44:32 +00001114 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001115 return v;
1116}
1117
/* The UTF-7 helper macros are private to the codec above. */
#undef B64
#undef B64CHAR
#undef DECODE
#undef ENCODE
#undef SPECIAL
#undef UB64
1124
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125/* --- UTF-8 Codec -------------------------------------------------------- */
1126
/* Lookup table indexed by the first byte of a UTF-8 sequence.  The
   table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1148
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149PyObject *PyUnicode_DecodeUTF8(const char *s,
1150 int size,
1151 const char *errors)
1152{
Walter Dörwald69652032004-09-07 20:24:22 +00001153 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1154}
1155
1156PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1157 int size,
1158 const char *errors,
1159 int *consumed)
1160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001161 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163 int startinpos;
1164 int endinpos;
1165 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 const char *e;
1167 PyUnicodeObject *unicode;
1168 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001169 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001170 PyObject *errorHandler = NULL;
1171 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172
1173 /* Note: size will always be longer than the resulting Unicode
1174 character count */
1175 unicode = _PyUnicode_New(size);
1176 if (!unicode)
1177 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001178 if (size == 0) {
1179 if (consumed)
1180 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183
1184 /* Unpack UTF-8 encoded data */
1185 p = unicode->str;
1186 e = s + size;
1187
1188 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001192 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 s++;
1194 continue;
1195 }
1196
1197 n = utf8_code_length[ch];
1198
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001200 if (consumed)
1201 break;
1202 else {
1203 errmsg = "unexpected end of data";
1204 startinpos = s-starts;
1205 endinpos = size;
1206 goto utf8Error;
1207 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 switch (n) {
1211
1212 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001213 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 startinpos = s-starts;
1215 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001216 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220 startinpos = s-starts;
1221 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223
1224 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001225 if ((s[1] & 0xc0) != 0x80) {
1226 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 startinpos = s-starts;
1228 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001229 goto utf8Error;
1230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001232 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001233 startinpos = s-starts;
1234 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 errmsg = "illegal encoding";
1236 goto utf8Error;
1237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001239 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 break;
1241
1242 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001243 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001244 (s[2] & 0xc0) != 0x80) {
1245 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001246 startinpos = s-starts;
1247 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001248 goto utf8Error;
1249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001251 if (ch < 0x0800) {
1252 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001253 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254
1255 XXX For wide builds (UCS-4) we should probably try
1256 to recombine the surrogates into a single code
1257 unit.
1258 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260 startinpos = s-starts;
1261 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001262 goto utf8Error;
1263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001265 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 break;
1267
1268 case 4:
1269 if ((s[1] & 0xc0) != 0x80 ||
1270 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 (s[3] & 0xc0) != 0x80) {
1272 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
1276 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001277 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1278 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1279 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001280 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001281 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001282 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001283 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001284 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
1289 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001290#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001291 *p++ = (Py_UNICODE)ch;
1292#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001294
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001295 /* translate from 10000..10FFFF to 0..FFFF */
1296 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001297
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001298 /* high surrogate = top 10 bits added to D800 */
1299 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001300
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001301 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001302 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001303#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 break;
1305
1306 default:
1307 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309 startinpos = s-starts;
1310 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001311 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 }
1313 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 outpos = p-PyUnicode_AS_UNICODE(unicode);
1318 if (unicode_decode_call_errorhandler(
1319 errors, &errorHandler,
1320 "utf8", errmsg,
1321 starts, size, &startinpos, &endinpos, &exc, &s,
1322 (PyObject **)&unicode, &outpos, &p))
1323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 }
Walter Dörwald69652032004-09-07 20:24:22 +00001325 if (consumed)
1326 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327
1328 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001329 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 goto onError;
1331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 Py_XDECREF(errorHandler);
1333 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 return (PyObject *)unicode;
1335
1336onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 Py_XDECREF(errorHandler);
1338 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339 Py_DECREF(unicode);
1340 return NULL;
1341}
1342
Tim Peters602f7402002-04-27 18:03:26 +00001343/* Allocation strategy: if the string is short, convert into a stack buffer
1344 and allocate exactly as much space needed at the end. Else allocate the
1345 maximum possible needed (4 result bytes per Unicode character), and return
1346 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001347*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001348PyObject *
1349PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1350 int size,
1351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352{
Tim Peters602f7402002-04-27 18:03:26 +00001353#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001354
Tim Peters602f7402002-04-27 18:03:26 +00001355 int i; /* index into s of next input byte */
1356 PyObject *v; /* result string object */
1357 char *p; /* next free byte in output buffer */
1358 int nallocated; /* number of result bytes allocated */
1359 int nneeded; /* number of result bytes needed */
1360 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001361
Tim Peters602f7402002-04-27 18:03:26 +00001362 assert(s != NULL);
1363 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364
Tim Peters602f7402002-04-27 18:03:26 +00001365 if (size <= MAX_SHORT_UNICHARS) {
1366 /* Write into the stack buffer; nallocated can't overflow.
1367 * At the end, we'll allocate exactly as much heap space as it
1368 * turns out we need.
1369 */
1370 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1371 v = NULL; /* will allocate after we're done */
1372 p = stackbuf;
1373 }
1374 else {
1375 /* Overallocate on the heap, and give the excess back at the end. */
1376 nallocated = size * 4;
1377 if (nallocated / 4 != size) /* overflow! */
1378 return PyErr_NoMemory();
1379 v = PyString_FromStringAndSize(NULL, nallocated);
1380 if (v == NULL)
1381 return NULL;
1382 p = PyString_AS_STRING(v);
1383 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001384
Tim Peters602f7402002-04-27 18:03:26 +00001385 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001386 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001387
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001388 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001389 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001391
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001393 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001394 *p++ = (char)(0xc0 | (ch >> 6));
1395 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001396 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001397 else {
Tim Peters602f7402002-04-27 18:03:26 +00001398 /* Encode UCS2 Unicode ordinals */
1399 if (ch < 0x10000) {
1400 /* Special case: check for high surrogate */
1401 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1402 Py_UCS4 ch2 = s[i];
1403 /* Check for low surrogate and combine the two to
1404 form a UCS4 value */
1405 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001406 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001407 i++;
1408 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001409 }
Tim Peters602f7402002-04-27 18:03:26 +00001410 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414 *p++ = (char)(0x80 | (ch & 0x3f));
1415 continue;
1416 }
1417encodeUCS4:
1418 /* Encode UCS4 Unicode ordinals */
1419 *p++ = (char)(0xf0 | (ch >> 18));
1420 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1421 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1422 *p++ = (char)(0x80 | (ch & 0x3f));
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001425
Tim Peters602f7402002-04-27 18:03:26 +00001426 if (v == NULL) {
1427 /* This was stack allocated. */
1428 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1429 assert(nneeded <= nallocated);
1430 v = PyString_FromStringAndSize(stackbuf, nneeded);
1431 }
1432 else {
1433 /* Cut back to size actually needed. */
1434 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1435 assert(nneeded <= nallocated);
1436 _PyString_Resize(&v, nneeded);
1437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001439
Tim Peters602f7402002-04-27 18:03:26 +00001440#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441}
1442
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1444{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 if (!PyUnicode_Check(unicode)) {
1446 PyErr_BadArgument();
1447 return NULL;
1448 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001449 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1450 PyUnicode_GET_SIZE(unicode),
1451 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
1454/* --- UTF-16 Codec ------------------------------------------------------- */
1455
Tim Peters772747b2001-08-09 22:21:55 +00001456PyObject *
1457PyUnicode_DecodeUTF16(const char *s,
1458 int size,
1459 const char *errors,
1460 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461{
Walter Dörwald69652032004-09-07 20:24:22 +00001462 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1463}
1464
1465PyObject *
1466PyUnicode_DecodeUTF16Stateful(const char *s,
1467 int size,
1468 const char *errors,
1469 int *byteorder,
1470 int *consumed)
1471{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 const char *starts = s;
1473 int startinpos;
1474 int endinpos;
1475 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476 PyUnicodeObject *unicode;
1477 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001478 const unsigned char *q, *e;
1479 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001481 /* Offsets from q for retrieving byte pairs in the right order. */
1482#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1483 int ihi = 1, ilo = 0;
1484#else
1485 int ihi = 0, ilo = 1;
1486#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001487 PyObject *errorHandler = NULL;
1488 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489
1490 /* Note: size will always be longer than the resulting Unicode
1491 character count */
1492 unicode = _PyUnicode_New(size);
1493 if (!unicode)
1494 return NULL;
1495 if (size == 0)
1496 return (PyObject *)unicode;
1497
1498 /* Unpack UTF-16 encoded data */
1499 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001500 q = (unsigned char *)s;
1501 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502
1503 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001504 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001506 /* Check for BOM marks (U+FEFF) in the input and adjust current
1507 byte order setting accordingly. In native mode, the leading BOM
1508 mark is skipped, in all other modes, it is copied to the output
1509 stream as-is (giving a ZWNBSP character). */
1510 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001511 if (size >= 2) {
1512 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001513#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001514 if (bom == 0xFEFF) {
1515 q += 2;
1516 bo = -1;
1517 }
1518 else if (bom == 0xFFFE) {
1519 q += 2;
1520 bo = 1;
1521 }
Tim Petersced69f82003-09-16 20:30:58 +00001522#else
Walter Dörwald69652032004-09-07 20:24:22 +00001523 if (bom == 0xFEFF) {
1524 q += 2;
1525 bo = 1;
1526 }
1527 else if (bom == 0xFFFE) {
1528 q += 2;
1529 bo = -1;
1530 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001531#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001532 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534
Tim Peters772747b2001-08-09 22:21:55 +00001535 if (bo == -1) {
1536 /* force LE */
1537 ihi = 1;
1538 ilo = 0;
1539 }
1540 else if (bo == 1) {
1541 /* force BE */
1542 ihi = 0;
1543 ilo = 1;
1544 }
1545
1546 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001548 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001550 if (consumed)
1551 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 errmsg = "truncated data";
1553 startinpos = ((const char *)q)-starts;
1554 endinpos = ((const char *)e)-starts;
1555 goto utf16Error;
1556 /* The remaining input chars are ignored if the callback
1557 chooses to skip the input */
1558 }
1559 ch = (q[ihi] << 8) | q[ilo];
1560
Tim Peters772747b2001-08-09 22:21:55 +00001561 q += 2;
1562
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (ch < 0xD800 || ch > 0xDFFF) {
1564 *p++ = ch;
1565 continue;
1566 }
1567
1568 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001569 if (q >= e) {
1570 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001571 startinpos = (((const char *)q)-2)-starts;
1572 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001573 goto utf16Error;
1574 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001575 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001576 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1577 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001578 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001579#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001580 *p++ = ch;
1581 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001582#else
1583 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001584#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001585 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001586 }
1587 else {
1588 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 startinpos = (((const char *)q)-4)-starts;
1590 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001591 goto utf16Error;
1592 }
1593
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001595 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 startinpos = (((const char *)q)-2)-starts;
1597 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001598 /* Fall through to report the error */
1599
1600 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001601 outpos = p-PyUnicode_AS_UNICODE(unicode);
1602 if (unicode_decode_call_errorhandler(
1603 errors, &errorHandler,
1604 "utf16", errmsg,
1605 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1606 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001607 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 }
1609
1610 if (byteorder)
1611 *byteorder = bo;
1612
Walter Dörwald69652032004-09-07 20:24:22 +00001613 if (consumed)
1614 *consumed = (const char *)q-starts;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001617 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 goto onError;
1619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_XDECREF(errorHandler);
1621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 return (PyObject *)unicode;
1623
1624onError:
1625 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 return NULL;
1629}
1630
Tim Peters772747b2001-08-09 22:21:55 +00001631PyObject *
1632PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1633 int size,
1634 const char *errors,
1635 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636{
1637 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001638 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001639#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001640 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001641#else
1642 const int pairs = 0;
1643#endif
Tim Peters772747b2001-08-09 22:21:55 +00001644 /* Offsets from p for storing byte pairs in the right order. */
1645#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1646 int ihi = 1, ilo = 0;
1647#else
1648 int ihi = 0, ilo = 1;
1649#endif
1650
1651#define STORECHAR(CH) \
1652 do { \
1653 p[ihi] = ((CH) >> 8) & 0xff; \
1654 p[ilo] = (CH) & 0xff; \
1655 p += 2; \
1656 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001658#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001659 for (i = pairs = 0; i < size; i++)
1660 if (s[i] >= 0x10000)
1661 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001662#endif
Tim Petersced69f82003-09-16 20:30:58 +00001663 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001664 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 if (v == NULL)
1666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667
Tim Peters772747b2001-08-09 22:21:55 +00001668 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001670 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001671 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001672 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001673
1674 if (byteorder == -1) {
1675 /* force LE */
1676 ihi = 1;
1677 ilo = 0;
1678 }
1679 else if (byteorder == 1) {
1680 /* force BE */
1681 ihi = 0;
1682 ilo = 1;
1683 }
1684
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001685 while (size-- > 0) {
1686 Py_UNICODE ch = *s++;
1687 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001688#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001689 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001690 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1691 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001693#endif
Tim Peters772747b2001-08-09 22:21:55 +00001694 STORECHAR(ch);
1695 if (ch2)
1696 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001699#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700}
1701
1702PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1703{
1704 if (!PyUnicode_Check(unicode)) {
1705 PyErr_BadArgument();
1706 return NULL;
1707 }
1708 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1709 PyUnicode_GET_SIZE(unicode),
1710 NULL,
1711 0);
1712}
1713
1714/* --- Unicode Escape Codec ----------------------------------------------- */
1715
Fredrik Lundh06d12682001-01-24 07:59:11 +00001716static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001717
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1719 int size,
1720 const char *errors)
1721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 const char *starts = s;
1723 int startinpos;
1724 int endinpos;
1725 int outpos;
1726 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001730 char* message;
1731 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001732 PyObject *errorHandler = NULL;
1733 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001734
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 /* Escaped strings will always be longer than the resulting
1736 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 length after conversion to the true value.
1738 (but if the error callback returns a long replacement string
1739 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 v = _PyUnicode_New(size);
1741 if (v == NULL)
1742 goto onError;
1743 if (size == 0)
1744 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001748
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 while (s < end) {
1750 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001751 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 /* Non-escape characters are interpreted as Unicode ordinals */
1755 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001756 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 continue;
1758 }
1759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 /* \ - Escapes */
1762 s++;
1763 switch (*s++) {
1764
1765 /* \x escapes */
1766 case '\n': break;
1767 case '\\': *p++ = '\\'; break;
1768 case '\'': *p++ = '\''; break;
1769 case '\"': *p++ = '\"'; break;
1770 case 'b': *p++ = '\b'; break;
1771 case 'f': *p++ = '\014'; break; /* FF */
1772 case 't': *p++ = '\t'; break;
1773 case 'n': *p++ = '\n'; break;
1774 case 'r': *p++ = '\r'; break;
1775 case 'v': *p++ = '\013'; break; /* VT */
1776 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1777
1778 /* \OOO (octal) escapes */
1779 case '0': case '1': case '2': case '3':
1780 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001781 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001783 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001785 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001787 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 break;
1789
Fredrik Lundhccc74732001-02-18 22:13:49 +00001790 /* hex escapes */
1791 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001793 digits = 2;
1794 message = "truncated \\xXX escape";
1795 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799 digits = 4;
1800 message = "truncated \\uXXXX escape";
1801 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001804 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805 digits = 8;
1806 message = "truncated \\UXXXXXXXX escape";
1807 hexescape:
1808 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 outpos = p-PyUnicode_AS_UNICODE(v);
1810 if (s+digits>end) {
1811 endinpos = size;
1812 if (unicode_decode_call_errorhandler(
1813 errors, &errorHandler,
1814 "unicodeescape", "end of string in escape sequence",
1815 starts, size, &startinpos, &endinpos, &exc, &s,
1816 (PyObject **)&v, &outpos, &p))
1817 goto onError;
1818 goto nextByte;
1819 }
1820 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001821 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001822 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 endinpos = (s+i+1)-starts;
1824 if (unicode_decode_call_errorhandler(
1825 errors, &errorHandler,
1826 "unicodeescape", message,
1827 starts, size, &startinpos, &endinpos, &exc, &s,
1828 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001829 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001831 }
1832 chr = (chr<<4) & ~0xF;
1833 if (c >= '0' && c <= '9')
1834 chr += c - '0';
1835 else if (c >= 'a' && c <= 'f')
1836 chr += 10 + c - 'a';
1837 else
1838 chr += 10 + c - 'A';
1839 }
1840 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001841 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 /* _decoding_error will have already written into the
1843 target buffer. */
1844 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001845 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001846 /* when we get here, chr is a 32-bit unicode character */
1847 if (chr <= 0xffff)
1848 /* UCS-2 character */
1849 *p++ = (Py_UNICODE) chr;
1850 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001851 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001852 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001853#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001854 *p++ = chr;
1855#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001856 chr -= 0x10000L;
1857 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001858 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001859#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001860 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001861 endinpos = s-starts;
1862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (unicode_decode_call_errorhandler(
1864 errors, &errorHandler,
1865 "unicodeescape", "illegal Unicode character",
1866 starts, size, &startinpos, &endinpos, &exc, &s,
1867 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001868 goto onError;
1869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001870 break;
1871
1872 /* \N{name} */
1873 case 'N':
1874 message = "malformed \\N character escape";
1875 if (ucnhash_CAPI == NULL) {
1876 /* load the unicode data module */
1877 PyObject *m, *v;
1878 m = PyImport_ImportModule("unicodedata");
1879 if (m == NULL)
1880 goto ucnhashError;
1881 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1882 Py_DECREF(m);
1883 if (v == NULL)
1884 goto ucnhashError;
1885 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1886 Py_DECREF(v);
1887 if (ucnhash_CAPI == NULL)
1888 goto ucnhashError;
1889 }
1890 if (*s == '{') {
1891 const char *start = s+1;
1892 /* look for the closing brace */
1893 while (*s != '}' && s < end)
1894 s++;
1895 if (s > start && s < end && *s == '}') {
1896 /* found a name. look it up in the unicode database */
1897 message = "unknown Unicode character name";
1898 s++;
1899 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1900 goto store;
1901 }
1902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001903 endinpos = s-starts;
1904 outpos = p-PyUnicode_AS_UNICODE(v);
1905 if (unicode_decode_call_errorhandler(
1906 errors, &errorHandler,
1907 "unicodeescape", message,
1908 starts, size, &startinpos, &endinpos, &exc, &s,
1909 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 break;
1912
1913 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001914 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 message = "\\ at end of string";
1916 s--;
1917 endinpos = s-starts;
1918 outpos = p-PyUnicode_AS_UNICODE(v);
1919 if (unicode_decode_call_errorhandler(
1920 errors, &errorHandler,
1921 "unicodeescape", message,
1922 starts, size, &startinpos, &endinpos, &exc, &s,
1923 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001924 goto onError;
1925 }
1926 else {
1927 *p++ = '\\';
1928 *p++ = (unsigned char)s[-1];
1929 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001930 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 nextByte:
1933 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001935 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001936 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001937 Py_XDECREF(errorHandler);
1938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001940
Fredrik Lundhccc74732001-02-18 22:13:49 +00001941ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001942 PyErr_SetString(
1943 PyExc_UnicodeError,
1944 "\\N escapes not supported (can't load unicodedata module)"
1945 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 Py_XDECREF(errorHandler);
1947 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001948 return NULL;
1949
Fredrik Lundhccc74732001-02-18 22:13:49 +00001950onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001952 Py_XDECREF(errorHandler);
1953 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 return NULL;
1955}
1956
1957/* Return a Unicode-Escape string version of the Unicode object.
1958
1959 If quotes is true, the string is enclosed in u"" or u'' quotes as
1960 appropriate.
1961
1962*/
1963
Barry Warsaw51ac5802000-03-20 16:36:48 +00001964static const Py_UNICODE *findchar(const Py_UNICODE *s,
1965 int size,
1966 Py_UNICODE ch);
1967
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968static
1969PyObject *unicodeescape_string(const Py_UNICODE *s,
1970 int size,
1971 int quotes)
1972{
1973 PyObject *repr;
1974 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001976 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
1978 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1979 if (repr == NULL)
1980 return NULL;
1981
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
1984 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001986 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 !findchar(s, size, '"')) ? '"' : '\'';
1988 }
1989 while (size-- > 0) {
1990 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001991
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001992 /* Escape quotes and backslashes */
1993 if ((quotes &&
1994 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 *p++ = '\\';
1996 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001997 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001998 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001999
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002000#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002001 /* Map 21-bit characters to '\U00xxxxxx' */
2002 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002003 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002004
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002005 /* Resize the string if necessary */
2006 if (offset + 12 > PyString_GET_SIZE(repr)) {
2007 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002008 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002009 p = PyString_AS_STRING(repr) + offset;
2010 }
2011
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002012 *p++ = '\\';
2013 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002014 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2015 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2016 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002021 *p++ = hexdigit[ch & 0x0000000F];
2022 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002023 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002024#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002025 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2026 else if (ch >= 0xD800 && ch < 0xDC00) {
2027 Py_UNICODE ch2;
2028 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002029
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002030 ch2 = *s++;
2031 size--;
2032 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2033 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2034 *p++ = '\\';
2035 *p++ = 'U';
2036 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2037 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2038 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2043 *p++ = hexdigit[ucs & 0x0000000F];
2044 continue;
2045 }
2046 /* Fall through: isolated surrogates are copied as-is */
2047 s--;
2048 size++;
2049 }
2050
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002052 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 *p++ = '\\';
2054 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002055 *p++ = hexdigit[(ch >> 12) & 0x000F];
2056 *p++ = hexdigit[(ch >> 8) & 0x000F];
2057 *p++ = hexdigit[(ch >> 4) & 0x000F];
2058 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002060
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002061 /* Map special whitespace to '\t', \n', '\r' */
2062 else if (ch == '\t') {
2063 *p++ = '\\';
2064 *p++ = 't';
2065 }
2066 else if (ch == '\n') {
2067 *p++ = '\\';
2068 *p++ = 'n';
2069 }
2070 else if (ch == '\r') {
2071 *p++ = '\\';
2072 *p++ = 'r';
2073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002075 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002076 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002078 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 4) & 0x000F];
2080 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002081 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002082
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 /* Copy everything else as-is */
2084 else
2085 *p++ = (char) ch;
2086 }
2087 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002088 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089
2090 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002091 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 return repr;
2093}
2094
2095PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2096 int size)
2097{
2098 return unicodeescape_string(s, size, 0);
2099}
2100
2101PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2102{
2103 if (!PyUnicode_Check(unicode)) {
2104 PyErr_BadArgument();
2105 return NULL;
2106 }
2107 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2108 PyUnicode_GET_SIZE(unicode));
2109}
2110
2111/* --- Raw Unicode Escape Codec ------------------------------------------- */
2112
2113PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2114 int size,
2115 const char *errors)
2116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 const char *starts = s;
2118 int startinpos;
2119 int endinpos;
2120 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 const char *end;
2124 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 PyObject *errorHandler = NULL;
2126 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002127
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 /* Escaped strings will always be longer than the resulting
2129 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 length after conversion to the true value. (But decoding error
2131 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 v = _PyUnicode_New(size);
2133 if (v == NULL)
2134 goto onError;
2135 if (size == 0)
2136 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 end = s + size;
2139 while (s < end) {
2140 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002141 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144
2145 /* Non-escape characters are interpreted as Unicode ordinals */
2146 if (*s != '\\') {
2147 *p++ = (unsigned char)*s++;
2148 continue;
2149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151
2152 /* \u-escapes are only interpreted iff the number of leading
2153 backslashes if odd */
2154 bs = s;
2155 for (;s < end;) {
2156 if (*s != '\\')
2157 break;
2158 *p++ = (unsigned char)*s++;
2159 }
2160 if (((s - bs) & 1) == 0 ||
2161 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002162 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 continue;
2164 }
2165 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 s++;
2168
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002169 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002170 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 endinpos = s-starts;
2175 if (unicode_decode_call_errorhandler(
2176 errors, &errorHandler,
2177 "rawunicodeescape", "truncated \\uXXXX",
2178 starts, size, &startinpos, &endinpos, &exc, &s,
2179 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 }
2183 x = (x<<4) & ~0xF;
2184 if (c >= '0' && c <= '9')
2185 x += c - '0';
2186 else if (c >= 'a' && c <= 'f')
2187 x += 10 + c - 'a';
2188 else
2189 x += 10 + c - 'A';
2190 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002191#ifndef Py_UNICODE_WIDE
2192 if (x > 0x10000) {
2193 if (unicode_decode_call_errorhandler(
2194 errors, &errorHandler,
2195 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2196 starts, size, &startinpos, &endinpos, &exc, &s,
2197 (PyObject **)&v, &outpos, &p))
2198 goto onError;
2199 }
2200#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002201 *p++ = x;
2202 nextByte:
2203 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002205 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002206 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 Py_XDECREF(errorHandler);
2208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 onError:
2212 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002213 Py_XDECREF(errorHandler);
2214 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 return NULL;
2216}
2217
2218PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2219 int size)
2220{
2221 PyObject *repr;
2222 char *p;
2223 char *q;
2224
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002225 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227#ifdef Py_UNICODE_WIDE
2228 repr = PyString_FromStringAndSize(NULL, 10 * size);
2229#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 if (repr == NULL)
2233 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002234 if (size == 0)
2235 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236
2237 p = q = PyString_AS_STRING(repr);
2238 while (size-- > 0) {
2239 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002240#ifdef Py_UNICODE_WIDE
2241 /* Map 32-bit characters to '\Uxxxxxxxx' */
2242 if (ch >= 0x10000) {
2243 *p++ = '\\';
2244 *p++ = 'U';
2245 *p++ = hexdigit[(ch >> 28) & 0xf];
2246 *p++ = hexdigit[(ch >> 24) & 0xf];
2247 *p++ = hexdigit[(ch >> 20) & 0xf];
2248 *p++ = hexdigit[(ch >> 16) & 0xf];
2249 *p++ = hexdigit[(ch >> 12) & 0xf];
2250 *p++ = hexdigit[(ch >> 8) & 0xf];
2251 *p++ = hexdigit[(ch >> 4) & 0xf];
2252 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002253 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002254 else
2255#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 /* Map 16-bit characters to '\uxxxx' */
2257 if (ch >= 256) {
2258 *p++ = '\\';
2259 *p++ = 'u';
2260 *p++ = hexdigit[(ch >> 12) & 0xf];
2261 *p++ = hexdigit[(ch >> 8) & 0xf];
2262 *p++ = hexdigit[(ch >> 4) & 0xf];
2263 *p++ = hexdigit[ch & 15];
2264 }
2265 /* Copy everything else as-is */
2266 else
2267 *p++ = (char) ch;
2268 }
2269 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002270 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 return repr;
2272}
2273
2274PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2275{
2276 if (!PyUnicode_Check(unicode)) {
2277 PyErr_BadArgument();
2278 return NULL;
2279 }
2280 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2281 PyUnicode_GET_SIZE(unicode));
2282}
2283
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002284/* --- Unicode Internal Codec ------------------------------------------- */
2285
2286PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2287 int size,
2288 const char *errors)
2289{
2290 const char *starts = s;
2291 int startinpos;
2292 int endinpos;
2293 int outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002294 PyUnicodeObject *v;
2295 Py_UNICODE *p;
2296 const char *end;
2297 const char *reason;
2298 PyObject *errorHandler = NULL;
2299 PyObject *exc = NULL;
2300
Neal Norwitzd43069c2006-01-08 01:12:10 +00002301#ifdef Py_UNICODE_WIDE
2302 Py_UNICODE unimax = PyUnicode_GetMax();
2303#endif
2304
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002305 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2306 if (v == NULL)
2307 goto onError;
2308 if (PyUnicode_GetSize((PyObject *)v) == 0)
2309 return (PyObject *)v;
2310 p = PyUnicode_AS_UNICODE(v);
2311 end = s + size;
2312
2313 while (s < end) {
2314 *p = *(Py_UNICODE *)s;
2315 /* We have to sanity check the raw data, otherwise doom looms for
2316 some malformed UCS-4 data. */
2317 if (
2318 #ifdef Py_UNICODE_WIDE
2319 *p > unimax || *p < 0 ||
2320 #endif
2321 end-s < Py_UNICODE_SIZE
2322 )
2323 {
2324 startinpos = s - starts;
2325 if (end-s < Py_UNICODE_SIZE) {
2326 endinpos = end-starts;
2327 reason = "truncated input";
2328 }
2329 else {
2330 endinpos = s - starts + Py_UNICODE_SIZE;
2331 reason = "illegal code point (> 0x10FFFF)";
2332 }
2333 outpos = p - PyUnicode_AS_UNICODE(v);
2334 if (unicode_decode_call_errorhandler(
2335 errors, &errorHandler,
2336 "unicode_internal", reason,
2337 starts, size, &startinpos, &endinpos, &exc, &s,
2338 (PyObject **)&v, &outpos, &p)) {
2339 goto onError;
2340 }
2341 }
2342 else {
2343 p++;
2344 s += Py_UNICODE_SIZE;
2345 }
2346 }
2347
2348 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2349 goto onError;
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return (PyObject *)v;
2353
2354 onError:
2355 Py_XDECREF(v);
2356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
2358 return NULL;
2359}
2360
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361/* --- Latin-1 Codec ------------------------------------------------------ */
2362
2363PyObject *PyUnicode_DecodeLatin1(const char *s,
2364 int size,
2365 const char *errors)
2366{
2367 PyUnicodeObject *v;
2368 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002369
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002371 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002372 Py_UNICODE r = *(unsigned char*)s;
2373 return PyUnicode_FromUnicode(&r, 1);
2374 }
2375
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 v = _PyUnicode_New(size);
2377 if (v == NULL)
2378 goto onError;
2379 if (size == 0)
2380 return (PyObject *)v;
2381 p = PyUnicode_AS_UNICODE(v);
2382 while (size-- > 0)
2383 *p++ = (unsigned char)*s++;
2384 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002385
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386 onError:
2387 Py_XDECREF(v);
2388 return NULL;
2389}
2390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002391/* create or adjust a UnicodeEncodeError */
2392static void make_encode_exception(PyObject **exceptionObject,
2393 const char *encoding,
2394 const Py_UNICODE *unicode, int size,
2395 int startpos, int endpos,
2396 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 if (*exceptionObject == NULL) {
2399 *exceptionObject = PyUnicodeEncodeError_Create(
2400 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401 }
2402 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2404 goto onError;
2405 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2406 goto onError;
2407 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2408 goto onError;
2409 return;
2410 onError:
2411 Py_DECREF(*exceptionObject);
2412 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 }
2414}
2415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002416/* raises a UnicodeEncodeError */
2417static void raise_encode_exception(PyObject **exceptionObject,
2418 const char *encoding,
2419 const Py_UNICODE *unicode, int size,
2420 int startpos, int endpos,
2421 const char *reason)
2422{
2423 make_encode_exception(exceptionObject,
2424 encoding, unicode, size, startpos, endpos, reason);
2425 if (*exceptionObject != NULL)
2426 PyCodec_StrictErrors(*exceptionObject);
2427}
2428
2429/* error handling callback helper:
2430 build arguments, call the callback and check the arguments,
2431 put the result into newpos and return the replacement string, which
2432 has to be freed by the caller */
2433static PyObject *unicode_encode_call_errorhandler(const char *errors,
2434 PyObject **errorHandler,
2435 const char *encoding, const char *reason,
2436 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2437 int startpos, int endpos,
2438 int *newpos)
2439{
2440 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2441
2442 PyObject *restuple;
2443 PyObject *resunicode;
2444
2445 if (*errorHandler == NULL) {
2446 *errorHandler = PyCodec_LookupError(errors);
2447 if (*errorHandler == NULL)
2448 return NULL;
2449 }
2450
2451 make_encode_exception(exceptionObject,
2452 encoding, unicode, size, startpos, endpos, reason);
2453 if (*exceptionObject == NULL)
2454 return NULL;
2455
2456 restuple = PyObject_CallFunctionObjArgs(
2457 *errorHandler, *exceptionObject, NULL);
2458 if (restuple == NULL)
2459 return NULL;
2460 if (!PyTuple_Check(restuple)) {
2461 PyErr_Format(PyExc_TypeError, &argparse[4]);
2462 Py_DECREF(restuple);
2463 return NULL;
2464 }
2465 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2466 &resunicode, newpos)) {
2467 Py_DECREF(restuple);
2468 return NULL;
2469 }
2470 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002471 *newpos = size+*newpos;
2472 if (*newpos<0 || *newpos>size) {
2473 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2474 Py_DECREF(restuple);
2475 return NULL;
2476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 Py_INCREF(resunicode);
2478 Py_DECREF(restuple);
2479 return resunicode;
2480}
2481
2482static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2483 int size,
2484 const char *errors,
2485 int limit)
2486{
2487 /* output object */
2488 PyObject *res;
2489 /* pointers to the beginning and end+1 of input */
2490 const Py_UNICODE *startp = p;
2491 const Py_UNICODE *endp = p + size;
2492 /* pointer to the beginning of the unencodable characters */
2493 /* const Py_UNICODE *badp = NULL; */
2494 /* pointer into the output */
2495 char *str;
2496 /* current output position */
2497 int respos = 0;
2498 int ressize;
2499 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2500 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2501 PyObject *errorHandler = NULL;
2502 PyObject *exc = NULL;
2503 /* the following variable is used for caching string comparisons
2504 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2505 int known_errorHandler = -1;
2506
2507 /* allocate enough for a simple encoding without
2508 replacements, if we need more, we'll resize */
2509 res = PyString_FromStringAndSize(NULL, size);
2510 if (res == NULL)
2511 goto onError;
2512 if (size == 0)
2513 return res;
2514 str = PyString_AS_STRING(res);
2515 ressize = size;
2516
2517 while (p<endp) {
2518 Py_UNICODE c = *p;
2519
2520 /* can we encode this? */
2521 if (c<limit) {
2522 /* no overflow check, because we know that the space is enough */
2523 *str++ = (char)c;
2524 ++p;
2525 }
2526 else {
2527 int unicodepos = p-startp;
2528 int requiredsize;
2529 PyObject *repunicode;
2530 int repsize;
2531 int newpos;
2532 int respos;
2533 Py_UNICODE *uni2;
2534 /* startpos for collecting unencodable chars */
2535 const Py_UNICODE *collstart = p;
2536 const Py_UNICODE *collend = p;
2537 /* find all unecodable characters */
2538 while ((collend < endp) && ((*collend)>=limit))
2539 ++collend;
2540 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2541 if (known_errorHandler==-1) {
2542 if ((errors==NULL) || (!strcmp(errors, "strict")))
2543 known_errorHandler = 1;
2544 else if (!strcmp(errors, "replace"))
2545 known_errorHandler = 2;
2546 else if (!strcmp(errors, "ignore"))
2547 known_errorHandler = 3;
2548 else if (!strcmp(errors, "xmlcharrefreplace"))
2549 known_errorHandler = 4;
2550 else
2551 known_errorHandler = 0;
2552 }
2553 switch (known_errorHandler) {
2554 case 1: /* strict */
2555 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2556 goto onError;
2557 case 2: /* replace */
2558 while (collstart++<collend)
2559 *str++ = '?'; /* fall through */
2560 case 3: /* ignore */
2561 p = collend;
2562 break;
2563 case 4: /* xmlcharrefreplace */
2564 respos = str-PyString_AS_STRING(res);
2565 /* determine replacement size (temporarily (mis)uses p) */
2566 for (p = collstart, repsize = 0; p < collend; ++p) {
2567 if (*p<10)
2568 repsize += 2+1+1;
2569 else if (*p<100)
2570 repsize += 2+2+1;
2571 else if (*p<1000)
2572 repsize += 2+3+1;
2573 else if (*p<10000)
2574 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002575#ifndef Py_UNICODE_WIDE
2576 else
2577 repsize += 2+5+1;
2578#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 else if (*p<100000)
2580 repsize += 2+5+1;
2581 else if (*p<1000000)
2582 repsize += 2+6+1;
2583 else
2584 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002585#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002586 }
2587 requiredsize = respos+repsize+(endp-collend);
2588 if (requiredsize > ressize) {
2589 if (requiredsize<2*ressize)
2590 requiredsize = 2*ressize;
2591 if (_PyString_Resize(&res, requiredsize))
2592 goto onError;
2593 str = PyString_AS_STRING(res) + respos;
2594 ressize = requiredsize;
2595 }
2596 /* generate replacement (temporarily (mis)uses p) */
2597 for (p = collstart; p < collend; ++p) {
2598 str += sprintf(str, "&#%d;", (int)*p);
2599 }
2600 p = collend;
2601 break;
2602 default:
2603 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2604 encoding, reason, startp, size, &exc,
2605 collstart-startp, collend-startp, &newpos);
2606 if (repunicode == NULL)
2607 goto onError;
2608 /* need more space? (at least enough for what we
2609 have+the replacement+the rest of the string, so
2610 we won't have to check space for encodable characters) */
2611 respos = str-PyString_AS_STRING(res);
2612 repsize = PyUnicode_GET_SIZE(repunicode);
2613 requiredsize = respos+repsize+(endp-collend);
2614 if (requiredsize > ressize) {
2615 if (requiredsize<2*ressize)
2616 requiredsize = 2*ressize;
2617 if (_PyString_Resize(&res, requiredsize)) {
2618 Py_DECREF(repunicode);
2619 goto onError;
2620 }
2621 str = PyString_AS_STRING(res) + respos;
2622 ressize = requiredsize;
2623 }
2624 /* check if there is anything unencodable in the replacement
2625 and copy it to the output */
2626 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2627 c = *uni2;
2628 if (c >= limit) {
2629 raise_encode_exception(&exc, encoding, startp, size,
2630 unicodepos, unicodepos+1, reason);
2631 Py_DECREF(repunicode);
2632 goto onError;
2633 }
2634 *str = (char)c;
2635 }
2636 p = startp + newpos;
2637 Py_DECREF(repunicode);
2638 }
2639 }
2640 }
2641 /* Resize if we allocated to much */
2642 respos = str-PyString_AS_STRING(res);
2643 if (respos<ressize)
2644 /* If this falls res will be NULL */
2645 _PyString_Resize(&res, respos);
2646 Py_XDECREF(errorHandler);
2647 Py_XDECREF(exc);
2648 return res;
2649
2650 onError:
2651 Py_XDECREF(res);
2652 Py_XDECREF(errorHandler);
2653 Py_XDECREF(exc);
2654 return NULL;
2655}
2656
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2658 int size,
2659 const char *errors)
2660{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662}
2663
2664PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2665{
2666 if (!PyUnicode_Check(unicode)) {
2667 PyErr_BadArgument();
2668 return NULL;
2669 }
2670 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2671 PyUnicode_GET_SIZE(unicode),
2672 NULL);
2673}
2674
2675/* --- 7-bit ASCII Codec -------------------------------------------------- */
2676
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677PyObject *PyUnicode_DecodeASCII(const char *s,
2678 int size,
2679 const char *errors)
2680{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002681 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 PyUnicodeObject *v;
2683 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002684 int startinpos;
2685 int endinpos;
2686 int outpos;
2687 const char *e;
2688 PyObject *errorHandler = NULL;
2689 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002690
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002692 if (size == 1 && *(unsigned char*)s < 128) {
2693 Py_UNICODE r = *(unsigned char*)s;
2694 return PyUnicode_FromUnicode(&r, 1);
2695 }
Tim Petersced69f82003-09-16 20:30:58 +00002696
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 v = _PyUnicode_New(size);
2698 if (v == NULL)
2699 goto onError;
2700 if (size == 0)
2701 return (PyObject *)v;
2702 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002703 e = s + size;
2704 while (s < e) {
2705 register unsigned char c = (unsigned char)*s;
2706 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 ++s;
2709 }
2710 else {
2711 startinpos = s-starts;
2712 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002713 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002714 if (unicode_decode_call_errorhandler(
2715 errors, &errorHandler,
2716 "ascii", "ordinal not in range(128)",
2717 starts, size, &startinpos, &endinpos, &exc, &s,
2718 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002722 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002723 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002724 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 Py_XDECREF(errorHandler);
2726 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002728
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 onError:
2730 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 Py_XDECREF(errorHandler);
2732 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 return NULL;
2734}
2735
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2737 int size,
2738 const char *errors)
2739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741}
2742
2743PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2744{
2745 if (!PyUnicode_Check(unicode)) {
2746 PyErr_BadArgument();
2747 return NULL;
2748 }
2749 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2750 PyUnicode_GET_SIZE(unicode),
2751 NULL);
2752}
2753
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002754#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002755
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002756/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002757
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002758PyObject *PyUnicode_DecodeMBCS(const char *s,
2759 int size,
2760 const char *errors)
2761{
2762 PyUnicodeObject *v;
2763 Py_UNICODE *p;
2764
2765 /* First get the size of the result */
2766 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002767 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002768 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2769
2770 v = _PyUnicode_New(usize);
2771 if (v == NULL)
2772 return NULL;
2773 if (usize == 0)
2774 return (PyObject *)v;
2775 p = PyUnicode_AS_UNICODE(v);
2776 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2777 Py_DECREF(v);
2778 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2779 }
2780
2781 return (PyObject *)v;
2782}
2783
2784PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2785 int size,
2786 const char *errors)
2787{
2788 PyObject *repr;
2789 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002790 DWORD mbcssize;
2791
2792 /* If there are no characters, bail now! */
2793 if (size==0)
2794 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002795
2796 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002797 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002798 if (mbcssize==0)
2799 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2800
2801 repr = PyString_FromStringAndSize(NULL, mbcssize);
2802 if (repr == NULL)
2803 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002804 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002805 return repr;
2806
2807 /* Do the conversion */
2808 s = PyString_AS_STRING(repr);
2809 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2810 Py_DECREF(repr);
2811 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2812 }
2813 return repr;
2814}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002815
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002816PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2817{
2818 if (!PyUnicode_Check(unicode)) {
2819 PyErr_BadArgument();
2820 return NULL;
2821 }
2822 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2823 PyUnicode_GET_SIZE(unicode),
2824 NULL);
2825}
2826
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002827#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002828
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829/* --- Character Mapping Codec -------------------------------------------- */
2830
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831PyObject *PyUnicode_DecodeCharmap(const char *s,
2832 int size,
2833 PyObject *mapping,
2834 const char *errors)
2835{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 const char *starts = s;
2837 int startinpos;
2838 int endinpos;
2839 int outpos;
2840 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 PyUnicodeObject *v;
2842 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002843 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 PyObject *errorHandler = NULL;
2845 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002846 Py_UNICODE *mapstring = NULL;
2847 int maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002848
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 /* Default to Latin-1 */
2850 if (mapping == NULL)
2851 return PyUnicode_DecodeLatin1(s, size, errors);
2852
2853 v = _PyUnicode_New(size);
2854 if (v == NULL)
2855 goto onError;
2856 if (size == 0)
2857 return (PyObject *)v;
2858 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002860 if (PyUnicode_CheckExact(mapping)) {
2861 mapstring = PyUnicode_AS_UNICODE(mapping);
2862 maplen = PyUnicode_GET_SIZE(mapping);
2863 while (s < e) {
2864 unsigned char ch = *s;
2865 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002867 if (ch < maplen)
2868 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002870 if (x == 0xfffe) {
2871 /* undefined mapping */
2872 outpos = p-PyUnicode_AS_UNICODE(v);
2873 startinpos = s-starts;
2874 endinpos = startinpos+1;
2875 if (unicode_decode_call_errorhandler(
2876 errors, &errorHandler,
2877 "charmap", "character maps to <undefined>",
2878 starts, size, &startinpos, &endinpos, &exc, &s,
2879 (PyObject **)&v, &outpos, &p)) {
2880 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002881 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002882 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002883 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002884 *p++ = x;
2885 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002887 }
2888 else {
2889 while (s < e) {
2890 unsigned char ch = *s;
2891 PyObject *w, *x;
2892
2893 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2894 w = PyInt_FromLong((long)ch);
2895 if (w == NULL)
2896 goto onError;
2897 x = PyObject_GetItem(mapping, w);
2898 Py_DECREF(w);
2899 if (x == NULL) {
2900 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2901 /* No mapping found means: mapping is undefined. */
2902 PyErr_Clear();
2903 x = Py_None;
2904 Py_INCREF(x);
2905 } else
2906 goto onError;
2907 }
2908
2909 /* Apply mapping */
2910 if (PyInt_Check(x)) {
2911 long value = PyInt_AS_LONG(x);
2912 if (value < 0 || value > 65535) {
2913 PyErr_SetString(PyExc_TypeError,
2914 "character mapping must be in range(65536)");
2915 Py_DECREF(x);
2916 goto onError;
2917 }
2918 *p++ = (Py_UNICODE)value;
2919 }
2920 else if (x == Py_None) {
2921 /* undefined mapping */
2922 outpos = p-PyUnicode_AS_UNICODE(v);
2923 startinpos = s-starts;
2924 endinpos = startinpos+1;
2925 if (unicode_decode_call_errorhandler(
2926 errors, &errorHandler,
2927 "charmap", "character maps to <undefined>",
2928 starts, size, &startinpos, &endinpos, &exc, &s,
2929 (PyObject **)&v, &outpos, &p)) {
2930 Py_DECREF(x);
2931 goto onError;
2932 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002933 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002934 continue;
2935 }
2936 else if (PyUnicode_Check(x)) {
2937 int targetsize = PyUnicode_GET_SIZE(x);
2938
2939 if (targetsize == 1)
2940 /* 1-1 mapping */
2941 *p++ = *PyUnicode_AS_UNICODE(x);
2942
2943 else if (targetsize > 1) {
2944 /* 1-n mapping */
2945 if (targetsize > extrachars) {
2946 /* resize first */
2947 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2948 int needed = (targetsize - extrachars) + \
2949 (targetsize << 2);
2950 extrachars += needed;
2951 if (_PyUnicode_Resize(&v,
2952 PyUnicode_GET_SIZE(v) + needed) < 0) {
2953 Py_DECREF(x);
2954 goto onError;
2955 }
2956 p = PyUnicode_AS_UNICODE(v) + oldpos;
2957 }
2958 Py_UNICODE_COPY(p,
2959 PyUnicode_AS_UNICODE(x),
2960 targetsize);
2961 p += targetsize;
2962 extrachars -= targetsize;
2963 }
2964 /* 1-0 mapping: skip the character */
2965 }
2966 else {
2967 /* wrong return value */
2968 PyErr_SetString(PyExc_TypeError,
2969 "character mapping must return integer, None or unicode");
2970 Py_DECREF(x);
2971 goto onError;
2972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002974 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 }
2977 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002978 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 Py_XDECREF(errorHandler);
2981 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002983
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 Py_XDECREF(errorHandler);
2986 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 Py_XDECREF(v);
2988 return NULL;
2989}
2990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002991/* Lookup the character ch in the mapping. If the character
2992 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00002993 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 PyObject *w = PyInt_FromLong((long)c);
2997 PyObject *x;
2998
2999 if (w == NULL)
3000 return NULL;
3001 x = PyObject_GetItem(mapping, w);
3002 Py_DECREF(w);
3003 if (x == NULL) {
3004 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3005 /* No mapping found means: mapping is undefined. */
3006 PyErr_Clear();
3007 x = Py_None;
3008 Py_INCREF(x);
3009 return x;
3010 } else
3011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003013 else if (x == Py_None)
3014 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015 else if (PyInt_Check(x)) {
3016 long value = PyInt_AS_LONG(x);
3017 if (value < 0 || value > 255) {
3018 PyErr_SetString(PyExc_TypeError,
3019 "character mapping must be in range(256)");
3020 Py_DECREF(x);
3021 return NULL;
3022 }
3023 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 else if (PyString_Check(x))
3026 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003028 /* wrong return value */
3029 PyErr_SetString(PyExc_TypeError,
3030 "character mapping must return integer, None or str");
3031 Py_DECREF(x);
3032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 }
3034}
3035
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036/* lookup the character, put the result in the output string and adjust
3037 various state variables. Reallocate the output string if not enough
3038 space is available. Return a new reference to the object that
3039 was put in the output buffer, or Py_None, if the mapping was undefined
3040 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003041 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042static
3043PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
3044 PyObject **outobj, int *outpos)
3045{
3046 PyObject *rep = charmapencode_lookup(c, mapping);
3047
3048 if (rep==NULL)
3049 return NULL;
3050 else if (rep==Py_None)
3051 return rep;
3052 else {
3053 char *outstart = PyString_AS_STRING(*outobj);
3054 int outsize = PyString_GET_SIZE(*outobj);
3055 if (PyInt_Check(rep)) {
3056 int requiredsize = *outpos+1;
3057 if (outsize<requiredsize) {
3058 /* exponentially overallocate to minimize reallocations */
3059 if (requiredsize < 2*outsize)
3060 requiredsize = 2*outsize;
3061 if (_PyString_Resize(outobj, requiredsize)) {
3062 Py_DECREF(rep);
3063 return NULL;
3064 }
3065 outstart = PyString_AS_STRING(*outobj);
3066 }
3067 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3068 }
3069 else {
3070 const char *repchars = PyString_AS_STRING(rep);
3071 int repsize = PyString_GET_SIZE(rep);
3072 int requiredsize = *outpos+repsize;
3073 if (outsize<requiredsize) {
3074 /* exponentially overallocate to minimize reallocations */
3075 if (requiredsize < 2*outsize)
3076 requiredsize = 2*outsize;
3077 if (_PyString_Resize(outobj, requiredsize)) {
3078 Py_DECREF(rep);
3079 return NULL;
3080 }
3081 outstart = PyString_AS_STRING(*outobj);
3082 }
3083 memcpy(outstart + *outpos, repchars, repsize);
3084 *outpos += repsize;
3085 }
3086 }
3087 return rep;
3088}
3089
3090/* handle an error in PyUnicode_EncodeCharmap
3091 Return 0 on success, -1 on error */
3092static
3093int charmap_encoding_error(
3094 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
3095 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003096 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 PyObject **res, int *respos)
3098{
3099 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3100 int repsize;
3101 int newpos;
3102 Py_UNICODE *uni2;
3103 /* startpos for collecting unencodable chars */
3104 int collstartpos = *inpos;
3105 int collendpos = *inpos+1;
3106 int collpos;
3107 char *encoding = "charmap";
3108 char *reason = "character maps to <undefined>";
3109
3110 PyObject *x;
3111 /* find all unencodable characters */
3112 while (collendpos < size) {
3113 x = charmapencode_lookup(p[collendpos], mapping);
3114 if (x==NULL)
3115 return -1;
3116 else if (x!=Py_None) {
3117 Py_DECREF(x);
3118 break;
3119 }
3120 Py_DECREF(x);
3121 ++collendpos;
3122 }
3123 /* cache callback name lookup
3124 * (if not done yet, i.e. it's the first error) */
3125 if (*known_errorHandler==-1) {
3126 if ((errors==NULL) || (!strcmp(errors, "strict")))
3127 *known_errorHandler = 1;
3128 else if (!strcmp(errors, "replace"))
3129 *known_errorHandler = 2;
3130 else if (!strcmp(errors, "ignore"))
3131 *known_errorHandler = 3;
3132 else if (!strcmp(errors, "xmlcharrefreplace"))
3133 *known_errorHandler = 4;
3134 else
3135 *known_errorHandler = 0;
3136 }
3137 switch (*known_errorHandler) {
3138 case 1: /* strict */
3139 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3140 return -1;
3141 case 2: /* replace */
3142 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3143 x = charmapencode_output('?', mapping, res, respos);
3144 if (x==NULL) {
3145 return -1;
3146 }
3147 else if (x==Py_None) {
3148 Py_DECREF(x);
3149 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3150 return -1;
3151 }
3152 Py_DECREF(x);
3153 }
3154 /* fall through */
3155 case 3: /* ignore */
3156 *inpos = collendpos;
3157 break;
3158 case 4: /* xmlcharrefreplace */
3159 /* generate replacement (temporarily (mis)uses p) */
3160 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3161 char buffer[2+29+1+1];
3162 char *cp;
3163 sprintf(buffer, "&#%d;", (int)p[collpos]);
3164 for (cp = buffer; *cp; ++cp) {
3165 x = charmapencode_output(*cp, mapping, res, respos);
3166 if (x==NULL)
3167 return -1;
3168 else if (x==Py_None) {
3169 Py_DECREF(x);
3170 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3171 return -1;
3172 }
3173 Py_DECREF(x);
3174 }
3175 }
3176 *inpos = collendpos;
3177 break;
3178 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003179 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003180 encoding, reason, p, size, exceptionObject,
3181 collstartpos, collendpos, &newpos);
3182 if (repunicode == NULL)
3183 return -1;
3184 /* generate replacement */
3185 repsize = PyUnicode_GET_SIZE(repunicode);
3186 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3187 x = charmapencode_output(*uni2, mapping, res, respos);
3188 if (x==NULL) {
3189 Py_DECREF(repunicode);
3190 return -1;
3191 }
3192 else if (x==Py_None) {
3193 Py_DECREF(repunicode);
3194 Py_DECREF(x);
3195 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3196 return -1;
3197 }
3198 Py_DECREF(x);
3199 }
3200 *inpos = newpos;
3201 Py_DECREF(repunicode);
3202 }
3203 return 0;
3204}
3205
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3207 int size,
3208 PyObject *mapping,
3209 const char *errors)
3210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 /* output object */
3212 PyObject *res = NULL;
3213 /* current input position */
3214 int inpos = 0;
3215 /* current output position */
3216 int respos = 0;
3217 PyObject *errorHandler = NULL;
3218 PyObject *exc = NULL;
3219 /* the following variable is used for caching string comparisons
3220 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3221 * 3=ignore, 4=xmlcharrefreplace */
3222 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223
3224 /* Default to Latin-1 */
3225 if (mapping == NULL)
3226 return PyUnicode_EncodeLatin1(p, size, errors);
3227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 /* allocate enough for a simple encoding without
3229 replacements, if we need more, we'll resize */
3230 res = PyString_FromStringAndSize(NULL, size);
3231 if (res == NULL)
3232 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003233 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 while (inpos<size) {
3237 /* try to encode it */
3238 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3239 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 if (x==Py_None) { /* unencodable character */
3242 if (charmap_encoding_error(p, size, &inpos, mapping,
3243 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003244 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003245 &res, &respos)) {
3246 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003247 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 else
3251 /* done with this character => adjust input position */
3252 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 Py_DECREF(x);
3254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 /* Resize if we allocated to much */
3257 if (respos<PyString_GET_SIZE(res)) {
3258 if (_PyString_Resize(&res, respos))
3259 goto onError;
3260 }
3261 Py_XDECREF(exc);
3262 Py_XDECREF(errorHandler);
3263 return res;
3264
3265 onError:
3266 Py_XDECREF(res);
3267 Py_XDECREF(exc);
3268 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 return NULL;
3270}
3271
3272PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3273 PyObject *mapping)
3274{
3275 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3276 PyErr_BadArgument();
3277 return NULL;
3278 }
3279 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3280 PyUnicode_GET_SIZE(unicode),
3281 mapping,
3282 NULL);
3283}
3284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285/* create or adjust a UnicodeTranslateError */
3286static void make_translate_exception(PyObject **exceptionObject,
3287 const Py_UNICODE *unicode, int size,
3288 int startpos, int endpos,
3289 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 if (*exceptionObject == NULL) {
3292 *exceptionObject = PyUnicodeTranslateError_Create(
3293 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 }
3295 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3297 goto onError;
3298 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3299 goto onError;
3300 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3301 goto onError;
3302 return;
3303 onError:
3304 Py_DECREF(*exceptionObject);
3305 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 }
3307}
3308
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309/* raises a UnicodeTranslateError */
3310static void raise_translate_exception(PyObject **exceptionObject,
3311 const Py_UNICODE *unicode, int size,
3312 int startpos, int endpos,
3313 const char *reason)
3314{
3315 make_translate_exception(exceptionObject,
3316 unicode, size, startpos, endpos, reason);
3317 if (*exceptionObject != NULL)
3318 PyCodec_StrictErrors(*exceptionObject);
3319}
3320
3321/* error handling callback helper:
3322 build arguments, call the callback and check the arguments,
3323 put the result into newpos and return the replacement string, which
3324 has to be freed by the caller */
3325static PyObject *unicode_translate_call_errorhandler(const char *errors,
3326 PyObject **errorHandler,
3327 const char *reason,
3328 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3329 int startpos, int endpos,
3330 int *newpos)
3331{
3332 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3333
3334 PyObject *restuple;
3335 PyObject *resunicode;
3336
3337 if (*errorHandler == NULL) {
3338 *errorHandler = PyCodec_LookupError(errors);
3339 if (*errorHandler == NULL)
3340 return NULL;
3341 }
3342
3343 make_translate_exception(exceptionObject,
3344 unicode, size, startpos, endpos, reason);
3345 if (*exceptionObject == NULL)
3346 return NULL;
3347
3348 restuple = PyObject_CallFunctionObjArgs(
3349 *errorHandler, *exceptionObject, NULL);
3350 if (restuple == NULL)
3351 return NULL;
3352 if (!PyTuple_Check(restuple)) {
3353 PyErr_Format(PyExc_TypeError, &argparse[4]);
3354 Py_DECREF(restuple);
3355 return NULL;
3356 }
3357 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3358 &resunicode, newpos)) {
3359 Py_DECREF(restuple);
3360 return NULL;
3361 }
3362 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003363 *newpos = size+*newpos;
3364 if (*newpos<0 || *newpos>size) {
3365 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3366 Py_DECREF(restuple);
3367 return NULL;
3368 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 Py_INCREF(resunicode);
3370 Py_DECREF(restuple);
3371 return resunicode;
3372}
3373
3374/* Lookup the character ch in the mapping and put the result in result,
3375 which must be decrefed by the caller.
3376 Return 0 on success, -1 on error */
3377static
3378int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3379{
3380 PyObject *w = PyInt_FromLong((long)c);
3381 PyObject *x;
3382
3383 if (w == NULL)
3384 return -1;
3385 x = PyObject_GetItem(mapping, w);
3386 Py_DECREF(w);
3387 if (x == NULL) {
3388 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3389 /* No mapping found means: use 1:1 mapping. */
3390 PyErr_Clear();
3391 *result = NULL;
3392 return 0;
3393 } else
3394 return -1;
3395 }
3396 else if (x == Py_None) {
3397 *result = x;
3398 return 0;
3399 }
3400 else if (PyInt_Check(x)) {
3401 long value = PyInt_AS_LONG(x);
3402 long max = PyUnicode_GetMax();
3403 if (value < 0 || value > max) {
3404 PyErr_Format(PyExc_TypeError,
3405 "character mapping must be in range(0x%lx)", max+1);
3406 Py_DECREF(x);
3407 return -1;
3408 }
3409 *result = x;
3410 return 0;
3411 }
3412 else if (PyUnicode_Check(x)) {
3413 *result = x;
3414 return 0;
3415 }
3416 else {
3417 /* wrong return value */
3418 PyErr_SetString(PyExc_TypeError,
3419 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003420 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 return -1;
3422 }
3423}
3424/* ensure that *outobj is at least requiredsize characters long,
3425if not reallocate and adjust various state variables.
3426Return 0 on success, -1 on error */
3427static
Walter Dörwald4894c302003-10-24 14:25:28 +00003428int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 int requiredsize)
3430{
Walter Dörwald4894c302003-10-24 14:25:28 +00003431 int oldsize = PyUnicode_GET_SIZE(*outobj);
3432 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433 /* remember old output position */
3434 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3435 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003436 if (requiredsize < 2 * oldsize)
3437 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003438 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 return -1;
3440 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003441 }
3442 return 0;
3443}
3444/* lookup the character, put the result in the output string and adjust
3445 various state variables. Return a new reference to the object that
3446 was put in the output buffer in *result, or Py_None, if the mapping was
3447 undefined (in which case no character was written).
3448 The called must decref result.
3449 Return 0 on success, -1 on error. */
3450static
Walter Dörwald4894c302003-10-24 14:25:28 +00003451int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3452 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3453 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454{
Walter Dörwald4894c302003-10-24 14:25:28 +00003455 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 return -1;
3457 if (*res==NULL) {
3458 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003459 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 }
3461 else if (*res==Py_None)
3462 ;
3463 else if (PyInt_Check(*res)) {
3464 /* no overflow check, because we know that the space is enough */
3465 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3466 }
3467 else if (PyUnicode_Check(*res)) {
3468 int repsize = PyUnicode_GET_SIZE(*res);
3469 if (repsize==1) {
3470 /* no overflow check, because we know that the space is enough */
3471 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3472 }
3473 else if (repsize!=0) {
3474 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003475 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003476 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003477 repsize - 1;
3478 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 return -1;
3480 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3481 *outp += repsize;
3482 }
3483 }
3484 else
3485 return -1;
3486 return 0;
3487}
3488
3489PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 int size,
3491 PyObject *mapping,
3492 const char *errors)
3493{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 /* output object */
3495 PyObject *res = NULL;
3496 /* pointers to the beginning and end+1 of input */
3497 const Py_UNICODE *startp = p;
3498 const Py_UNICODE *endp = p + size;
3499 /* pointer into the output */
3500 Py_UNICODE *str;
3501 /* current output position */
3502 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 char *reason = "character maps to <undefined>";
3504 PyObject *errorHandler = NULL;
3505 PyObject *exc = NULL;
3506 /* the following variable is used for caching string comparisons
3507 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3508 * 3=ignore, 4=xmlcharrefreplace */
3509 int known_errorHandler = -1;
3510
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 if (mapping == NULL) {
3512 PyErr_BadArgument();
3513 return NULL;
3514 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515
3516 /* allocate enough for a simple 1:1 translation without
3517 replacements, if we need more, we'll resize */
3518 res = PyUnicode_FromUnicode(NULL, size);
3519 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003520 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 return res;
3523 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 while (p<endp) {
3526 /* try to encode it */
3527 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003528 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 goto onError;
3531 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003532 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 if (x!=Py_None) /* it worked => adjust input pointer */
3534 ++p;
3535 else { /* untranslatable character */
3536 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3537 int repsize;
3538 int newpos;
3539 Py_UNICODE *uni2;
3540 /* startpos for collecting untranslatable chars */
3541 const Py_UNICODE *collstart = p;
3542 const Py_UNICODE *collend = p+1;
3543 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 /* find all untranslatable characters */
3546 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003547 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 goto onError;
3549 Py_XDECREF(x);
3550 if (x!=Py_None)
3551 break;
3552 ++collend;
3553 }
3554 /* cache callback name lookup
3555 * (if not done yet, i.e. it's the first error) */
3556 if (known_errorHandler==-1) {
3557 if ((errors==NULL) || (!strcmp(errors, "strict")))
3558 known_errorHandler = 1;
3559 else if (!strcmp(errors, "replace"))
3560 known_errorHandler = 2;
3561 else if (!strcmp(errors, "ignore"))
3562 known_errorHandler = 3;
3563 else if (!strcmp(errors, "xmlcharrefreplace"))
3564 known_errorHandler = 4;
3565 else
3566 known_errorHandler = 0;
3567 }
3568 switch (known_errorHandler) {
3569 case 1: /* strict */
3570 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3571 goto onError;
3572 case 2: /* replace */
3573 /* No need to check for space, this is a 1:1 replacement */
3574 for (coll = collstart; coll<collend; ++coll)
3575 *str++ = '?';
3576 /* fall through */
3577 case 3: /* ignore */
3578 p = collend;
3579 break;
3580 case 4: /* xmlcharrefreplace */
3581 /* generate replacement (temporarily (mis)uses p) */
3582 for (p = collstart; p < collend; ++p) {
3583 char buffer[2+29+1+1];
3584 char *cp;
3585 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003586 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3588 goto onError;
3589 for (cp = buffer; *cp; ++cp)
3590 *str++ = *cp;
3591 }
3592 p = collend;
3593 break;
3594 default:
3595 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3596 reason, startp, size, &exc,
3597 collstart-startp, collend-startp, &newpos);
3598 if (repunicode == NULL)
3599 goto onError;
3600 /* generate replacement */
3601 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003602 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3604 Py_DECREF(repunicode);
3605 goto onError;
3606 }
3607 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3608 *str++ = *uni2;
3609 p = startp + newpos;
3610 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 }
3612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 /* Resize if we allocated to much */
3615 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003616 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003617 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003618 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 }
3620 Py_XDECREF(exc);
3621 Py_XDECREF(errorHandler);
3622 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 onError:
3625 Py_XDECREF(res);
3626 Py_XDECREF(exc);
3627 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 return NULL;
3629}
3630
3631PyObject *PyUnicode_Translate(PyObject *str,
3632 PyObject *mapping,
3633 const char *errors)
3634{
3635 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003636
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 str = PyUnicode_FromObject(str);
3638 if (str == NULL)
3639 goto onError;
3640 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3641 PyUnicode_GET_SIZE(str),
3642 mapping,
3643 errors);
3644 Py_DECREF(str);
3645 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003646
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 onError:
3648 Py_XDECREF(str);
3649 return NULL;
3650}
Tim Petersced69f82003-09-16 20:30:58 +00003651
Guido van Rossum9e896b32000-04-05 20:11:21 +00003652/* --- Decimal Encoder ---------------------------------------------------- */
3653
3654int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3655 int length,
3656 char *output,
3657 const char *errors)
3658{
3659 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 PyObject *errorHandler = NULL;
3661 PyObject *exc = NULL;
3662 const char *encoding = "decimal";
3663 const char *reason = "invalid decimal Unicode string";
3664 /* the following variable is used for caching string comparisons
3665 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3666 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003667
3668 if (output == NULL) {
3669 PyErr_BadArgument();
3670 return -1;
3671 }
3672
3673 p = s;
3674 end = s + length;
3675 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003677 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 PyObject *repunicode;
3679 int repsize;
3680 int newpos;
3681 Py_UNICODE *uni2;
3682 Py_UNICODE *collstart;
3683 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003684
Guido van Rossum9e896b32000-04-05 20:11:21 +00003685 if (Py_UNICODE_ISSPACE(ch)) {
3686 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003688 continue;
3689 }
3690 decimal = Py_UNICODE_TODECIMAL(ch);
3691 if (decimal >= 0) {
3692 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003694 continue;
3695 }
Guido van Rossumba477042000-04-06 18:18:10 +00003696 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003697 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003699 continue;
3700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 /* All other characters are considered unencodable */
3702 collstart = p;
3703 collend = p+1;
3704 while (collend < end) {
3705 if ((0 < *collend && *collend < 256) ||
3706 !Py_UNICODE_ISSPACE(*collend) ||
3707 Py_UNICODE_TODECIMAL(*collend))
3708 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 /* cache callback name lookup
3711 * (if not done yet, i.e. it's the first error) */
3712 if (known_errorHandler==-1) {
3713 if ((errors==NULL) || (!strcmp(errors, "strict")))
3714 known_errorHandler = 1;
3715 else if (!strcmp(errors, "replace"))
3716 known_errorHandler = 2;
3717 else if (!strcmp(errors, "ignore"))
3718 known_errorHandler = 3;
3719 else if (!strcmp(errors, "xmlcharrefreplace"))
3720 known_errorHandler = 4;
3721 else
3722 known_errorHandler = 0;
3723 }
3724 switch (known_errorHandler) {
3725 case 1: /* strict */
3726 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3727 goto onError;
3728 case 2: /* replace */
3729 for (p = collstart; p < collend; ++p)
3730 *output++ = '?';
3731 /* fall through */
3732 case 3: /* ignore */
3733 p = collend;
3734 break;
3735 case 4: /* xmlcharrefreplace */
3736 /* generate replacement (temporarily (mis)uses p) */
3737 for (p = collstart; p < collend; ++p)
3738 output += sprintf(output, "&#%d;", (int)*p);
3739 p = collend;
3740 break;
3741 default:
3742 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3743 encoding, reason, s, length, &exc,
3744 collstart-s, collend-s, &newpos);
3745 if (repunicode == NULL)
3746 goto onError;
3747 /* generate replacement */
3748 repsize = PyUnicode_GET_SIZE(repunicode);
3749 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3750 Py_UNICODE ch = *uni2;
3751 if (Py_UNICODE_ISSPACE(ch))
3752 *output++ = ' ';
3753 else {
3754 decimal = Py_UNICODE_TODECIMAL(ch);
3755 if (decimal >= 0)
3756 *output++ = '0' + decimal;
3757 else if (0 < ch && ch < 256)
3758 *output++ = (char)ch;
3759 else {
3760 Py_DECREF(repunicode);
3761 raise_encode_exception(&exc, encoding,
3762 s, length, collstart-s, collend-s, reason);
3763 goto onError;
3764 }
3765 }
3766 }
3767 p = s + newpos;
3768 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003769 }
3770 }
3771 /* 0-terminate the output string */
3772 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 Py_XDECREF(exc);
3774 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003775 return 0;
3776
3777 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 Py_XDECREF(exc);
3779 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003780 return -1;
3781}
3782
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783/* --- Helpers ------------------------------------------------------------ */
3784
Tim Petersced69f82003-09-16 20:30:58 +00003785static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786int count(PyUnicodeObject *self,
3787 int start,
3788 int end,
3789 PyUnicodeObject *substring)
3790{
3791 int count = 0;
3792
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003793 if (start < 0)
3794 start += self->length;
3795 if (start < 0)
3796 start = 0;
3797 if (end > self->length)
3798 end = self->length;
3799 if (end < 0)
3800 end += self->length;
3801 if (end < 0)
3802 end = 0;
3803
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003804 if (substring->length == 0)
3805 return (end - start + 1);
3806
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807 end -= substring->length;
3808
3809 while (start <= end)
3810 if (Py_UNICODE_MATCH(self, start, substring)) {
3811 count++;
3812 start += substring->length;
3813 } else
3814 start++;
3815
3816 return count;
3817}
3818
3819int PyUnicode_Count(PyObject *str,
3820 PyObject *substr,
3821 int start,
3822 int end)
3823{
3824 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003825
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 str = PyUnicode_FromObject(str);
3827 if (str == NULL)
3828 return -1;
3829 substr = PyUnicode_FromObject(substr);
3830 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003831 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 return -1;
3833 }
Tim Petersced69f82003-09-16 20:30:58 +00003834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 result = count((PyUnicodeObject *)str,
3836 start, end,
3837 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 Py_DECREF(str);
3840 Py_DECREF(substr);
3841 return result;
3842}
3843
Tim Petersced69f82003-09-16 20:30:58 +00003844static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845int findstring(PyUnicodeObject *self,
3846 PyUnicodeObject *substring,
3847 int start,
3848 int end,
3849 int direction)
3850{
3851 if (start < 0)
3852 start += self->length;
3853 if (start < 0)
3854 start = 0;
3855
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 if (end > self->length)
3857 end = self->length;
3858 if (end < 0)
3859 end += self->length;
3860 if (end < 0)
3861 end = 0;
3862
Guido van Rossum76afbd92002-08-20 17:29:29 +00003863 if (substring->length == 0)
3864 return (direction > 0) ? start : end;
3865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 end -= substring->length;
3867
3868 if (direction < 0) {
3869 for (; end >= start; end--)
3870 if (Py_UNICODE_MATCH(self, end, substring))
3871 return end;
3872 } else {
3873 for (; start <= end; start++)
3874 if (Py_UNICODE_MATCH(self, start, substring))
3875 return start;
3876 }
3877
3878 return -1;
3879}
3880
3881int PyUnicode_Find(PyObject *str,
3882 PyObject *substr,
3883 int start,
3884 int end,
3885 int direction)
3886{
3887 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003888
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 str = PyUnicode_FromObject(str);
3890 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003891 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 substr = PyUnicode_FromObject(substr);
3893 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003894 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003895 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 }
Tim Petersced69f82003-09-16 20:30:58 +00003897
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 result = findstring((PyUnicodeObject *)str,
3899 (PyUnicodeObject *)substr,
3900 start, end, direction);
3901 Py_DECREF(str);
3902 Py_DECREF(substr);
3903 return result;
3904}
3905
Tim Petersced69f82003-09-16 20:30:58 +00003906static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907int tailmatch(PyUnicodeObject *self,
3908 PyUnicodeObject *substring,
3909 int start,
3910 int end,
3911 int direction)
3912{
3913 if (start < 0)
3914 start += self->length;
3915 if (start < 0)
3916 start = 0;
3917
3918 if (substring->length == 0)
3919 return 1;
3920
3921 if (end > self->length)
3922 end = self->length;
3923 if (end < 0)
3924 end += self->length;
3925 if (end < 0)
3926 end = 0;
3927
3928 end -= substring->length;
3929 if (end < start)
3930 return 0;
3931
3932 if (direction > 0) {
3933 if (Py_UNICODE_MATCH(self, end, substring))
3934 return 1;
3935 } else {
3936 if (Py_UNICODE_MATCH(self, start, substring))
3937 return 1;
3938 }
3939
3940 return 0;
3941}
3942
3943int PyUnicode_Tailmatch(PyObject *str,
3944 PyObject *substr,
3945 int start,
3946 int end,
3947 int direction)
3948{
3949 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003950
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 str = PyUnicode_FromObject(str);
3952 if (str == NULL)
3953 return -1;
3954 substr = PyUnicode_FromObject(substr);
3955 if (substr == NULL) {
3956 Py_DECREF(substr);
3957 return -1;
3958 }
Tim Petersced69f82003-09-16 20:30:58 +00003959
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 result = tailmatch((PyUnicodeObject *)str,
3961 (PyUnicodeObject *)substr,
3962 start, end, direction);
3963 Py_DECREF(str);
3964 Py_DECREF(substr);
3965 return result;
3966}
3967
Tim Petersced69f82003-09-16 20:30:58 +00003968static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969const Py_UNICODE *findchar(const Py_UNICODE *s,
3970 int size,
3971 Py_UNICODE ch)
3972{
3973 /* like wcschr, but doesn't stop at NULL characters */
3974
3975 while (size-- > 0) {
3976 if (*s == ch)
3977 return s;
3978 s++;
3979 }
3980
3981 return NULL;
3982}
3983
3984/* Apply fixfct filter to the Unicode object self and return a
3985 reference to the modified object */
3986
Tim Petersced69f82003-09-16 20:30:58 +00003987static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988PyObject *fixup(PyUnicodeObject *self,
3989 int (*fixfct)(PyUnicodeObject *s))
3990{
3991
3992 PyUnicodeObject *u;
3993
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003994 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 if (u == NULL)
3996 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003997
3998 Py_UNICODE_COPY(u->str, self->str, self->length);
3999
Tim Peters7a29bd52001-09-12 03:03:31 +00004000 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 /* fixfct should return TRUE if it modified the buffer. If
4002 FALSE, return a reference to the original buffer instead
4003 (to save space, not time) */
4004 Py_INCREF(self);
4005 Py_DECREF(u);
4006 return (PyObject*) self;
4007 }
4008 return (PyObject*) u;
4009}
4010
Tim Petersced69f82003-09-16 20:30:58 +00004011static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012int fixupper(PyUnicodeObject *self)
4013{
4014 int len = self->length;
4015 Py_UNICODE *s = self->str;
4016 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004017
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 while (len-- > 0) {
4019 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004020
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 ch = Py_UNICODE_TOUPPER(*s);
4022 if (ch != *s) {
4023 status = 1;
4024 *s = ch;
4025 }
4026 s++;
4027 }
4028
4029 return status;
4030}
4031
Tim Petersced69f82003-09-16 20:30:58 +00004032static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033int fixlower(PyUnicodeObject *self)
4034{
4035 int len = self->length;
4036 Py_UNICODE *s = self->str;
4037 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004038
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 while (len-- > 0) {
4040 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004041
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 ch = Py_UNICODE_TOLOWER(*s);
4043 if (ch != *s) {
4044 status = 1;
4045 *s = ch;
4046 }
4047 s++;
4048 }
4049
4050 return status;
4051}
4052
Tim Petersced69f82003-09-16 20:30:58 +00004053static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054int fixswapcase(PyUnicodeObject *self)
4055{
4056 int len = self->length;
4057 Py_UNICODE *s = self->str;
4058 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004059
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 while (len-- > 0) {
4061 if (Py_UNICODE_ISUPPER(*s)) {
4062 *s = Py_UNICODE_TOLOWER(*s);
4063 status = 1;
4064 } else if (Py_UNICODE_ISLOWER(*s)) {
4065 *s = Py_UNICODE_TOUPPER(*s);
4066 status = 1;
4067 }
4068 s++;
4069 }
4070
4071 return status;
4072}
4073
Tim Petersced69f82003-09-16 20:30:58 +00004074static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075int fixcapitalize(PyUnicodeObject *self)
4076{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004077 int len = self->length;
4078 Py_UNICODE *s = self->str;
4079 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004080
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004081 if (len == 0)
4082 return 0;
4083 if (Py_UNICODE_ISLOWER(*s)) {
4084 *s = Py_UNICODE_TOUPPER(*s);
4085 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004087 s++;
4088 while (--len > 0) {
4089 if (Py_UNICODE_ISUPPER(*s)) {
4090 *s = Py_UNICODE_TOLOWER(*s);
4091 status = 1;
4092 }
4093 s++;
4094 }
4095 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096}
4097
4098static
4099int fixtitle(PyUnicodeObject *self)
4100{
4101 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4102 register Py_UNICODE *e;
4103 int previous_is_cased;
4104
4105 /* Shortcut for single character strings */
4106 if (PyUnicode_GET_SIZE(self) == 1) {
4107 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4108 if (*p != ch) {
4109 *p = ch;
4110 return 1;
4111 }
4112 else
4113 return 0;
4114 }
Tim Petersced69f82003-09-16 20:30:58 +00004115
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 e = p + PyUnicode_GET_SIZE(self);
4117 previous_is_cased = 0;
4118 for (; p < e; p++) {
4119 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004120
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 if (previous_is_cased)
4122 *p = Py_UNICODE_TOLOWER(ch);
4123 else
4124 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004125
4126 if (Py_UNICODE_ISLOWER(ch) ||
4127 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 Py_UNICODE_ISTITLE(ch))
4129 previous_is_cased = 1;
4130 else
4131 previous_is_cased = 0;
4132 }
4133 return 1;
4134}
4135
Tim Peters8ce9f162004-08-27 01:49:32 +00004136PyObject *
4137PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138{
Tim Peters8ce9f162004-08-27 01:49:32 +00004139 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004140 const Py_UNICODE blank = ' ';
4141 const Py_UNICODE *sep = &blank;
4142 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004143 PyUnicodeObject *res = NULL; /* the result */
4144 size_t res_alloc = 100; /* # allocated bytes for string in res */
4145 size_t res_used; /* # used bytes */
4146 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4147 PyObject *fseq; /* PySequence_Fast(seq) */
4148 int seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004149 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 int i;
4151
Tim Peters05eba1f2004-08-27 21:32:02 +00004152 fseq = PySequence_Fast(seq, "");
4153 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004154 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004155 }
4156
Tim Peters91879ab2004-08-27 22:35:44 +00004157 /* Grrrr. A codec may be invoked to convert str objects to
4158 * Unicode, and so it's possible to call back into Python code
4159 * during PyUnicode_FromObject(), and so it's possible for a sick
4160 * codec to change the size of fseq (if seq is a list). Therefore
4161 * we have to keep refetching the size -- can't assume seqlen
4162 * is invariant.
4163 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004164 seqlen = PySequence_Fast_GET_SIZE(fseq);
4165 /* If empty sequence, return u"". */
4166 if (seqlen == 0) {
4167 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4168 goto Done;
4169 }
4170 /* If singleton sequence with an exact Unicode, return that. */
4171 if (seqlen == 1) {
4172 item = PySequence_Fast_GET_ITEM(fseq, 0);
4173 if (PyUnicode_CheckExact(item)) {
4174 Py_INCREF(item);
4175 res = (PyUnicodeObject *)item;
4176 goto Done;
4177 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004178 }
4179
Tim Peters05eba1f2004-08-27 21:32:02 +00004180 /* At least two items to join, or one that isn't exact Unicode. */
4181 if (seqlen > 1) {
4182 /* Set up sep and seplen -- they're needed. */
4183 if (separator == NULL) {
4184 sep = &blank;
4185 seplen = 1;
4186 }
4187 else {
4188 internal_separator = PyUnicode_FromObject(separator);
4189 if (internal_separator == NULL)
4190 goto onError;
4191 sep = PyUnicode_AS_UNICODE(internal_separator);
4192 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004193 /* In case PyUnicode_FromObject() mutated seq. */
4194 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004195 }
4196 }
4197
4198 /* Get space. */
4199 res = _PyUnicode_New((int)res_alloc);
4200 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004201 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004202 res_p = PyUnicode_AS_UNICODE(res);
4203 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004204
Tim Peters05eba1f2004-08-27 21:32:02 +00004205 for (i = 0; i < seqlen; ++i) {
4206 size_t itemlen;
4207 size_t new_res_used;
4208
4209 item = PySequence_Fast_GET_ITEM(fseq, i);
4210 /* Convert item to Unicode. */
4211 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4212 PyErr_Format(PyExc_TypeError,
4213 "sequence item %i: expected string or Unicode,"
4214 " %.80s found",
4215 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004216 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004217 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004218 item = PyUnicode_FromObject(item);
4219 if (item == NULL)
4220 goto onError;
4221 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004222
Tim Peters91879ab2004-08-27 22:35:44 +00004223 /* In case PyUnicode_FromObject() mutated seq. */
4224 seqlen = PySequence_Fast_GET_SIZE(fseq);
4225
Tim Peters8ce9f162004-08-27 01:49:32 +00004226 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004228 new_res_used = res_used + itemlen;
4229 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004230 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004231 if (i < seqlen - 1) {
4232 new_res_used += seplen;
4233 if (new_res_used < res_used || new_res_used > INT_MAX)
4234 goto Overflow;
4235 }
4236 if (new_res_used > res_alloc) {
4237 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004238 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004239 size_t oldsize = res_alloc;
4240 res_alloc += res_alloc;
4241 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004242 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004243 } while (new_res_used > res_alloc);
4244 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004245 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004247 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004248 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004250
4251 /* Copy item, and maybe the separator. */
4252 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4253 res_p += itemlen;
4254 if (i < seqlen - 1) {
4255 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4256 res_p += seplen;
4257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004259 res_used = new_res_used;
4260 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004261
Tim Peters05eba1f2004-08-27 21:32:02 +00004262 /* Shrink res to match the used area; this probably can't fail,
4263 * but it's cheap to check.
4264 */
4265 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004266 goto onError;
4267
4268 Done:
4269 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004270 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 return (PyObject *)res;
4272
Tim Peters8ce9f162004-08-27 01:49:32 +00004273 Overflow:
4274 PyErr_SetString(PyExc_OverflowError,
4275 "join() is too long for a Python string");
4276 Py_DECREF(item);
4277 /* fall through */
4278
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004280 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004281 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004282 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 return NULL;
4284}
4285
Tim Petersced69f82003-09-16 20:30:58 +00004286static
4287PyUnicodeObject *pad(PyUnicodeObject *self,
4288 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 int right,
4290 Py_UNICODE fill)
4291{
4292 PyUnicodeObject *u;
4293
4294 if (left < 0)
4295 left = 0;
4296 if (right < 0)
4297 right = 0;
4298
Tim Peters7a29bd52001-09-12 03:03:31 +00004299 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 Py_INCREF(self);
4301 return self;
4302 }
4303
4304 u = _PyUnicode_New(left + self->length + right);
4305 if (u) {
4306 if (left)
4307 Py_UNICODE_FILL(u->str, fill, left);
4308 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4309 if (right)
4310 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4311 }
4312
4313 return u;
4314}
4315
/* Append the slice data[left:right] to `list` as a new Unicode object.
 *
 * The expanding function must provide the variables `str` and `list`
 * and a label `onError`, which is jumped to when creating the slice or
 * appending it fails.  The temporary reference held in `str` is dropped
 * on both the success and the failure path.
 *
 * Wrapped in do { } while (0) so that the expansion is a single
 * statement and cannot capture a following `else`.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4326
/* Insert the slice data[left:right] at the front of `list` as a new
 * Unicode object.
 *
 * The expanding function must provide the variables `str` and `list`
 * and a label `onError`, which is jumped to when creating the slice or
 * inserting it fails.  The temporary reference held in `str` is dropped
 * on both the success and the failure path.
 *
 * Wrapped in do { } while (0) so that the expansion is a single
 * statement and cannot capture a following `else`.
 */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4337
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338static
4339PyObject *split_whitespace(PyUnicodeObject *self,
4340 PyObject *list,
4341 int maxcount)
4342{
4343 register int i;
4344 register int j;
4345 int len = self->length;
4346 PyObject *str;
4347
4348 for (i = j = 0; i < len; ) {
4349 /* find a token */
4350 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4351 i++;
4352 j = i;
4353 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4354 i++;
4355 if (j < i) {
4356 if (maxcount-- <= 0)
4357 break;
4358 SPLIT_APPEND(self->str, j, i);
4359 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4360 i++;
4361 j = i;
4362 }
4363 }
4364 if (j < len) {
4365 SPLIT_APPEND(self->str, j, len);
4366 }
4367 return list;
4368
4369 onError:
4370 Py_DECREF(list);
4371 return NULL;
4372}
4373
4374PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004375 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376{
4377 register int i;
4378 register int j;
4379 int len;
4380 PyObject *list;
4381 PyObject *str;
4382 Py_UNICODE *data;
4383
4384 string = PyUnicode_FromObject(string);
4385 if (string == NULL)
4386 return NULL;
4387 data = PyUnicode_AS_UNICODE(string);
4388 len = PyUnicode_GET_SIZE(string);
4389
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 list = PyList_New(0);
4391 if (!list)
4392 goto onError;
4393
4394 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004395 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004396
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 /* Find a line and append it */
4398 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4399 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400
4401 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004402 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 if (i < len) {
4404 if (data[i] == '\r' && i + 1 < len &&
4405 data[i+1] == '\n')
4406 i += 2;
4407 else
4408 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004409 if (keepends)
4410 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 }
Guido van Rossum86662912000-04-11 15:38:46 +00004412 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 j = i;
4414 }
4415 if (j < len) {
4416 SPLIT_APPEND(data, j, len);
4417 }
4418
4419 Py_DECREF(string);
4420 return list;
4421
4422 onError:
4423 Py_DECREF(list);
4424 Py_DECREF(string);
4425 return NULL;
4426}
4427
Tim Petersced69f82003-09-16 20:30:58 +00004428static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429PyObject *split_char(PyUnicodeObject *self,
4430 PyObject *list,
4431 Py_UNICODE ch,
4432 int maxcount)
4433{
4434 register int i;
4435 register int j;
4436 int len = self->length;
4437 PyObject *str;
4438
4439 for (i = j = 0; i < len; ) {
4440 if (self->str[i] == ch) {
4441 if (maxcount-- <= 0)
4442 break;
4443 SPLIT_APPEND(self->str, j, i);
4444 i = j = i + 1;
4445 } else
4446 i++;
4447 }
4448 if (j <= len) {
4449 SPLIT_APPEND(self->str, j, len);
4450 }
4451 return list;
4452
4453 onError:
4454 Py_DECREF(list);
4455 return NULL;
4456}
4457
Tim Petersced69f82003-09-16 20:30:58 +00004458static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459PyObject *split_substring(PyUnicodeObject *self,
4460 PyObject *list,
4461 PyUnicodeObject *substring,
4462 int maxcount)
4463{
4464 register int i;
4465 register int j;
4466 int len = self->length;
4467 int sublen = substring->length;
4468 PyObject *str;
4469
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004470 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 if (Py_UNICODE_MATCH(self, i, substring)) {
4472 if (maxcount-- <= 0)
4473 break;
4474 SPLIT_APPEND(self->str, j, i);
4475 i = j = i + sublen;
4476 } else
4477 i++;
4478 }
4479 if (j <= len) {
4480 SPLIT_APPEND(self->str, j, len);
4481 }
4482 return list;
4483
4484 onError:
4485 Py_DECREF(list);
4486 return NULL;
4487}
4488
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004489static
4490PyObject *rsplit_whitespace(PyUnicodeObject *self,
4491 PyObject *list,
4492 int maxcount)
4493{
4494 register int i;
4495 register int j;
4496 int len = self->length;
4497 PyObject *str;
4498
4499 for (i = j = len - 1; i >= 0; ) {
4500 /* find a token */
4501 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4502 i--;
4503 j = i;
4504 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4505 i--;
4506 if (j > i) {
4507 if (maxcount-- <= 0)
4508 break;
4509 SPLIT_INSERT(self->str, i + 1, j + 1);
4510 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4511 i--;
4512 j = i;
4513 }
4514 }
4515 if (j >= 0) {
4516 SPLIT_INSERT(self->str, 0, j + 1);
4517 }
4518 return list;
4519
4520 onError:
4521 Py_DECREF(list);
4522 return NULL;
4523}
4524
4525static
4526PyObject *rsplit_char(PyUnicodeObject *self,
4527 PyObject *list,
4528 Py_UNICODE ch,
4529 int maxcount)
4530{
4531 register int i;
4532 register int j;
4533 int len = self->length;
4534 PyObject *str;
4535
4536 for (i = j = len - 1; i >= 0; ) {
4537 if (self->str[i] == ch) {
4538 if (maxcount-- <= 0)
4539 break;
4540 SPLIT_INSERT(self->str, i + 1, j + 1);
4541 j = i = i - 1;
4542 } else
4543 i--;
4544 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004545 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004546 SPLIT_INSERT(self->str, 0, j + 1);
4547 }
4548 return list;
4549
4550 onError:
4551 Py_DECREF(list);
4552 return NULL;
4553}
4554
4555static
4556PyObject *rsplit_substring(PyUnicodeObject *self,
4557 PyObject *list,
4558 PyUnicodeObject *substring,
4559 int maxcount)
4560{
4561 register int i;
4562 register int j;
4563 int len = self->length;
4564 int sublen = substring->length;
4565 PyObject *str;
4566
4567 for (i = len - sublen, j = len; i >= 0; ) {
4568 if (Py_UNICODE_MATCH(self, i, substring)) {
4569 if (maxcount-- <= 0)
4570 break;
4571 SPLIT_INSERT(self->str, i + sublen, j);
4572 j = i;
4573 i -= sublen;
4574 } else
4575 i--;
4576 }
4577 if (j >= 0) {
4578 SPLIT_INSERT(self->str, 0, j);
4579 }
4580 return list;
4581
4582 onError:
4583 Py_DECREF(list);
4584 return NULL;
4585}
4586
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004588#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589
4590static
4591PyObject *split(PyUnicodeObject *self,
4592 PyUnicodeObject *substring,
4593 int maxcount)
4594{
4595 PyObject *list;
4596
4597 if (maxcount < 0)
4598 maxcount = INT_MAX;
4599
4600 list = PyList_New(0);
4601 if (!list)
4602 return NULL;
4603
4604 if (substring == NULL)
4605 return split_whitespace(self,list,maxcount);
4606
4607 else if (substring->length == 1)
4608 return split_char(self,list,substring->str[0],maxcount);
4609
4610 else if (substring->length == 0) {
4611 Py_DECREF(list);
4612 PyErr_SetString(PyExc_ValueError, "empty separator");
4613 return NULL;
4614 }
4615 else
4616 return split_substring(self,list,substring,maxcount);
4617}
4618
Tim Petersced69f82003-09-16 20:30:58 +00004619static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004620PyObject *rsplit(PyUnicodeObject *self,
4621 PyUnicodeObject *substring,
4622 int maxcount)
4623{
4624 PyObject *list;
4625
4626 if (maxcount < 0)
4627 maxcount = INT_MAX;
4628
4629 list = PyList_New(0);
4630 if (!list)
4631 return NULL;
4632
4633 if (substring == NULL)
4634 return rsplit_whitespace(self,list,maxcount);
4635
4636 else if (substring->length == 1)
4637 return rsplit_char(self,list,substring->str[0],maxcount);
4638
4639 else if (substring->length == 0) {
4640 Py_DECREF(list);
4641 PyErr_SetString(PyExc_ValueError, "empty separator");
4642 return NULL;
4643 }
4644 else
4645 return rsplit_substring(self,list,substring,maxcount);
4646}
4647
4648static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649PyObject *replace(PyUnicodeObject *self,
4650 PyUnicodeObject *str1,
4651 PyUnicodeObject *str2,
4652 int maxcount)
4653{
4654 PyUnicodeObject *u;
4655
4656 if (maxcount < 0)
4657 maxcount = INT_MAX;
4658
4659 if (str1->length == 1 && str2->length == 1) {
4660 int i;
4661
4662 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004663 if (!findchar(self->str, self->length, str1->str[0]) &&
4664 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665 /* nothing to replace, return original string */
4666 Py_INCREF(self);
4667 u = self;
4668 } else {
4669 Py_UNICODE u1 = str1->str[0];
4670 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004671
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004673 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 self->length
4675 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004676 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004677 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004678 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 for (i = 0; i < u->length; i++)
4680 if (u->str[i] == u1) {
4681 if (--maxcount < 0)
4682 break;
4683 u->str[i] = u2;
4684 }
4685 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687
4688 } else {
4689 int n, i;
4690 Py_UNICODE *p;
4691
4692 /* replace strings */
4693 n = count(self, 0, self->length, str1);
4694 if (n > maxcount)
4695 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004696 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004698 if (PyUnicode_CheckExact(self)) {
4699 Py_INCREF(self);
4700 u = self;
4701 }
4702 else {
4703 u = (PyUnicodeObject *)
4704 PyUnicode_FromUnicode(self->str, self->length);
4705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 } else {
4707 u = _PyUnicode_New(
4708 self->length + n * (str2->length - str1->length));
4709 if (u) {
4710 i = 0;
4711 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004712 if (str1->length > 0) {
4713 while (i <= self->length - str1->length)
4714 if (Py_UNICODE_MATCH(self, i, str1)) {
4715 /* replace string segment */
4716 Py_UNICODE_COPY(p, str2->str, str2->length);
4717 p += str2->length;
4718 i += str1->length;
4719 if (--n <= 0) {
4720 /* copy remaining part */
4721 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4722 break;
4723 }
4724 } else
4725 *p++ = self->str[i++];
4726 } else {
4727 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 Py_UNICODE_COPY(p, str2->str, str2->length);
4729 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004730 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004733 }
4734 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 }
4737 }
4738 }
Tim Petersced69f82003-09-16 20:30:58 +00004739
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 return (PyObject *) u;
4741}
4742
4743/* --- Unicode Object Methods --------------------------------------------- */
4744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004745PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746"S.title() -> unicode\n\
4747\n\
4748Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004749characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750
4751static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004752unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 return fixup(self, fixtitle);
4755}
4756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004757PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758"S.capitalize() -> unicode\n\
4759\n\
4760Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004761have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762
4763static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004764unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 return fixup(self, fixcapitalize);
4767}
4768
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
 *
 * Splits self at whitespace, runs fixcapitalize over every word via
 * fixup(), and joins the words again with the default separator of
 * PyUnicode_Join().  Returns the joined string, or NULL on error.
 */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                                fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4806
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004807/* Argument converter. Coerces to a single unicode character */
4808
4809static int
4810convert_uc(PyObject *obj, void *addr)
4811{
4812 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4813 PyObject *uniobj;
4814 Py_UNICODE *unistr;
4815
4816 uniobj = PyUnicode_FromObject(obj);
4817 if (uniobj == NULL) {
4818 PyErr_SetString(PyExc_TypeError,
4819 "The fill character cannot be converted to Unicode");
4820 return 0;
4821 }
4822 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4823 PyErr_SetString(PyExc_TypeError,
4824 "The fill character must be exactly one character long");
4825 Py_DECREF(uniobj);
4826 return 0;
4827 }
4828 unistr = PyUnicode_AS_UNICODE(uniobj);
4829 *fillcharloc = unistr[0];
4830 Py_DECREF(uniobj);
4831 return 1;
4832}
4833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004834PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004835"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004837Return S centered in a Unicode string of length width. Padding is\n\
4838done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839
4840static PyObject *
4841unicode_center(PyUnicodeObject *self, PyObject *args)
4842{
4843 int marg, left;
4844 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004845 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004847 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 return NULL;
4849
Tim Peters7a29bd52001-09-12 03:03:31 +00004850 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 Py_INCREF(self);
4852 return (PyObject*) self;
4853 }
4854
4855 marg = width - self->length;
4856 left = marg / 2 + (marg & width & 1);
4857
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004858 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859}
4860
Marc-André Lemburge5034372000-08-08 08:04:29 +00004861#if 0
4862
4863/* This code should go into some future Unicode collation support
4864 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004865 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004866
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004867/* speedy UTF-16 code point order comparison */
4868/* gleaned from: */
4869/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4870
/* Offsets used by the (disabled) UTF-16 aware unicode_compare() to
 * reorder code units into code point order.  The table is indexed by
 * c >> 11: slot 27 (0xD800..0xDFFF) is shifted up by 0x2000, slots
 * 28..31 (0xE000..0xFFFF) are shifted down by 0x800, all others are
 * left alone.
 * NOTE(review): the 32 slots only cover 16-bit values of c -- confirm
 * Py_UNICODE cannot be wider wherever this is enabled.
 */
static short utf16Fixup[32] =
{
    /*  0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8..15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16..23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24..31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
4878
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879static int
4880unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4881{
4882 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004883
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 Py_UNICODE *s1 = str1->str;
4885 Py_UNICODE *s2 = str2->str;
4886
4887 len1 = str1->length;
4888 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004889
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004891 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004892
4893 c1 = *s1++;
4894 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004895
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004896 if (c1 > (1<<11) * 26)
4897 c1 += utf16Fixup[c1>>11];
4898 if (c2 > (1<<11) * 26)
4899 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004900 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004901
4902 if (c1 != c2)
4903 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004904
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004905 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906 }
4907
4908 return (len1 < len2) ? -1 : (len1 != len2);
4909}
4910
Marc-André Lemburge5034372000-08-08 08:04:29 +00004911#else
4912
4913static int
4914unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4915{
4916 register int len1, len2;
4917
4918 Py_UNICODE *s1 = str1->str;
4919 Py_UNICODE *s2 = str2->str;
4920
4921 len1 = str1->length;
4922 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004923
Marc-André Lemburge5034372000-08-08 08:04:29 +00004924 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004925 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004926
Fredrik Lundh45714e92001-06-26 16:39:36 +00004927 c1 = *s1++;
4928 c2 = *s2++;
4929
4930 if (c1 != c2)
4931 return (c1 < c2) ? -1 : 1;
4932
Marc-André Lemburge5034372000-08-08 08:04:29 +00004933 len1--; len2--;
4934 }
4935
4936 return (len1 < len2) ? -1 : (len1 != len2);
4937}
4938
4939#endif
4940
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941int PyUnicode_Compare(PyObject *left,
4942 PyObject *right)
4943{
4944 PyUnicodeObject *u = NULL, *v = NULL;
4945 int result;
4946
4947 /* Coerce the two arguments */
4948 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4949 if (u == NULL)
4950 goto onError;
4951 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4952 if (v == NULL)
4953 goto onError;
4954
Thomas Wouters7e474022000-07-16 12:04:32 +00004955 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 if (v == u) {
4957 Py_DECREF(u);
4958 Py_DECREF(v);
4959 return 0;
4960 }
4961
4962 result = unicode_compare(u, v);
4963
4964 Py_DECREF(u);
4965 Py_DECREF(v);
4966 return result;
4967
4968onError:
4969 Py_XDECREF(u);
4970 Py_XDECREF(v);
4971 return -1;
4972}
4973
Guido van Rossum403d68b2000-03-13 15:55:09 +00004974int PyUnicode_Contains(PyObject *container,
4975 PyObject *element)
4976{
4977 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004978 int result, size;
4979 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004980
4981 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004982 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004983 if (v == NULL) {
4984 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004985 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004986 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004987 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004988 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004989 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004990 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004991
Barry Warsaw817918c2002-08-06 16:58:21 +00004992 size = PyUnicode_GET_SIZE(v);
4993 rhs = PyUnicode_AS_UNICODE(v);
4994 lhs = PyUnicode_AS_UNICODE(u);
4995
Guido van Rossum403d68b2000-03-13 15:55:09 +00004996 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004997 if (size == 1) {
4998 end = lhs + PyUnicode_GET_SIZE(u);
4999 while (lhs < end) {
5000 if (*lhs++ == *rhs) {
5001 result = 1;
5002 break;
5003 }
5004 }
5005 }
5006 else {
5007 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5008 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005009 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005010 result = 1;
5011 break;
5012 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005013 }
5014 }
5015
5016 Py_DECREF(u);
5017 Py_DECREF(v);
5018 return result;
5019
5020onError:
5021 Py_XDECREF(u);
5022 Py_XDECREF(v);
5023 return -1;
5024}
5025
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026/* Concat to string or Unicode object giving a new Unicode object. */
5027
5028PyObject *PyUnicode_Concat(PyObject *left,
5029 PyObject *right)
5030{
5031 PyUnicodeObject *u = NULL, *v = NULL, *w;
5032
5033 /* Coerce the two arguments */
5034 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5035 if (u == NULL)
5036 goto onError;
5037 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5038 if (v == NULL)
5039 goto onError;
5040
5041 /* Shortcuts */
5042 if (v == unicode_empty) {
5043 Py_DECREF(v);
5044 return (PyObject *)u;
5045 }
5046 if (u == unicode_empty) {
5047 Py_DECREF(u);
5048 return (PyObject *)v;
5049 }
5050
5051 /* Concat the two Unicode strings */
5052 w = _PyUnicode_New(u->length + v->length);
5053 if (w == NULL)
5054 goto onError;
5055 Py_UNICODE_COPY(w->str, u->str, u->length);
5056 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5057
5058 Py_DECREF(u);
5059 Py_DECREF(v);
5060 return (PyObject *)w;
5061
5062onError:
5063 Py_XDECREF(u);
5064 Py_XDECREF(v);
5065 return NULL;
5066}
5067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005068PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069"S.count(sub[, start[, end]]) -> int\n\
5070\n\
5071Return the number of occurrences of substring sub in Unicode string\n\
5072S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005073interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074
5075static PyObject *
5076unicode_count(PyUnicodeObject *self, PyObject *args)
5077{
5078 PyUnicodeObject *substring;
5079 int start = 0;
5080 int end = INT_MAX;
5081 PyObject *result;
5082
Guido van Rossumb8872e62000-05-09 14:14:27 +00005083 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5084 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 return NULL;
5086
5087 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5088 (PyObject *)substring);
5089 if (substring == NULL)
5090 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005091
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 if (start < 0)
5093 start += self->length;
5094 if (start < 0)
5095 start = 0;
5096 if (end > self->length)
5097 end = self->length;
5098 if (end < 0)
5099 end += self->length;
5100 if (end < 0)
5101 end = 0;
5102
5103 result = PyInt_FromLong((long) count(self, start, end, substring));
5104
5105 Py_DECREF(substring);
5106 return result;
5107}
5108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005109PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005110"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005112Encodes S using the codec registered for encoding. encoding defaults\n\
5113to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005114handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5116'xmlcharrefreplace' as well as any other name registered with\n\
5117codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118
5119static PyObject *
5120unicode_encode(PyUnicodeObject *self, PyObject *args)
5121{
5122 char *encoding = NULL;
5123 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005124 PyObject *v;
5125
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5127 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005128 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005129 if (v == NULL)
5130 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005131 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5132 PyErr_Format(PyExc_TypeError,
5133 "encoder did not return a string/unicode object "
5134 "(type=%.400s)",
5135 v->ob_type->tp_name);
5136 Py_DECREF(v);
5137 return NULL;
5138 }
5139 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005140
5141 onError:
5142 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005143}
5144
5145PyDoc_STRVAR(decode__doc__,
5146"S.decode([encoding[,errors]]) -> string or unicode\n\
5147\n\
5148Decodes S using the codec registered for encoding. encoding defaults\n\
5149to the default encoding. errors may be given to set a different error\n\
5150handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5151a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5152as well as any other name registerd with codecs.register_error that is\n\
5153able to handle UnicodeDecodeErrors.");
5154
5155static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005156unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005157{
5158 char *encoding = NULL;
5159 char *errors = NULL;
5160 PyObject *v;
5161
5162 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5163 return NULL;
5164 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005165 if (v == NULL)
5166 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005167 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5168 PyErr_Format(PyExc_TypeError,
5169 "decoder did not return a string/unicode object "
5170 "(type=%.400s)",
5171 v->ob_type->tp_name);
5172 Py_DECREF(v);
5173 return NULL;
5174 }
5175 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005176
5177 onError:
5178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179}
5180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005181PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182"S.expandtabs([tabsize]) -> unicode\n\
5183\n\
5184Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005185If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186
5187static PyObject*
5188unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5189{
5190 Py_UNICODE *e;
5191 Py_UNICODE *p;
5192 Py_UNICODE *q;
5193 int i, j;
5194 PyUnicodeObject *u;
5195 int tabsize = 8;
5196
5197 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5198 return NULL;
5199
Thomas Wouters7e474022000-07-16 12:04:32 +00005200 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 i = j = 0;
5202 e = self->str + self->length;
5203 for (p = self->str; p < e; p++)
5204 if (*p == '\t') {
5205 if (tabsize > 0)
5206 j += tabsize - (j % tabsize);
5207 }
5208 else {
5209 j++;
5210 if (*p == '\n' || *p == '\r') {
5211 i += j;
5212 j = 0;
5213 }
5214 }
5215
5216 /* Second pass: create output string and fill it */
5217 u = _PyUnicode_New(i + j);
5218 if (!u)
5219 return NULL;
5220
5221 j = 0;
5222 q = u->str;
5223
5224 for (p = self->str; p < e; p++)
5225 if (*p == '\t') {
5226 if (tabsize > 0) {
5227 i = tabsize - (j % tabsize);
5228 j += i;
5229 while (i--)
5230 *q++ = ' ';
5231 }
5232 }
5233 else {
5234 j++;
5235 *q++ = *p;
5236 if (*p == '\n' || *p == '\r')
5237 j = 0;
5238 }
5239
5240 return (PyObject*) u;
5241}
5242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005243PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244"S.find(sub [,start [,end]]) -> int\n\
5245\n\
5246Return the lowest index in S where substring sub is found,\n\
5247such that sub is contained within s[start,end]. Optional\n\
5248arguments start and end are interpreted as in slice notation.\n\
5249\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005250Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
5252static PyObject *
5253unicode_find(PyUnicodeObject *self, PyObject *args)
5254{
5255 PyUnicodeObject *substring;
5256 int start = 0;
5257 int end = INT_MAX;
5258 PyObject *result;
5259
Guido van Rossumb8872e62000-05-09 14:14:27 +00005260 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5261 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 return NULL;
5263 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5264 (PyObject *)substring);
5265 if (substring == NULL)
5266 return NULL;
5267
5268 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5269
5270 Py_DECREF(substring);
5271 return result;
5272}
5273
5274static PyObject *
5275unicode_getitem(PyUnicodeObject *self, int index)
5276{
5277 if (index < 0 || index >= self->length) {
5278 PyErr_SetString(PyExc_IndexError, "string index out of range");
5279 return NULL;
5280 }
5281
5282 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5283}
5284
5285static long
5286unicode_hash(PyUnicodeObject *self)
5287{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005288 /* Since Unicode objects compare equal to their ASCII string
5289 counterparts, they should use the individual character values
5290 as basis for their hash value. This is needed to assure that
5291 strings and Unicode objects behave in the same way as
5292 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293
Fredrik Lundhdde61642000-07-10 18:27:47 +00005294 register int len;
5295 register Py_UNICODE *p;
5296 register long x;
5297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 if (self->hash != -1)
5299 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005300 len = PyUnicode_GET_SIZE(self);
5301 p = PyUnicode_AS_UNICODE(self);
5302 x = *p << 7;
5303 while (--len >= 0)
5304 x = (1000003*x) ^ *p++;
5305 x ^= PyUnicode_GET_SIZE(self);
5306 if (x == -1)
5307 x = -2;
5308 self->hash = x;
5309 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310}
5311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005312PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313"S.index(sub [,start [,end]]) -> int\n\
5314\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005315Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316
5317static PyObject *
5318unicode_index(PyUnicodeObject *self, PyObject *args)
5319{
5320 int result;
5321 PyUnicodeObject *substring;
5322 int start = 0;
5323 int end = INT_MAX;
5324
Guido van Rossumb8872e62000-05-09 14:14:27 +00005325 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5326 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005328
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5330 (PyObject *)substring);
5331 if (substring == NULL)
5332 return NULL;
5333
5334 result = findstring(self, substring, start, end, 1);
5335
5336 Py_DECREF(substring);
5337 if (result < 0) {
5338 PyErr_SetString(PyExc_ValueError, "substring not found");
5339 return NULL;
5340 }
5341 return PyInt_FromLong(result);
5342}
5343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005344PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005345"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005347Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005348at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
5350static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005351unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352{
5353 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5354 register const Py_UNICODE *e;
5355 int cased;
5356
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 /* Shortcut for single character strings */
5358 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005359 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005361 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005362 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005363 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005364
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 e = p + PyUnicode_GET_SIZE(self);
5366 cased = 0;
5367 for (; p < e; p++) {
5368 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005369
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005371 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 else if (!cased && Py_UNICODE_ISLOWER(ch))
5373 cased = 1;
5374 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005375 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376}
5377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005378PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005379"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005381Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005382at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
5384static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005385unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386{
5387 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5388 register const Py_UNICODE *e;
5389 int cased;
5390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 /* Shortcut for single character strings */
5392 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005393 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005395 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005396 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005397 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 e = p + PyUnicode_GET_SIZE(self);
5400 cased = 0;
5401 for (; p < e; p++) {
5402 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005403
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 else if (!cased && Py_UNICODE_ISUPPER(ch))
5407 cased = 1;
5408 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005409 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410}
5411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005412PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005413"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005415Return True if S is a titlecased string and there is at least one\n\
5416character in S, i.e. upper- and titlecase characters may only\n\
5417follow uncased characters and lowercase characters only cased ones.\n\
5418Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419
5420static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005421unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422{
5423 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5424 register const Py_UNICODE *e;
5425 int cased, previous_is_cased;
5426
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 /* Shortcut for single character strings */
5428 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005429 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5430 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005432 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005433 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005434 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 e = p + PyUnicode_GET_SIZE(self);
5437 cased = 0;
5438 previous_is_cased = 0;
5439 for (; p < e; p++) {
5440 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005441
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5443 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005444 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 previous_is_cased = 1;
5446 cased = 1;
5447 }
5448 else if (Py_UNICODE_ISLOWER(ch)) {
5449 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005450 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 previous_is_cased = 1;
5452 cased = 1;
5453 }
5454 else
5455 previous_is_cased = 0;
5456 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005457 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458}
5459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005460PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005461"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005463Return True if all characters in S are whitespace\n\
5464and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465
5466static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005467unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468{
5469 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5470 register const Py_UNICODE *e;
5471
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 /* Shortcut for single character strings */
5473 if (PyUnicode_GET_SIZE(self) == 1 &&
5474 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005475 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005477 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005478 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005479 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 e = p + PyUnicode_GET_SIZE(self);
5482 for (; p < e; p++) {
5483 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005484 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005486 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487}
5488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005489PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005490"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005491\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005492Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005493and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005494
5495static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005496unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005497{
5498 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5499 register const Py_UNICODE *e;
5500
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005501 /* Shortcut for single character strings */
5502 if (PyUnicode_GET_SIZE(self) == 1 &&
5503 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005504 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005505
5506 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005507 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005508 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005509
5510 e = p + PyUnicode_GET_SIZE(self);
5511 for (; p < e; p++) {
5512 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005513 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005514 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005515 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005516}
5517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005518PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005519"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005520\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005521Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005522and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005523
5524static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005525unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005526{
5527 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5528 register const Py_UNICODE *e;
5529
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005530 /* Shortcut for single character strings */
5531 if (PyUnicode_GET_SIZE(self) == 1 &&
5532 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005533 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005534
5535 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005536 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005537 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005538
5539 e = p + PyUnicode_GET_SIZE(self);
5540 for (; p < e; p++) {
5541 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005542 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005543 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005544 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005545}
5546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005547PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005548"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005550Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005551False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552
5553static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005554unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555{
5556 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5557 register const Py_UNICODE *e;
5558
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 /* Shortcut for single character strings */
5560 if (PyUnicode_GET_SIZE(self) == 1 &&
5561 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005562 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005564 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005565 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005566 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005567
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 e = p + PyUnicode_GET_SIZE(self);
5569 for (; p < e; p++) {
5570 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005571 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005573 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574}
5575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005576PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005577"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005579Return True if all characters in S are digits\n\
5580and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581
5582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005583unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584{
5585 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5586 register const Py_UNICODE *e;
5587
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 /* Shortcut for single character strings */
5589 if (PyUnicode_GET_SIZE(self) == 1 &&
5590 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005591 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005593 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005594 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005595 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 e = p + PyUnicode_GET_SIZE(self);
5598 for (; p < e; p++) {
5599 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005600 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005602 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603}
5604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005605PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005606"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005608Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005609False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
5611static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005612unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613{
5614 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5615 register const Py_UNICODE *e;
5616
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 /* Shortcut for single character strings */
5618 if (PyUnicode_GET_SIZE(self) == 1 &&
5619 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005620 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005622 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005623 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005624 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005625
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 e = p + PyUnicode_GET_SIZE(self);
5627 for (; p < e; p++) {
5628 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005629 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632}
5633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005634PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635"S.join(sequence) -> unicode\n\
5636\n\
5637Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005638sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
5640static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005641unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005643 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644}
5645
5646static int
5647unicode_length(PyUnicodeObject *self)
5648{
5649 return self->length;
5650}
5651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005652PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005653"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654\n\
5655Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005656done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
5658static PyObject *
5659unicode_ljust(PyUnicodeObject *self, PyObject *args)
5660{
5661 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005662 Py_UNICODE fillchar = ' ';
5663
5664 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 return NULL;
5666
Tim Peters7a29bd52001-09-12 03:03:31 +00005667 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 Py_INCREF(self);
5669 return (PyObject*) self;
5670 }
5671
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005672 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673}
5674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005675PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676"S.lower() -> unicode\n\
5677\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005678Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679
5680static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005681unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 return fixup(self, fixlower);
5684}
5685
/* Strip modes understood by the strip helpers in this file. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
5694
5695static const Py_UNICODE *
5696unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5697{
Tim Peters030a5ce2002-04-22 19:00:10 +00005698 size_t i;
5699 for (i = 0; i < n; ++i)
5700 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005701 return s+i;
5702 return NULL;
5703}
5704
5705/* externally visible for str.strip(unicode) */
5706PyObject *
5707_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5708{
5709 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5710 int len = PyUnicode_GET_SIZE(self);
5711 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5712 int seplen = PyUnicode_GET_SIZE(sepobj);
5713 int i, j;
5714
5715 i = 0;
5716 if (striptype != RIGHTSTRIP) {
5717 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5718 i++;
5719 }
5720 }
5721
5722 j = len;
5723 if (striptype != LEFTSTRIP) {
5724 do {
5725 j--;
5726 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5727 j++;
5728 }
5729
5730 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5731 Py_INCREF(self);
5732 return (PyObject*)self;
5733 }
5734 else
5735 return PyUnicode_FromUnicode(s+i, j-i);
5736}
5737
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
5739static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005740do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005742 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5743 int len = PyUnicode_GET_SIZE(self), i, j;
5744
5745 i = 0;
5746 if (striptype != RIGHTSTRIP) {
5747 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5748 i++;
5749 }
5750 }
5751
5752 j = len;
5753 if (striptype != LEFTSTRIP) {
5754 do {
5755 j--;
5756 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5757 j++;
5758 }
5759
5760 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5761 Py_INCREF(self);
5762 return (PyObject*)self;
5763 }
5764 else
5765 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766}
5767
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005768
5769static PyObject *
5770do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5771{
5772 PyObject *sep = NULL;
5773
5774 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5775 return NULL;
5776
5777 if (sep != NULL && sep != Py_None) {
5778 if (PyUnicode_Check(sep))
5779 return _PyUnicode_XStrip(self, striptype, sep);
5780 else if (PyString_Check(sep)) {
5781 PyObject *res;
5782 sep = PyUnicode_FromObject(sep);
5783 if (sep==NULL)
5784 return NULL;
5785 res = _PyUnicode_XStrip(self, striptype, sep);
5786 Py_DECREF(sep);
5787 return res;
5788 }
5789 else {
5790 PyErr_Format(PyExc_TypeError,
5791 "%s arg must be None, unicode or str",
5792 STRIPNAME(striptype));
5793 return NULL;
5794 }
5795 }
5796
5797 return do_strip(self, striptype);
5798}
5799
5800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005801PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005802"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005803\n\
5804Return a copy of the string S with leading and trailing\n\
5805whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005806If chars is given and not None, remove characters in chars instead.\n\
5807If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005808
5809static PyObject *
5810unicode_strip(PyUnicodeObject *self, PyObject *args)
5811{
5812 if (PyTuple_GET_SIZE(args) == 0)
5813 return do_strip(self, BOTHSTRIP); /* Common case */
5814 else
5815 return do_argstrip(self, BOTHSTRIP, args);
5816}
5817
5818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005819PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005820"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005821\n\
5822Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005823If chars is given and not None, remove characters in chars instead.\n\
5824If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005825
5826static PyObject *
5827unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5828{
5829 if (PyTuple_GET_SIZE(args) == 0)
5830 return do_strip(self, LEFTSTRIP); /* Common case */
5831 else
5832 return do_argstrip(self, LEFTSTRIP, args);
5833}
5834
5835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005836PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005837"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005838\n\
5839Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005840If chars is given and not None, remove characters in chars instead.\n\
5841If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005842
5843static PyObject *
5844unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5845{
5846 if (PyTuple_GET_SIZE(args) == 0)
5847 return do_strip(self, RIGHTSTRIP); /* Common case */
5848 else
5849 return do_argstrip(self, RIGHTSTRIP, args);
5850}
5851
5852
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853static PyObject*
5854unicode_repeat(PyUnicodeObject *str, int len)
5855{
5856 PyUnicodeObject *u;
5857 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005858 int nchars;
5859 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860
5861 if (len < 0)
5862 len = 0;
5863
Tim Peters7a29bd52001-09-12 03:03:31 +00005864 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 /* no repeat, return original string */
5866 Py_INCREF(str);
5867 return (PyObject*) str;
5868 }
Tim Peters8f422462000-09-09 06:13:41 +00005869
5870 /* ensure # of chars needed doesn't overflow int and # of bytes
5871 * needed doesn't overflow size_t
5872 */
5873 nchars = len * str->length;
5874 if (len && nchars / len != str->length) {
5875 PyErr_SetString(PyExc_OverflowError,
5876 "repeated string is too long");
5877 return NULL;
5878 }
5879 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5880 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5881 PyErr_SetString(PyExc_OverflowError,
5882 "repeated string is too long");
5883 return NULL;
5884 }
5885 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 if (!u)
5887 return NULL;
5888
5889 p = u->str;
5890
5891 while (len-- > 0) {
5892 Py_UNICODE_COPY(p, str->str, str->length);
5893 p += str->length;
5894 }
5895
5896 return (PyObject*) u;
5897}
5898
5899PyObject *PyUnicode_Replace(PyObject *obj,
5900 PyObject *subobj,
5901 PyObject *replobj,
5902 int maxcount)
5903{
5904 PyObject *self;
5905 PyObject *str1;
5906 PyObject *str2;
5907 PyObject *result;
5908
5909 self = PyUnicode_FromObject(obj);
5910 if (self == NULL)
5911 return NULL;
5912 str1 = PyUnicode_FromObject(subobj);
5913 if (str1 == NULL) {
5914 Py_DECREF(self);
5915 return NULL;
5916 }
5917 str2 = PyUnicode_FromObject(replobj);
5918 if (str2 == NULL) {
5919 Py_DECREF(self);
5920 Py_DECREF(str1);
5921 return NULL;
5922 }
Tim Petersced69f82003-09-16 20:30:58 +00005923 result = replace((PyUnicodeObject *)self,
5924 (PyUnicodeObject *)str1,
5925 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 maxcount);
5927 Py_DECREF(self);
5928 Py_DECREF(str1);
5929 Py_DECREF(str2);
5930 return result;
5931}
5932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005933PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934"S.replace (old, new[, maxsplit]) -> unicode\n\
5935\n\
5936Return a copy of S with all occurrences of substring\n\
5937old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005938given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
5940static PyObject*
5941unicode_replace(PyUnicodeObject *self, PyObject *args)
5942{
5943 PyUnicodeObject *str1;
5944 PyUnicodeObject *str2;
5945 int maxcount = -1;
5946 PyObject *result;
5947
5948 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5949 return NULL;
5950 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5951 if (str1 == NULL)
5952 return NULL;
5953 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005954 if (str2 == NULL) {
5955 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958
5959 result = replace(self, str1, str2, maxcount);
5960
5961 Py_DECREF(str1);
5962 Py_DECREF(str2);
5963 return result;
5964}
5965
5966static
5967PyObject *unicode_repr(PyObject *unicode)
5968{
5969 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5970 PyUnicode_GET_SIZE(unicode),
5971 1);
5972}
5973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005974PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975"S.rfind(sub [,start [,end]]) -> int\n\
5976\n\
5977Return the highest index in S where substring sub is found,\n\
5978such that sub is contained within s[start,end]. Optional\n\
5979arguments start and end are interpreted as in slice notation.\n\
5980\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005981Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
5983static PyObject *
5984unicode_rfind(PyUnicodeObject *self, PyObject *args)
5985{
5986 PyUnicodeObject *substring;
5987 int start = 0;
5988 int end = INT_MAX;
5989 PyObject *result;
5990
Guido van Rossumb8872e62000-05-09 14:14:27 +00005991 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5992 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 return NULL;
5994 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5995 (PyObject *)substring);
5996 if (substring == NULL)
5997 return NULL;
5998
5999 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
6000
6001 Py_DECREF(substring);
6002 return result;
6003}
6004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006005PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006"S.rindex(sub [,start [,end]]) -> int\n\
6007\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006008Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
6010static PyObject *
6011unicode_rindex(PyUnicodeObject *self, PyObject *args)
6012{
6013 int result;
6014 PyUnicodeObject *substring;
6015 int start = 0;
6016 int end = INT_MAX;
6017
Guido van Rossumb8872e62000-05-09 14:14:27 +00006018 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6019 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 return NULL;
6021 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6022 (PyObject *)substring);
6023 if (substring == NULL)
6024 return NULL;
6025
6026 result = findstring(self, substring, start, end, -1);
6027
6028 Py_DECREF(substring);
6029 if (result < 0) {
6030 PyErr_SetString(PyExc_ValueError, "substring not found");
6031 return NULL;
6032 }
6033 return PyInt_FromLong(result);
6034}
6035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006036PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006037"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038\n\
6039Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006040done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
6042static PyObject *
6043unicode_rjust(PyUnicodeObject *self, PyObject *args)
6044{
6045 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006046 Py_UNICODE fillchar = ' ';
6047
6048 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 return NULL;
6050
Tim Peters7a29bd52001-09-12 03:03:31 +00006051 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 Py_INCREF(self);
6053 return (PyObject*) self;
6054 }
6055
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006056 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059static PyObject*
6060unicode_slice(PyUnicodeObject *self, int start, int end)
6061{
6062 /* standard clamping */
6063 if (start < 0)
6064 start = 0;
6065 if (end < 0)
6066 end = 0;
6067 if (end > self->length)
6068 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006069 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 /* full slice, return original string */
6071 Py_INCREF(self);
6072 return (PyObject*) self;
6073 }
6074 if (start > end)
6075 start = end;
6076 /* copy slice */
6077 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6078 end - start);
6079}
6080
6081PyObject *PyUnicode_Split(PyObject *s,
6082 PyObject *sep,
6083 int maxsplit)
6084{
6085 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 s = PyUnicode_FromObject(s);
6088 if (s == NULL)
6089 return NULL;
6090 if (sep != NULL) {
6091 sep = PyUnicode_FromObject(sep);
6092 if (sep == NULL) {
6093 Py_DECREF(s);
6094 return NULL;
6095 }
6096 }
6097
6098 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6099
6100 Py_DECREF(s);
6101 Py_XDECREF(sep);
6102 return result;
6103}
6104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006105PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106"S.split([sep [,maxsplit]]) -> list of strings\n\
6107\n\
6108Return a list of the words in S, using sep as the\n\
6109delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006110splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006111any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112
6113static PyObject*
6114unicode_split(PyUnicodeObject *self, PyObject *args)
6115{
6116 PyObject *substring = Py_None;
6117 int maxcount = -1;
6118
6119 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6120 return NULL;
6121
6122 if (substring == Py_None)
6123 return split(self, NULL, maxcount);
6124 else if (PyUnicode_Check(substring))
6125 return split(self, (PyUnicodeObject *)substring, maxcount);
6126 else
6127 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6128}
6129
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006130PyObject *PyUnicode_RSplit(PyObject *s,
6131 PyObject *sep,
6132 int maxsplit)
6133{
6134 PyObject *result;
6135
6136 s = PyUnicode_FromObject(s);
6137 if (s == NULL)
6138 return NULL;
6139 if (sep != NULL) {
6140 sep = PyUnicode_FromObject(sep);
6141 if (sep == NULL) {
6142 Py_DECREF(s);
6143 return NULL;
6144 }
6145 }
6146
6147 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6148
6149 Py_DECREF(s);
6150 Py_XDECREF(sep);
6151 return result;
6152}
6153
6154PyDoc_STRVAR(rsplit__doc__,
6155"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6156\n\
6157Return a list of the words in S, using sep as the\n\
6158delimiter string, starting at the end of the string and\n\
6159working to the front. If maxsplit is given, at most maxsplit\n\
6160splits are done. If sep is not specified, any whitespace string\n\
6161is a separator.");
6162
6163static PyObject*
6164unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6165{
6166 PyObject *substring = Py_None;
6167 int maxcount = -1;
6168
6169 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6170 return NULL;
6171
6172 if (substring == Py_None)
6173 return rsplit(self, NULL, maxcount);
6174 else if (PyUnicode_Check(substring))
6175 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6176 else
6177 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6178}
6179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006180PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006181"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182\n\
6183Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006184Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006185is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186
6187static PyObject*
6188unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6189{
Guido van Rossum86662912000-04-11 15:38:46 +00006190 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
Guido van Rossum86662912000-04-11 15:38:46 +00006192 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 return NULL;
6194
Guido van Rossum86662912000-04-11 15:38:46 +00006195 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196}
6197
6198static
6199PyObject *unicode_str(PyUnicodeObject *self)
6200{
Fred Drakee4315f52000-05-09 19:53:39 +00006201 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202}
6203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006204PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205"S.swapcase() -> unicode\n\
6206\n\
6207Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006208and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209
6210static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006211unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 return fixup(self, fixswapcase);
6214}
6215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006216PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217"S.translate(table) -> unicode\n\
6218\n\
6219Return a copy of the string S, where all characters have been mapped\n\
6220through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006221Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6222Unmapped characters are left untouched. Characters mapped to None\n\
6223are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
6225static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006226unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
Tim Petersced69f82003-09-16 20:30:58 +00006228 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006230 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 "ignore");
6232}
6233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006234PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235"S.upper() -> unicode\n\
6236\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006237Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238
6239static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006240unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 return fixup(self, fixupper);
6243}
6244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006245PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246"S.zfill(width) -> unicode\n\
6247\n\
6248Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006249of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250
6251static PyObject *
6252unicode_zfill(PyUnicodeObject *self, PyObject *args)
6253{
6254 int fill;
6255 PyUnicodeObject *u;
6256
6257 int width;
6258 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6259 return NULL;
6260
6261 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006262 if (PyUnicode_CheckExact(self)) {
6263 Py_INCREF(self);
6264 return (PyObject*) self;
6265 }
6266 else
6267 return PyUnicode_FromUnicode(
6268 PyUnicode_AS_UNICODE(self),
6269 PyUnicode_GET_SIZE(self)
6270 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 }
6272
6273 fill = width - self->length;
6274
6275 u = pad(self, fill, 0, '0');
6276
Walter Dörwald068325e2002-04-15 13:36:47 +00006277 if (u == NULL)
6278 return NULL;
6279
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 if (u->str[fill] == '+' || u->str[fill] == '-') {
6281 /* move sign to beginning of string */
6282 u->str[0] = u->str[fill];
6283 u->str[fill] = '0';
6284 }
6285
6286 return (PyObject*) u;
6287}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006297PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006298"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006300Return True if S starts with the specified prefix, False otherwise.\n\
6301With optional start, test S beginning at that position.\n\
6302With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303
6304static PyObject *
6305unicode_startswith(PyUnicodeObject *self,
6306 PyObject *args)
6307{
6308 PyUnicodeObject *substring;
6309 int start = 0;
6310 int end = INT_MAX;
6311 PyObject *result;
6312
Guido van Rossumb8872e62000-05-09 14:14:27 +00006313 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6314 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 return NULL;
6316 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6317 (PyObject *)substring);
6318 if (substring == NULL)
6319 return NULL;
6320
Guido van Rossum77f6a652002-04-03 22:41:51 +00006321 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322
6323 Py_DECREF(substring);
6324 return result;
6325}
6326
6327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006328PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006329"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006331Return True if S ends with the specified suffix, False otherwise.\n\
6332With optional start, test S beginning at that position.\n\
6333With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
6335static PyObject *
6336unicode_endswith(PyUnicodeObject *self,
6337 PyObject *args)
6338{
6339 PyUnicodeObject *substring;
6340 int start = 0;
6341 int end = INT_MAX;
6342 PyObject *result;
6343
Guido van Rossumb8872e62000-05-09 14:14:27 +00006344 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6345 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 return NULL;
6347 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6348 (PyObject *)substring);
6349 if (substring == NULL)
6350 return NULL;
6351
Guido van Rossum77f6a652002-04-03 22:41:51 +00006352 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
6354 Py_DECREF(substring);
6355 return result;
6356}
6357
6358
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006359
6360static PyObject *
6361unicode_getnewargs(PyUnicodeObject *v)
6362{
6363 return Py_BuildValue("(u#)", v->str, v->length);
6364}
6365
6366
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367static PyMethodDef unicode_methods[] = {
6368
6369 /* Order is according to common usage: often used methods should
6370 appear first, since lookup is done sequentially. */
6371
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006372 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6373 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6374 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006375 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006376 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6377 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6378 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6379 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6380 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6381 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6382 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6383 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6384 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6385 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006386 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006387 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006388/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6389 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6390 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6391 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006392 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006393 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006394 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006395 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6396 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6397 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6398 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6399 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6400 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6401 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6402 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6403 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6404 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6405 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6406 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6407 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6408 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006409 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006410#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006411 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412#endif
6413
6414#if 0
6415 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006416 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417#endif
6418
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006419 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 {NULL, NULL}
6421};
6422
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006423static PyObject *
6424unicode_mod(PyObject *v, PyObject *w)
6425{
6426 if (!PyUnicode_Check(v)) {
6427 Py_INCREF(Py_NotImplemented);
6428 return Py_NotImplemented;
6429 }
6430 return PyUnicode_Format(v, w);
6431}
6432
6433static PyNumberMethods unicode_as_number = {
6434 0, /*nb_add*/
6435 0, /*nb_subtract*/
6436 0, /*nb_multiply*/
6437 0, /*nb_divide*/
6438 unicode_mod, /*nb_remainder*/
6439};
6440
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441static PySequenceMethods unicode_as_sequence = {
6442 (inquiry) unicode_length, /* sq_length */
6443 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6444 (intargfunc) unicode_repeat, /* sq_repeat */
6445 (intargfunc) unicode_getitem, /* sq_item */
6446 (intintargfunc) unicode_slice, /* sq_slice */
6447 0, /* sq_ass_item */
6448 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006449 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450};
6451
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006452static PyObject*
6453unicode_subscript(PyUnicodeObject* self, PyObject* item)
6454{
6455 if (PyInt_Check(item)) {
6456 long i = PyInt_AS_LONG(item);
6457 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006458 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006459 return unicode_getitem(self, i);
6460 } else if (PyLong_Check(item)) {
6461 long i = PyLong_AsLong(item);
6462 if (i == -1 && PyErr_Occurred())
6463 return NULL;
6464 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006465 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006466 return unicode_getitem(self, i);
6467 } else if (PySlice_Check(item)) {
6468 int start, stop, step, slicelength, cur, i;
6469 Py_UNICODE* source_buf;
6470 Py_UNICODE* result_buf;
6471 PyObject* result;
6472
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006473 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006474 &start, &stop, &step, &slicelength) < 0) {
6475 return NULL;
6476 }
6477
6478 if (slicelength <= 0) {
6479 return PyUnicode_FromUnicode(NULL, 0);
6480 } else {
6481 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6482 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006483
6484 if (result_buf == NULL)
6485 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006486
6487 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6488 result_buf[i] = source_buf[cur];
6489 }
Tim Petersced69f82003-09-16 20:30:58 +00006490
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006491 result = PyUnicode_FromUnicode(result_buf, slicelength);
6492 PyMem_FREE(result_buf);
6493 return result;
6494 }
6495 } else {
6496 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6497 return NULL;
6498 }
6499}
6500
6501static PyMappingMethods unicode_as_mapping = {
6502 (inquiry)unicode_length, /* mp_length */
6503 (binaryfunc)unicode_subscript, /* mp_subscript */
6504 (objobjargproc)0, /* mp_ass_subscript */
6505};
6506
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507static int
6508unicode_buffer_getreadbuf(PyUnicodeObject *self,
6509 int index,
6510 const void **ptr)
6511{
6512 if (index != 0) {
6513 PyErr_SetString(PyExc_SystemError,
6514 "accessing non-existent unicode segment");
6515 return -1;
6516 }
6517 *ptr = (void *) self->str;
6518 return PyUnicode_GET_DATA_SIZE(self);
6519}
6520
6521static int
6522unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6523 const void **ptr)
6524{
6525 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006526 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 return -1;
6528}
6529
6530static int
6531unicode_buffer_getsegcount(PyUnicodeObject *self,
6532 int *lenp)
6533{
6534 if (lenp)
6535 *lenp = PyUnicode_GET_DATA_SIZE(self);
6536 return 1;
6537}
6538
6539static int
6540unicode_buffer_getcharbuf(PyUnicodeObject *self,
6541 int index,
6542 const void **ptr)
6543{
6544 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006545
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 if (index != 0) {
6547 PyErr_SetString(PyExc_SystemError,
6548 "accessing non-existent unicode segment");
6549 return -1;
6550 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006551 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 if (str == NULL)
6553 return -1;
6554 *ptr = (void *) PyString_AS_STRING(str);
6555 return PyString_GET_SIZE(str);
6556}
6557
6558/* Helpers for PyUnicode_Format() */
6559
6560static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006561getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562{
6563 int argidx = *p_argidx;
6564 if (argidx < arglen) {
6565 (*p_argidx)++;
6566 if (arglen < 0)
6567 return args;
6568 else
6569 return PyTuple_GetItem(args, argidx);
6570 }
6571 PyErr_SetString(PyExc_TypeError,
6572 "not enough arguments for format string");
6573 return NULL;
6574}
6575
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
6581
6582static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584{
6585 register int i;
6586 int len;
6587 va_list va;
6588 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
6591 /* First, format the string as char array, then expand to Py_UNICODE
6592 array. */
6593 charbuffer = (char *)buffer;
6594 len = vsprintf(charbuffer, format, va);
6595 for (i = len - 1; i >= 0; i--)
6596 buffer[i] = (Py_UNICODE) charbuffer[i];
6597
6598 va_end(va);
6599 return len;
6600}
6601
Guido van Rossum078151d2002-08-11 04:24:12 +00006602/* XXX To save some code duplication, formatfloat/long/int could have been
6603 shared with stringobject.c, converting from 8-bit to Unicode after the
6604 formatting is done. */
6605
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606static int
6607formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006608 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 int flags,
6610 int prec,
6611 int type,
6612 PyObject *v)
6613{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006614 /* fmt = '%#.' + `prec` + `type`
6615 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 char fmt[20];
6617 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006618
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 x = PyFloat_AsDouble(v);
6620 if (x == -1.0 && PyErr_Occurred())
6621 return -1;
6622 if (prec < 0)
6623 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6625 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006626 /* Worst case length calc to ensure no buffer overrun:
6627
6628 'g' formats:
6629 fmt = %#.<prec>g
6630 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6631 for any double rep.)
6632 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6633
6634 'f' formats:
6635 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6636 len = 1 + 50 + 1 + prec = 52 + prec
6637
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006638 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006639 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006640
6641 */
6642 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6643 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006644 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006645 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006646 return -1;
6647 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006648 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6649 (flags&F_ALT) ? "#" : "",
6650 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 return usprintf(buf, fmt, x);
6652}
6653
Tim Peters38fd5b62000-09-21 05:43:11 +00006654static PyObject*
6655formatlong(PyObject *val, int flags, int prec, int type)
6656{
6657 char *buf;
6658 int i, len;
6659 PyObject *str; /* temporary string object. */
6660 PyUnicodeObject *result;
6661
6662 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6663 if (!str)
6664 return NULL;
6665 result = _PyUnicode_New(len);
6666 for (i = 0; i < len; i++)
6667 result->str[i] = buf[i];
6668 result->str[len] = 0;
6669 Py_DECREF(str);
6670 return (PyObject*)result;
6671}
6672
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673static int
6674formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006675 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 int flags,
6677 int prec,
6678 int type,
6679 PyObject *v)
6680{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006681 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006682 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6683 * + 1 + 1
6684 * = 24
6685 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006686 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006687 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 long x;
6689
6690 x = PyInt_AsLong(v);
6691 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006692 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006693 if (x < 0 && type == 'u') {
6694 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006695 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006696 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6697 sign = "-";
6698 else
6699 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006701 prec = 1;
6702
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006703 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6704 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006705 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006706 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006707 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006708 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006709 return -1;
6710 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006711
6712 if ((flags & F_ALT) &&
6713 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006714 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006715 * of issues that cause pain:
6716 * - when 0 is being converted, the C standard leaves off
6717 * the '0x' or '0X', which is inconsistent with other
6718 * %#x/%#X conversions and inconsistent with Python's
6719 * hex() function
6720 * - there are platforms that violate the standard and
6721 * convert 0 with the '0x' or '0X'
6722 * (Metrowerks, Compaq Tru64)
6723 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006724 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006725 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006726 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006727 * We can achieve the desired consistency by inserting our
6728 * own '0x' or '0X' prefix, and substituting %x/%X in place
6729 * of %#x/%#X.
6730 *
6731 * Note that this is the same approach as used in
6732 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006733 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006734 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6735 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006736 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006737 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006738 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6739 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006740 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006741 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006742 if (sign[0])
6743 return usprintf(buf, fmt, -x);
6744 else
6745 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746}
6747
6748static int
6749formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006750 size_t buflen,
6751 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006753 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006754 if (PyUnicode_Check(v)) {
6755 if (PyUnicode_GET_SIZE(v) != 1)
6756 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006760 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006761 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006762 goto onError;
6763 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765
6766 else {
6767 /* Integer input truncated to a character */
6768 long x;
6769 x = PyInt_AsLong(v);
6770 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006771 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006772#ifdef Py_UNICODE_WIDE
6773 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006774 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006775 "%c arg not in range(0x110000) "
6776 "(wide Python build)");
6777 return -1;
6778 }
6779#else
6780 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006781 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006782 "%c arg not in range(0x10000) "
6783 "(narrow Python build)");
6784 return -1;
6785 }
6786#endif
6787 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 }
6789 buf[1] = '\0';
6790 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006791
6792 onError:
6793 PyErr_SetString(PyExc_TypeError,
6794 "%c requires int or char");
6795 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796}
6797
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006798/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6799
6800 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6801 chars are formatted. XXX This is a magic number. Each formatting
6802 routine does bounds checking to ensure no overflow, but a better
6803 solution may be to malloc a buffer of appropriate size for each
6804 format. For now, the current solution is sufficient.
6805*/
/* Fully parenthesized so the cast stays attached to the constant
   wherever the macro is expanded (e.g. as an operand of sizeof). */
#define FORMATBUFLEN ((size_t)120)
6807
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808PyObject *PyUnicode_Format(PyObject *format,
6809 PyObject *args)
6810{
6811 Py_UNICODE *fmt, *res;
6812 int fmtcnt, rescnt, reslen, arglen, argidx;
6813 int args_owned = 0;
6814 PyUnicodeObject *result = NULL;
6815 PyObject *dict = NULL;
6816 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006817
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 if (format == NULL || args == NULL) {
6819 PyErr_BadInternalCall();
6820 return NULL;
6821 }
6822 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006823 if (uformat == NULL)
6824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 fmt = PyUnicode_AS_UNICODE(uformat);
6826 fmtcnt = PyUnicode_GET_SIZE(uformat);
6827
6828 reslen = rescnt = fmtcnt + 100;
6829 result = _PyUnicode_New(reslen);
6830 if (result == NULL)
6831 goto onError;
6832 res = PyUnicode_AS_UNICODE(result);
6833
6834 if (PyTuple_Check(args)) {
6835 arglen = PyTuple_Size(args);
6836 argidx = 0;
6837 }
6838 else {
6839 arglen = -1;
6840 argidx = -2;
6841 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006842 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6843 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 dict = args;
6845
6846 while (--fmtcnt >= 0) {
6847 if (*fmt != '%') {
6848 if (--rescnt < 0) {
6849 rescnt = fmtcnt + 100;
6850 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006851 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 return NULL;
6853 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6854 --rescnt;
6855 }
6856 *res++ = *fmt++;
6857 }
6858 else {
6859 /* Got a format specifier */
6860 int flags = 0;
6861 int width = -1;
6862 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 Py_UNICODE c = '\0';
6864 Py_UNICODE fill;
6865 PyObject *v = NULL;
6866 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006867 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 Py_UNICODE sign;
6869 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006870 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871
6872 fmt++;
6873 if (*fmt == '(') {
6874 Py_UNICODE *keystart;
6875 int keylen;
6876 PyObject *key;
6877 int pcount = 1;
6878
6879 if (dict == NULL) {
6880 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006881 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 goto onError;
6883 }
6884 ++fmt;
6885 --fmtcnt;
6886 keystart = fmt;
6887 /* Skip over balanced parentheses */
6888 while (pcount > 0 && --fmtcnt >= 0) {
6889 if (*fmt == ')')
6890 --pcount;
6891 else if (*fmt == '(')
6892 ++pcount;
6893 fmt++;
6894 }
6895 keylen = fmt - keystart - 1;
6896 if (fmtcnt < 0 || pcount > 0) {
6897 PyErr_SetString(PyExc_ValueError,
6898 "incomplete format key");
6899 goto onError;
6900 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006901#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006902 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 then looked up since Python uses strings to hold
6904 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006905 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 key = PyUnicode_EncodeUTF8(keystart,
6907 keylen,
6908 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006909#else
6910 key = PyUnicode_FromUnicode(keystart, keylen);
6911#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 if (key == NULL)
6913 goto onError;
6914 if (args_owned) {
6915 Py_DECREF(args);
6916 args_owned = 0;
6917 }
6918 args = PyObject_GetItem(dict, key);
6919 Py_DECREF(key);
6920 if (args == NULL) {
6921 goto onError;
6922 }
6923 args_owned = 1;
6924 arglen = -1;
6925 argidx = -2;
6926 }
6927 while (--fmtcnt >= 0) {
6928 switch (c = *fmt++) {
6929 case '-': flags |= F_LJUST; continue;
6930 case '+': flags |= F_SIGN; continue;
6931 case ' ': flags |= F_BLANK; continue;
6932 case '#': flags |= F_ALT; continue;
6933 case '0': flags |= F_ZERO; continue;
6934 }
6935 break;
6936 }
6937 if (c == '*') {
6938 v = getnextarg(args, arglen, &argidx);
6939 if (v == NULL)
6940 goto onError;
6941 if (!PyInt_Check(v)) {
6942 PyErr_SetString(PyExc_TypeError,
6943 "* wants int");
6944 goto onError;
6945 }
6946 width = PyInt_AsLong(v);
6947 if (width < 0) {
6948 flags |= F_LJUST;
6949 width = -width;
6950 }
6951 if (--fmtcnt >= 0)
6952 c = *fmt++;
6953 }
6954 else if (c >= '0' && c <= '9') {
6955 width = c - '0';
6956 while (--fmtcnt >= 0) {
6957 c = *fmt++;
6958 if (c < '0' || c > '9')
6959 break;
6960 if ((width*10) / 10 != width) {
6961 PyErr_SetString(PyExc_ValueError,
6962 "width too big");
6963 goto onError;
6964 }
6965 width = width*10 + (c - '0');
6966 }
6967 }
6968 if (c == '.') {
6969 prec = 0;
6970 if (--fmtcnt >= 0)
6971 c = *fmt++;
6972 if (c == '*') {
6973 v = getnextarg(args, arglen, &argidx);
6974 if (v == NULL)
6975 goto onError;
6976 if (!PyInt_Check(v)) {
6977 PyErr_SetString(PyExc_TypeError,
6978 "* wants int");
6979 goto onError;
6980 }
6981 prec = PyInt_AsLong(v);
6982 if (prec < 0)
6983 prec = 0;
6984 if (--fmtcnt >= 0)
6985 c = *fmt++;
6986 }
6987 else if (c >= '0' && c <= '9') {
6988 prec = c - '0';
6989 while (--fmtcnt >= 0) {
6990 c = Py_CHARMASK(*fmt++);
6991 if (c < '0' || c > '9')
6992 break;
6993 if ((prec*10) / 10 != prec) {
6994 PyErr_SetString(PyExc_ValueError,
6995 "prec too big");
6996 goto onError;
6997 }
6998 prec = prec*10 + (c - '0');
6999 }
7000 }
7001 } /* prec */
7002 if (fmtcnt >= 0) {
7003 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 if (--fmtcnt >= 0)
7005 c = *fmt++;
7006 }
7007 }
7008 if (fmtcnt < 0) {
7009 PyErr_SetString(PyExc_ValueError,
7010 "incomplete format");
7011 goto onError;
7012 }
7013 if (c != '%') {
7014 v = getnextarg(args, arglen, &argidx);
7015 if (v == NULL)
7016 goto onError;
7017 }
7018 sign = 0;
7019 fill = ' ';
7020 switch (c) {
7021
7022 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007023 pbuf = formatbuf;
7024 /* presume that buffer length is at least 1 */
7025 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 len = 1;
7027 break;
7028
7029 case 's':
7030 case 'r':
7031 if (PyUnicode_Check(v) && c == 's') {
7032 temp = v;
7033 Py_INCREF(temp);
7034 }
7035 else {
7036 PyObject *unicode;
7037 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007038 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 else
7040 temp = PyObject_Repr(v);
7041 if (temp == NULL)
7042 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007043 if (PyUnicode_Check(temp))
7044 /* nothing to do */;
7045 else if (PyString_Check(temp)) {
7046 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00007047 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00007049 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 "strict");
7051 Py_DECREF(temp);
7052 temp = unicode;
7053 if (temp == NULL)
7054 goto onError;
7055 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007056 else {
7057 Py_DECREF(temp);
7058 PyErr_SetString(PyExc_TypeError,
7059 "%s argument has non-string str()");
7060 goto onError;
7061 }
7062 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007063 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 len = PyUnicode_GET_SIZE(temp);
7065 if (prec >= 0 && len > prec)
7066 len = prec;
7067 break;
7068
7069 case 'i':
7070 case 'd':
7071 case 'u':
7072 case 'o':
7073 case 'x':
7074 case 'X':
7075 if (c == 'i')
7076 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007077 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007078 temp = formatlong(v, flags, prec, c);
7079 if (!temp)
7080 goto onError;
7081 pbuf = PyUnicode_AS_UNICODE(temp);
7082 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007083 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007085 else {
7086 pbuf = formatbuf;
7087 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7088 flags, prec, c, v);
7089 if (len < 0)
7090 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007091 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007092 }
7093 if (flags & F_ZERO)
7094 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 break;
7096
7097 case 'e':
7098 case 'E':
7099 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007100 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 case 'g':
7102 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007103 if (c == 'F')
7104 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007105 pbuf = formatbuf;
7106 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7107 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 if (len < 0)
7109 goto onError;
7110 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007111 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 fill = '0';
7113 break;
7114
7115 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007116 pbuf = formatbuf;
7117 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 if (len < 0)
7119 goto onError;
7120 break;
7121
7122 default:
7123 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007124 "unsupported format character '%c' (0x%x) "
7125 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007126 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007127 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007128 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 goto onError;
7130 }
7131 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007132 if (*pbuf == '-' || *pbuf == '+') {
7133 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 len--;
7135 }
7136 else if (flags & F_SIGN)
7137 sign = '+';
7138 else if (flags & F_BLANK)
7139 sign = ' ';
7140 else
7141 sign = 0;
7142 }
7143 if (width < len)
7144 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007145 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 reslen -= rescnt;
7147 rescnt = width + fmtcnt + 100;
7148 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007149 if (reslen < 0) {
7150 Py_DECREF(result);
7151 return PyErr_NoMemory();
7152 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007153 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 return NULL;
7155 res = PyUnicode_AS_UNICODE(result)
7156 + reslen - rescnt;
7157 }
7158 if (sign) {
7159 if (fill != ' ')
7160 *res++ = sign;
7161 rescnt--;
7162 if (width > len)
7163 width--;
7164 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007165 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7166 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007167 assert(pbuf[1] == c);
7168 if (fill != ' ') {
7169 *res++ = *pbuf++;
7170 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007171 }
Tim Petersfff53252001-04-12 18:38:48 +00007172 rescnt -= 2;
7173 width -= 2;
7174 if (width < 0)
7175 width = 0;
7176 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 if (width > len && !(flags & F_LJUST)) {
7179 do {
7180 --rescnt;
7181 *res++ = fill;
7182 } while (--width > len);
7183 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007184 if (fill == ' ') {
7185 if (sign)
7186 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007187 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007188 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007189 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007190 *res++ = *pbuf++;
7191 *res++ = *pbuf++;
7192 }
7193 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007194 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 res += len;
7196 rescnt -= len;
7197 while (--width >= len) {
7198 --rescnt;
7199 *res++ = ' ';
7200 }
7201 if (dict && (argidx < arglen) && c != '%') {
7202 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007203 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 goto onError;
7205 }
7206 Py_XDECREF(temp);
7207 } /* '%' */
7208 } /* until end */
7209 if (argidx < arglen && !dict) {
7210 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007211 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 goto onError;
7213 }
7214
7215 if (args_owned) {
7216 Py_DECREF(args);
7217 }
7218 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007219 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007220 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 return (PyObject *)result;
7222
7223 onError:
7224 Py_XDECREF(result);
7225 Py_DECREF(uformat);
7226 if (args_owned) {
7227 Py_DECREF(args);
7228 }
7229 return NULL;
7230}
7231
7232static PyBufferProcs unicode_as_buffer = {
7233 (getreadbufferproc) unicode_buffer_getreadbuf,
7234 (getwritebufferproc) unicode_buffer_getwritebuf,
7235 (getsegcountproc) unicode_buffer_getsegcount,
7236 (getcharbufferproc) unicode_buffer_getcharbuf,
7237};
7238
Jeremy Hylton938ace62002-07-17 16:30:39 +00007239static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007240unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7241
Tim Peters6d6c1a32001-08-02 04:15:00 +00007242static PyObject *
7243unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7244{
7245 PyObject *x = NULL;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00007246 static const char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007247 char *encoding = NULL;
7248 char *errors = NULL;
7249
Guido van Rossume023fe02001-08-30 03:12:59 +00007250 if (type != &PyUnicode_Type)
7251 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007252 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7253 kwlist, &x, &encoding, &errors))
7254 return NULL;
7255 if (x == NULL)
7256 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007257 if (encoding == NULL && errors == NULL)
7258 return PyObject_Unicode(x);
7259 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007260 return PyUnicode_FromEncodedObject(x, encoding, errors);
7261}
7262
Guido van Rossume023fe02001-08-30 03:12:59 +00007263static PyObject *
7264unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7265{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007266 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007267 int n;
7268
7269 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7270 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7271 if (tmp == NULL)
7272 return NULL;
7273 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007274 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007275 if (pnew == NULL) {
7276 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007277 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007278 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007279 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7280 if (pnew->str == NULL) {
7281 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007282 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007283 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007284 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007285 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007286 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7287 pnew->length = n;
7288 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007289 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007290 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007291}
7292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007293PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007294"unicode(string [, encoding[, errors]]) -> object\n\
7295\n\
7296Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007297encoding defaults to the current default string encoding.\n\
7298errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007299
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300PyTypeObject PyUnicode_Type = {
7301 PyObject_HEAD_INIT(&PyType_Type)
7302 0, /* ob_size */
7303 "unicode", /* tp_name */
7304 sizeof(PyUnicodeObject), /* tp_size */
7305 0, /* tp_itemsize */
7306 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007307 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007309 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 0, /* tp_setattr */
7311 (cmpfunc) unicode_compare, /* tp_compare */
7312 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007313 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007315 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 (hashfunc) unicode_hash, /* tp_hash*/
7317 0, /* tp_call*/
7318 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007319 PyObject_GenericGetAttr, /* tp_getattro */
7320 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007322 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7323 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007324 unicode_doc, /* tp_doc */
7325 0, /* tp_traverse */
7326 0, /* tp_clear */
7327 0, /* tp_richcompare */
7328 0, /* tp_weaklistoffset */
7329 0, /* tp_iter */
7330 0, /* tp_iternext */
7331 unicode_methods, /* tp_methods */
7332 0, /* tp_members */
7333 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007334 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007335 0, /* tp_dict */
7336 0, /* tp_descr_get */
7337 0, /* tp_descr_set */
7338 0, /* tp_dictoffset */
7339 0, /* tp_init */
7340 0, /* tp_alloc */
7341 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007342 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343};
7344
7345/* Initialize the Unicode implementation */
7346
Thomas Wouters78890102000-07-22 19:25:51 +00007347void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007349 int i;
7350
Fred Drakee4315f52000-05-09 19:53:39 +00007351 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007352 unicode_freelist = NULL;
7353 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007355 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007356 for (i = 0; i < 256; i++)
7357 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007358 if (PyType_Ready(&PyUnicode_Type) < 0)
7359 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360}
7361
7362/* Finalize the Unicode implementation */
7363
7364void
Thomas Wouters78890102000-07-22 19:25:51 +00007365_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007367 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007368 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007370 Py_XDECREF(unicode_empty);
7371 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007372
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007373 for (i = 0; i < 256; i++) {
7374 if (unicode_latin1[i]) {
7375 Py_DECREF(unicode_latin1[i]);
7376 unicode_latin1[i] = NULL;
7377 }
7378 }
7379
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007380 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 PyUnicodeObject *v = u;
7382 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007383 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007384 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007385 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007386 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007388 unicode_freelist = NULL;
7389 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007391
7392/*
7393Local variables:
7394c-basic-offset: 4
7395indent-tabs-mode: nil
7396End:
7397*/